diff --git a/.Rbuildignore b/.Rbuildignore index 3b6f456cb..4b3815dd8 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -2,16 +2,14 @@ ^renv\.lock$ ^.*\.Rproj$ ^\.Rproj\.user$ -.travis.yml standalone -^\.travis\.yml$ deploy.sh extras/* man-roxygen -xgboost compare_versions .github docs/* _pkgdown.yml - - +^vignettes/articles$ +^doc$ +^Meta$ diff --git a/.github/workflows/R_CMD_check_Hades.yaml b/.github/workflows/R_CMD_check_Hades.yaml index f85ed6abb..a0baf21b2 100644 --- a/.github/workflows/R_CMD_check_Hades.yaml +++ b/.github/workflows/R_CMD_check_Hades.yaml @@ -22,7 +22,7 @@ jobs: config: - {os: windows-latest, r: 'release'} # Does not appear to have Java 32-bit, hence the --no-multiarch - {os: macOS-latest, r: 'release'} - - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} + - {os: ubuntu-22.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/jammy/latest"} env: GITHUB_PAT: ${{ secrets.GH_TOKEN }} @@ -45,98 +45,56 @@ jobs: CDM5_SQL_SERVER_USER: ${{ secrets.CDM5_SQL_SERVER_USER }} steps: - - uses: actions/checkout@v2 - - - uses: conda-incubator/setup-miniconda@v2 - with: - python-version: "3.7" - activate-environment: r-reticulate - - - name: python main dependencies - run: conda install --name r-reticulate numpy scipy scikit-learn pandas pydotplus joblib - - - name: python json dependencies - run: conda install --name r-reticulate -c conda-forge sklearn-json - - - name: python scikit-survival dependencies - run: conda install --name r-reticulate -c sebp scikit-survival + - uses: actions/checkout@v4 - - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} - - uses: r-lib/actions/setup-tinytex@v1 + - uses: r-lib/actions/setup-tinytex@v2 - - uses: r-lib/actions/setup-pandoc@v1 + - uses: r-lib/actions/setup-pandoc@v2 - - name: Query dependencies - run: | - install.packages('remotes') - saveRDS(remotes::dev_package_deps(dependencies = 
TRUE), ".github/depends.Rds", version = 2) - writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") - shell: Rscript {0} - - - name: Cache R packages - if: runner.os != 'Windows' - uses: actions/cache@v2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-3-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-3- - - - name: Install system dependencies + - name: Install system requirements if: runner.os == 'Linux' run: | + sudo apt-get install -y libssh-dev + Rscript -e 'install.packages("remotes")' while read -r cmd do eval sudo $cmd - done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') - - - name: Install dependencies - run: | - remotes::install_deps(dependencies = TRUE, INSTALL_opts=c("--no-multiarch")) - remotes::install_cran("rcmdcheck") - shell: Rscript {0} - - - name: Install covr - if: runner.os == 'Windows' - run: | - remotes::install_cran("covr") - shell: Rscript {0} - - - name: Remove check folder if exists - if: runner.os == 'macOS' - run: unlink("check", recursive = TRUE) - shell: Rscript {0} - - - name: use r-reticulate environment - run: | - reticulate::use_condaenv("r-reticulate", required = TRUE) - shell: Rscript {0} - - - name: Check - env: - _R_CHECK_CRAN_INCOMING_REMOTE_: false - run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran", "--no-multiarch"), error_on = "warning", check_dir = "check") - shell: Rscript {0} + done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "22.04"))') + + - name: Setup conda + uses: conda-incubator/setup-miniconda@v3 + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check - - name: Upload check results - if: failure() - uses: actions/upload-artifact@main + - uses: r-lib/actions/check-r-package@v2 with: - name: ${{ runner.os }}-r${{ matrix.config.r }}-results - path: 
check + args: 'c("--no-manual", "--as-cran")' + error-on: '"warning"' + check-dir: '"check"' - name: Upload source package if: success() && runner.os == 'macOS' && github.event_name != 'pull_request' && github.ref == 'refs/heads/main' - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: package_tarball path: check/*.tar.gz - + + - name: Install covr + if: runner.os == 'Linux' + run: | + remotes::install_cran("covr") + shell: Rscript {0} + - name: Test coverage - if: runner.os == 'Windows' - run: covr::codecov() + if: runner.os == 'Linux' + run: covr::codecov(token = "${{ secrets.CODECOV_TOKEN }}") shell: Rscript {0} @@ -152,7 +110,7 @@ jobs: steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 @@ -178,7 +136,7 @@ jobs: draft: false prerelease: false - - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-r@v2 if: ${{ env.new_version != '' }} - name: Install drat @@ -194,7 +152,7 @@ jobs: - name: Download package tarball if: ${{ env.new_version != '' }} - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v4 with: name: package_tarball diff --git a/.github/workflows/R_CMD_check_main_weekly.yaml b/.github/workflows/R_CMD_check_main_weekly.yaml new file mode 100644 index 000000000..45bbb548b --- /dev/null +++ b/.github/workflows/R_CMD_check_main_weekly.yaml @@ -0,0 +1,72 @@ +on: + schedule: + - cron: '0 12 * * 6' # every Saturday at noon UTC + + workflow_dispatch: + +name: 'R check' + +jobs: + R-CMD-check-main: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: macOS-latest, r: 'release'} + + env: + GITHUB_PAT: ${{ secrets.GH_TOKEN }} + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + RSPM: ${{ matrix.config.rspm }} + CDM5_ORACLE_CDM_SCHEMA: ${{ secrets.CDM5_ORACLE_CDM_SCHEMA }} + CDM5_ORACLE_OHDSI_SCHEMA: ${{ secrets.CDM5_ORACLE_OHDSI_SCHEMA }} + CDM5_ORACLE_PASSWORD: ${{ 
secrets.CDM5_ORACLE_PASSWORD }} + CDM5_ORACLE_SERVER: ${{ secrets.CDM5_ORACLE_SERVER }} + CDM5_ORACLE_USER: ${{ secrets.CDM5_ORACLE_USER }} + CDM5_POSTGRESQL_CDM_SCHEMA: ${{ secrets.CDM5_POSTGRESQL_CDM_SCHEMA }} + CDM5_POSTGRESQL_OHDSI_SCHEMA: ${{ secrets.CDM5_POSTGRESQL_OHDSI_SCHEMA }} + CDM5_POSTGRESQL_PASSWORD: ${{ secrets.CDM5_POSTGRESQL_PASSWORD }} + CDM5_POSTGRESQL_SERVER: ${{ secrets.CDM5_POSTGRESQL_SERVER }} + CDM5_POSTGRESQL_USER: ${{ secrets.CDM5_POSTGRESQL_USER }} + CDM5_SQL_SERVER_CDM_SCHEMA: ${{ secrets.CDM5_SQL_SERVER_CDM_SCHEMA }} + CDM5_SQL_SERVER_OHDSI_SCHEMA: ${{ secrets.CDM5_SQL_SERVER_OHDSI_SCHEMA }} + CDM5_SQL_SERVER_PASSWORD: ${{ secrets.CDM5_SQL_SERVER_PASSWORD }} + CDM5_SQL_SERVER_SERVER: ${{ secrets.CDM5_SQL_SERVER_SERVER }} + CDM5_SQL_SERVER_USER: ${{ secrets.CDM5_SQL_SERVER_USER }} + CDM5_REDSHIFT_CDM_SCHEMA: ${{ secrets.CDM5_REDSHIFT_CDM_SCHEMA }} + CDM5_REDSHIFT_OHDSI_SCHEMA: ${{ secrets.CDM5_REDSHIFT_OHDSI_SCHEMA }} + CDM5_REDSHIFT_PASSWORD: ${{ secrets.CDM5_REDSHIFT_PASSWORD }} + CDM5_REDSHIFT_SERVER: ${{ secrets.CDM5_REDSHIFT_SERVER }} + CDM5_REDSHIFT_USER: ${{ secrets.CDM5_REDSHIFT_USER }} + CDM5_SPARK_USER: ${{ secrets.CDM5_SPARK_USER }} + CDM5_SPARK_PASSWORD: ${{ secrets.CDM5_SPARK_PASSWORD }} + CDM5_SPARK_CONNECTION_STRING: ${{ secrets.CDM5_SPARK_CONNECTION_STRING }} + + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + + - uses: r-lib/actions/setup-tinytex@v2 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - name: Setup conda + uses: conda-incubator/setup-miniconda@v3 + + - uses: r-lib/actions/check-r-package@v2 + with: + args: 'c("--no-manual", "--as-cran")' + error-on: '"warning"' + check-dir: '"check"' diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml new file mode 100644 index 000000000..7a5e8ac76 --- /dev/null +++ 
b/.github/workflows/pkgdown.yaml @@ -0,0 +1,46 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, develop] + release: + types: [published] + workflow_dispatch: + +name: pkgdown + +jobs: + pkgdown: + runs-on: ubuntu-latest + # Only restrict concurrency for non-PR jobs + concurrency: + group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, ohdsi/OhdsiRTools + needs: website + + - name: Build site + run: Rscript -e 'pkgdown::build_site_github_pages(new_process = FALSE, install = TRUE)' + + - name: Fix Hades Logo + run: Rscript -e 'OhdsiRTools::fixHadesLogo()' + + - name: Deploy to GitHub pages 🚀 + if: github.event_name != 'pull_request' + uses: JamesIves/github-pages-deploy-action@4.1.4 + with: + clean: false + branch: gh-pages + folder: docs diff --git a/.gitignore b/.gitignore index c7ce48620..cb6091781 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,8 @@ *-Ex.R # R data files from past sessions .Rdata +# R environ +.Renviron # RStudio files .Rproj.user/ .Rproj.user @@ -20,4 +22,6 @@ standalone/build/* /plpmodels/* /python_models/* /mycache/* -/inst/shiny/DiagnosticsExplorer/rsconnect/* \ No newline at end of file +/inst/shiny/DiagnosticsExplorer/rsconnect/* +/doc/ +/Meta/ diff --git a/DESCRIPTION b/DESCRIPTION index 75919f642..0011aef4b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,17 +2,18 @@ Package: PatientLevelPrediction Type: Package Title: Developing patient level prediction using data in the OMOP Common Data Model -Version: 5.0.5 -Date: 2022-03-16 +Version: 6.3.9 +Date: 2024-08-21 Authors@R: 
c( person("Jenna", "Reps", email = "jreps@its.jnj.com", role = c("aut", "cre")), person("Martijn", "Schuemie", role = c("aut")), person("Marc", "Suchard", role = c("aut")), person("Patrick", "Ryan", role = c("aut")), - person("Peter", "Rijnbeek", role = c("aut")) + person("Peter", "Rijnbeek", role = c("aut")), + person("Egill", "Fridgeirsson", role = c("aut")) ) Maintainer: Jenna Reps -Description: A package for creating patient level prediction models. Given a +Description: A user friendly way to create patient level prediction models using the OMOP common data model. Given a cohort of interest and an outcome of interest, the package can use data in the OMOP Common Data Model to build a large set of features. These features can then be assessed to fit a predictive model using a number of machine learning algorithms. @@ -22,69 +23,56 @@ URL: https://ohdsi.github.io/PatientLevelPrediction, https://github.com/OHDSI/Pa BugReports: https://github.com/OHDSI/PatientLevelPrediction/issues VignetteBuilder: knitr Depends: - R (>= 3.3.0), - FeatureExtraction (>= 3.0.0) + R (>= 4.0.0) Imports: Andromeda, Cyclops (>= 3.0.0), - DatabaseConnector (>= 5.0.0), + DatabaseConnector (>= 6.0.0), dplyr, + FeatureExtraction (>= 3.0.0), ggplot2, gridExtra, - Hmisc, - jsonlite, Matrix, memuse, + mgcv, ParallelLogger (>= 2.0.0), + polspline, pROC, PRROC, - reticulate (> 1.16), + reticulate (>= 1.30), rlang, - rms, - RSQLite, SqlRender (>= 1.1.3), survival, - tibble, tidyr, utils Suggests: AUC, BigKnn (>= 1.0.0), - DBI, devtools, - diagram, - DT, Eunomia, - gnm, - htmlwidgets (> 0.8), - htmltools, IterativeHardThresholding, knitr, markdown, Metrics, parallel, - plotly, plyr, pool, - R6, + readr, ResourceSelection, + ResultModelManager (>= 0.2.0), rmarkdown, + RSQLite, scoring, - shiny, - shinycssloaders, - shinydashboard, - shinyWidgets, - SparseM, - survAUC, + ShinyAppBuilder (>= 1.1.1), survminer, testthat, withr, - xgboost (> 1.3.2.1) + xgboost (> 1.3.2.1), + lightgbm Remotes: 
ohdsi/BigKnn, - ohdsi/Eunomia, ohdsi/FeatureExtraction, - ohdsi/IterativeHardThresholding, - ohdsi/ParallelLogger -RoxygenNote: 7.1.2 + ohdsi/ShinyAppBuilder, + ohdsi/ResultModelManager, +RoxygenNote: 7.3.1 Encoding: UTF-8 diff --git a/NAMESPACE b/NAMESPACE index 3aed99048..38cfb743f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,14 +3,22 @@ S3method(print,plpData) S3method(print,summary.plpData) S3method(summary,plpData) +export(MapIds) +export(addDiagnosePlpToDatabase) +export(addMultipleDiagnosePlpToDatabase) +export(addMultipleRunPlpToDatabase) +export(addRunPlpToDatabase) export(averagePrecision) export(brierScore) export(calibrationLine) export(computeAuc) +export(computeGridPerformance) export(configurePython) export(covariateSummary) export(createCohortCovariateSettings) export(createDatabaseDetails) +export(createDatabaseList) +export(createDatabaseSchemaSettings) export(createDefaultExecuteSettings) export(createDefaultSplitSetting) export(createExecuteSettings) @@ -23,13 +31,19 @@ export(createPreprocessSettings) export(createRandomForestFeatureSelection) export(createRestrictPlpDataSettings) export(createSampleSettings) +export(createSplineSettings) +export(createStratifiedImputationSettings) export(createStudyPopulation) export(createStudyPopulationSettings) +export(createTempModelLoc) export(createUnivariateFeatureSelection) +export(createValidationDesign) export(createValidationSettings) -export(diagnostic) +export(diagnoseMultiplePlp) +export(diagnosePlp) export(evaluatePlp) export(externalValidateDbPlp) +export(extractDatabaseToCsv) export(fitPlp) export(getCalibrationSummary) export(getCohortCovariateData) @@ -38,14 +52,18 @@ export(getPlpData) export(getPredictionDistribution) export(getThresholdSummary) export(ici) -export(launchDiagnosticsExplorer) +export(insertCsvToDatabase) +export(insertModelDesignInDatabase) +export(insertResultsToSqlite) export(listAppend) +export(listCartesian) export(loadPlpAnalysesJson) export(loadPlpData) 
export(loadPlpModel) export(loadPlpResult) export(loadPlpShareable) export(loadPrediction) +export(migrateDataModel) export(modelBasedConcordance) export(outcomeSurvivalPlot) export(pfi) @@ -63,7 +81,6 @@ export(plotSparseCalibration) export(plotSparseCalibration2) export(plotSparseRoc) export(plotVariableScatterplot) -export(populatePlpResultTables) export(predictCyclops) export(predictPlp) export(recalibratePlp) @@ -83,18 +100,21 @@ export(setGradientBoostingMachine) export(setIterativeHardThresholding) export(setKNN) export(setLassoLogisticRegression) +export(setLightGBM) export(setMLP) export(setNaiveBayes) export(setPythonEnvironment) export(setRandomForest) export(setSVM) export(simulatePlpData) +export(sklearnFromJson) +export(sklearnToJson) export(splitData) export(toSparseM) +export(validateExternal) export(validateMultiplePlp) export(viewDatabaseResultPlp) export(viewMultiplePlp) export(viewPlp) -import(FeatureExtraction) importFrom(dplyr,"%>%") importFrom(rlang,.data) diff --git a/NEWS.md b/NEWS.md index f0eedf6cc..4de122e95 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,196 @@ +PatientLevelPrediction 6.3.9 +====================== +- Hotfix adding schema to DatabaseConnector::getTableNames when creating results tables + +PatientLevelPrediction 6.3.8 +====================== +- Add support for R4.4 +- Fix notes around documentation (vignette engine and brackets in itemize) +- Use webp image format where possible (not in pdfs) for smaller size +- Make sure random table names are unique in tests +- Remove remote info for Eunomia since it's in CRAN + +PatientLevelPrediction 6.3.7 +====================== +- Clean up dependencies, tibble removed and IHT and ParallelLogger from CRAN +- Use cohortIds for cohortCovariates to comply with FeatureExtraction +- Add cdmDatabaseName from DatabaseDetails to model output +- Fix bug when attributes weren't preserved on trainData$covariateData after split +- Fix warnings in tests and speed them up +- Fix bug in assignment 
operator in configurePython +- Delay evaluation of plpData when using do.call like in learningCurves and +runMultiplePlp +- Speed up population generation when subjectId's are distinct +- Fix bug when population was still generated when provided to runPlp + +PatientLevelPrediction 6.3.6 +====================== +- fix bug with ohdsi shiny modules version check (issue 415) + +PatientLevelPrediction 6.3.5 +====================== +- Fix sklearnToJson to be compatible with scikit-learn>=1.3 +- Fix github actions so it's not hardcoded to use python 3.7 + + +PatientLevelPrediction 6.3.4 +====================== +- added spline feature engineering +- added age/sex stratified imputation feature engineering +- changed result table execution date types to varchar +- updated covariateSummary to use feature engineering + +PatientLevelPrediction 6.3.3 +====================== +- fixed bug introduced with new reticulate update in model saving to json tests + + +PatientLevelPrediction 6.3.2 +====================== +- fixed bug with database insert if result is incomplete +- updated/fixed documentation (Egill) +- added model path to models (Henrik) +- updated hyper-parameter saving to data.frame and made consistent + +PatientLevelPrediction 6.3.1 +====================== +- fixed bug with multiple covariate settings in diagnose plp +- added min cell count when exporting database results to csv files +- light GBM added (thanks Jin Choi and Chungsoo Kim) +- fixed minor bugs when uploading results to database + +PatientLevelPrediction 6.2.1 +====================== +- added ensure_installed("ResultModelManager") to getDataMigrator() + +PatientLevelPrediction 6.1.0 +====================== +- shiny app is now using ShinyAppBuilder with a config saved in the /inst folder + +PatientLevelPrediction 6.0.11 +====================== +- fixed bugs introduced when sklearn inputs changed +- added sklearn model being saved as jsons +- made changes around the DatabaseConnection get table names function 
to make it work for the updated DatabaseConnection +- removed check RAM stop (now it just warns) + +PatientLevelPrediction 6.0.10 +====================== +- Updated test to skip test for FE setting if the model does not fit (this was causing occasional test fail) +- replaced .data$ with "" for all dplyr::select to remove warnings + +PatientLevelPrediction 6.0.9 +====================== +- Fix bug with python type being required to be int + +PatientLevelPrediction 6.0.8 +====================== +- Allow priorType to be passed down to getCV function in case prior is not 'laplace' +- Seed specified in Cyclops model wasn't passed to Cyclops + +PatientLevelPrediction 6.0.7 +====================== +- fixed issue with shiny viewer converting connection details to large json + +PatientLevelPrediction 6.0.6 +====================== +- added check for cdmDatabaseId into createDatabaseDetails +- added test for check for cdmDatabaseId into createDatabaseDetails to error when NULL +- removed session$onSessionEnded(shiny::stopApp) from shiny server + +PatientLevelPrediction 6.0.5 +====================== +- fixing cox predictions + +PatientLevelPrediction 6.0.4 +====================== +- forcing cdmDatabaseId to be a string if integer is input + +PatientLevelPrediction 6.0.3 +====================== +- replaced utils::read.csv with readr::read_csv when inserting results from csv + +PatientLevelPrediction 6.0.2 +====================== +- replaced gsub with sub when inserting csvs to database + +PatientLevelPrediction 6.0.1 +====================== +- saved result specification csv in windows to fix odd formating issue + +PatientLevelPrediction 6.0.0 +====================== +- fixed sample data bugs +- updated to use v1.0.0 of OhdsiShinyModules +- updated plp database result tables to use the same structure for cohort and database as other HADES packages +- added function to insert csv results into plp database result tables +- added input for databaseId (database and version) when 
extracting data to be consistent with other HADES packages. This is saved in plp objects. + +PatientLevelPrediction 5.4.4 +====================== +- fixed issue with 'preprocess' vs 'preprocessing' inconsistently used across models +- added metaData tracking for feature engineering or preprocessing when predicting +- fixed issue with FE using trainData$covariateData metaData rather than trainData +- fixed bug when using sameData for FE + +PatientLevelPrediction 5.4.3 +====================== +- pulled in multiple bug fixes and test improvements from Egill +- pulled in fix for learning curves from Henrik +- Pulled in fix for feature engineering from Solomon +- Cleaned check messages about comparing class(x) with a string by changing to inherits() + +PatientLevelPrediction 5.4.2 +====================== +- removed json saving for sklearn models since sklearn-json is no longer working for the latest sklearn + + +PatientLevelPrediction 5.4.1 +====================== +- renamed the input corresponding to the string that gets appended to the results table names to tablePrefix +- fixed issues with system.file() from SqlRender code breaking the tests +- added an input fileAppend to the function that exports the database tables to csv files +- moved the plp model (including preprocessing details) outside of the result database (into a specified folder) due to the size of the objects (too large to insert into the database). 
+ +PatientLevelPrediction 5.4.0 +====================== +- added saving of plp models into the result database +- added default cohortDefinitions in runMultiplePlp + +PatientLevelPrediction 5.3.3 +====================== +- added modelType to all models for database upload + +PatientLevelPrediction 5.3.2 +====================== +- moved FeatureExtraction to depends +- fixed using inherits() + +PatientLevelPrediction 5.3.1 +====================== +- moved most of the shiny app code into OhdsiShinyModules +- removed shiny dependencies and added OhdsiShinyModules to suggests +- fixed bug with linux sklearn saving + +PatientLevelPrediction 5.1.1 +====================== +- replaced cohortId to targetId for consistency throughout code + +PatientLevelPrediction 5.1.0 +====================== +- replaced targetId in model design to cohortId for consistency throughout code +- replaced plpDataSettings to restrictPlpDataSettings to improve naming consistency +- added ability to use initial population in runPlp by adding the population to plpData$population +- added splitSettings into modelDesign +- replaced saving json settings with ParallelLogger function +- updated database result schema (removed researcher_id from tables - if desired a new table with the setting_ids and researcher_id could be added, removed study tables and revised results table to performances table with a reference to model_design_id and development_database_id to enable validation results without a model to be inserted) +- added diagnostic code based on PROBAST +- added diagnostic shiny module +- added code to create sqlite database and populate in uploadToDatabase +- add code to convert runPlp+val to sqlite database when viewing shiny +- added code to extract database results into csv files: extractDatabaseToCsv() + + PatientLevelPrediction 5.0.5 ====================== - pulled in GBM update (default hyper-parameters and variable importance fix) work done by Egill (egillax) diff --git 
a/R/AdditionalCovariates.R b/R/AdditionalCovariates.R index 2548eaf12..f19f30803 100644 --- a/R/AdditionalCovariates.R +++ b/R/AdditionalCovariates.R @@ -30,8 +30,9 @@ #' @param cohortTable the table name that contains the target population cohort #' @param rowIdField string representing the unique identifier in the target population cohort #' @param aggregated whether the covariate should be aggregated -#' @param cohortId cohort id for the target population cohort +#' @param cohortIds cohort id for the target cohort #' @param covariateSettings settings for the covariate cohorts and time periods +#' @param ... additional arguments from FeatureExtraction #' #' @return #' The models will now be in the package @@ -45,8 +46,9 @@ getCohortCovariateData <- function( cohortTable = "#cohort_person", rowIdField = "row_id", aggregated, - cohortId, - covariateSettings + cohortIds, + covariateSettings, + ... ){ # Some SQL to construct the covariate: @@ -58,23 +60,23 @@ getCohortCovariateData <- function( "}} as covariate_value", "from @cohort_temp_table a inner join @covariate_cohort_schema.@covariate_cohort_table b", " on a.subject_id = b.subject_id and ", - " b.cohort_start_date <= dateadd(day, @endDay, a.cohort_start_date) and ", + " b.cohort_start_date <= dateadd(day, @endDays, a.cohort_start_date) and ", " b.cohort_end_date >= dateadd(day, @startDay, a.cohort_start_date) ", "{@ageInteraction | @lnAgeInteraction}?{inner join @cdm_database_schema.person p on p.person_id=a.subject_id}", "where b.cohort_definition_id = @covariate_cohort_id - group by a.@row_id_field " + group by a.@row_id_field; " ) sql <- SqlRender::render( sql, covariate_cohort_schema = covariateSettings$cohortDatabaseSchema, covariate_cohort_table = covariateSettings$cohortTable, - covariate_cohort_id = covariateSettings$cohortId, + covariate_cohort_id = covariateSettings$cohortIds, cohort_temp_table = cohortTable, row_id_field = rowIdField, startDay = covariateSettings$startDay, covariate_id = 
covariateSettings$covariateId, - endDay = covariateSettings$endDay, + endDays = covariateSettings$endDays, countval = covariateSettings$count, ageInteraction = covariateSettings$ageInteraction, lnAgeInteraction = covariateSettings$lnAgeInteraction, @@ -94,7 +96,7 @@ getCohortCovariateData <- function( colnames(covariates) <- SqlRender::snakeCaseToCamelCase(colnames(covariates)) # Construct covariate reference: sql <- "select @covariate_id as covariate_id, '@concept_set' as covariate_name, - @analysis_id as analysis_id, -1 as concept_id" + @analysis_id as analysis_id, -1 as concept_id;" sql <- SqlRender::render( sql = sql, covariate_id = covariateSettings$covariateId, @@ -102,7 +104,7 @@ getCohortCovariateData <- function( concept_set = paste('Cohort_covariate during day', covariateSettings$startDay, 'through', - covariateSettings$endDay, + covariateSettings$endDays, 'days relative to index:', ifelse(covariateSettings$count, 'Number of', ''), covariateSettings$covariateName, @@ -191,9 +193,9 @@ createCohortCovariateSettings <- function( covariateId = cohortId*100000+settingId*1000+analysisId, cohortDatabaseSchema = cohortDatabaseSchema, cohortTable = cohortTable, - cohortId = cohortId, + cohortIds = cohortId, startDay = startDay, - endDay = endDay, + endDays = endDay, count = count, ageInteraction = ageInteraction, lnAgeInteraction = lnAgeInteraction, diff --git a/R/AndromedaHelperFunctions.R b/R/AndromedaHelperFunctions.R index 7a97b41b3..b6182610c 100644 --- a/R/AndromedaHelperFunctions.R +++ b/R/AndromedaHelperFunctions.R @@ -55,40 +55,28 @@ batchRestrict <- function(covariateData, population, sizeN = 10000000){ newCovariateData <- Andromeda::andromeda(covariateRef = covariateData$covariateRef, analysisRef = covariateData$analysisRef) - maxRows <- RSQLite::dbGetQuery(covariateData, - "SELECT count(*) as n FROM covariates;") + Andromeda::batchApply(covariateData$covariates, function(tempData) { - steps <- ceiling(maxRows$n/sizeN) - - - pb <- 
utils::txtProgressBar(style = 3) - - for(i in 1:steps){ - utils::setTxtProgressBar(pb, i/steps) - - offset <- ((i-1)*sizeN) - limit <- sizeN + filtered <- dplyr::inner_join(tempData, population, by = 'rowId') - tempData <- RSQLite::dbGetQuery(covariateData, - paste0("SELECT * FROM covariates LIMIT ",limit," OFFSET ",offset," ;")) - - filtered <- tempData %>% dplyr::inner_join(population, by = 'rowId') - - if(i==1){ + if ("covariates" %in% names(newCovariateData)) { + Andromeda::appendToTable(newCovariateData$covariates, data = filtered) + } else { newCovariateData$covariates <- filtered - } else{ - Andromeda::appendToTable(tbl = newCovariateData$covariates, - data = filtered) } - } - close(pb) + }, + progressBar = TRUE, + batchSize = sizeN) - Andromeda::createIndex(tbl = newCovariateData$covariates, columnNames = 'covariateId', - indexName = 'covariates_ncovariateIds') - Andromeda::createIndex(tbl = newCovariateData$covariates, c('rowId'), - indexName = 'covariates_rowId') - Andromeda::createIndex(tbl = newCovariateData$covariates, c('covariateId', 'covariateValue'), - indexName = 'covariates_covariateId_value') + Andromeda::createIndex(tbl = newCovariateData$covariates, + columnNames = 'covariateId', + indexName = 'covariates_ncovariateIds') + Andromeda::createIndex(tbl = newCovariateData$covariates, + columnNames = 'rowId', + indexName = 'covariates_rowId') + Andromeda::createIndex(tbl = newCovariateData$covariates, + columnNames = c('covariateId', 'covariateValue'), + indexName = 'covariates_covariateId_value') metaData$populationSize <- nrow(population) attr(newCovariateData, 'metaData') <- metaData @@ -109,8 +97,9 @@ calculatePrevs <- function(plpData, population){ #=========================== # add population to sqllite - population <- tibble::as_tibble(population) - plpData$covariateData$population <- population %>% dplyr::select(.data$rowId, .data$outcomeCount) + population <- dplyr::as_tibble(population) + plpData$covariateData$population <- population %>% 
+ dplyr::select("rowId", "outcomeCount") outCount <- nrow(plpData$covariateData$population %>% dplyr::filter(.data$outcomeCount == 1)) nonOutCount <- nrow(plpData$covariateData$population %>% dplyr::filter(.data$outcomeCount == 0)) @@ -120,7 +109,7 @@ calculatePrevs <- function(plpData, population){ dplyr::group_by(.data$covariateId) %>% dplyr::summarise(prev.out = 1.0*sum(.data$outcomeCount==1, na.rm = TRUE)/outCount, prev.noout = 1.0*sum(.data$outcomeCount==0, na.rm = TRUE)/nonOutCount) %>% - dplyr::select(.data$covariateId, .data$prev.out, .data$prev.noout) + dplyr::select("covariateId", "prev.out", "prev.noout") #clear up data ##plpData$covariateData$population <- NULL diff --git a/R/CalibrationSummary.R b/R/CalibrationSummary.R index fe8d297c2..e8094350b 100644 --- a/R/CalibrationSummary.R +++ b/R/CalibrationSummary.R @@ -146,7 +146,7 @@ getCalibrationSummary_survival <- function( } else{ gval <- numberOfStrata } - groups<-Hmisc::cut2(predictionOfInterest$value,g=gval) + groups<-cut2(predictionOfInterest$value,g=gval) n.groups<-length(levels(groups)) pred<-tapply(predictionOfInterest$value,groups,mean) sizesN<-tapply(predictionOfInterest$value,groups,length) @@ -199,3 +199,4 @@ getCalibrationSummary_survival <- function( return(result) } + diff --git a/R/CovariateSummary.R b/R/CovariateSummary.R index b344caa7d..6972441a5 100644 --- a/R/CovariateSummary.R +++ b/R/CovariateSummary.R @@ -66,6 +66,28 @@ covariateSummary <- function( strata = strata ) + # apply feature engineering + if(!is.null(featureEngineering)){ + + # create copy of covariateData + newCovariateData <- Andromeda::andromeda( + covariateRef = covariateData$covariateRef, + analysisRef = covariateData$analysisRef, + covariates = covariateData$covariates + ) + covariateData <- newCovariateData + + if(!is.null(featureEngineering$funct)){ + featureEngineering <- list(featureEngineering) + } + + for(fe in featureEngineering){ + feSettings <- fe$settings + feSettings$trainData = list(covariateData = 
covariateData) + covariateData <- do.call(fe$funct, feSettings)$covariateData + } + } + # make this run in parallel for big speed improvements.. covariateSummariesPerStrata <- lapply(subsetList, function(x){ @@ -114,10 +136,10 @@ aggregateCovariateSummaries <- function( ParallelLogger::logInfo('Aggregating with no labels or strata') result <- covariateSummariesPerStrata %>% dplyr::select( - .data$covariateId, - .data$CovariateCount, - .data$CovariateMean, - .data$CovariateStDev, + "covariateId", + "CovariateCount", + "CovariateMean", + "CovariateStDev", ) } @@ -126,11 +148,11 @@ aggregateCovariateSummaries <- function( ParallelLogger::logInfo('Aggregating with only labels or strata') resultLabels <- covariateSummariesPerStrata %>% dplyr::select( - .data$group, - .data$covariateId, - .data$CovariateCount, - .data$CovariateMean, - .data$CovariateStDev, + "group", + "covariateId", + "CovariateCount", + "CovariateMean", + "CovariateStDev", ) resultLabels <- tidyr::pivot_longer( @@ -142,7 +164,7 @@ aggregateCovariateSummaries <- function( resultLabels <- resultLabels %>% dplyr::mutate(group_variable = paste(.data$group, .data$variable, sep ='_')) %>% - dplyr::select(-.data$group, -.data$variable) + dplyr::select(-"group", -"variable") resultLabels <- tidyr::pivot_wider( data = resultLabels, @@ -176,11 +198,11 @@ aggregateCovariateSummaries <- function( # labels and strata resultLabelStratas <- covariateSummariesPerStrata %>% dplyr::select( - .data$group, - .data$covariateId, - .data$CovariateCount, - .data$CovariateMean, - .data$CovariateStDev, + "group", + "covariateId", + "CovariateCount", + "CovariateMean", + "CovariateStDev", ) resultLabelStratas <- tidyr::pivot_longer( @@ -192,7 +214,7 @@ aggregateCovariateSummaries <- function( resultLabelStratas <- resultLabelStratas %>% dplyr::mutate(group_variable = paste(.data$group, .data$variable, sep ='_')) %>% - dplyr::select(-.data$group, -.data$variable) + dplyr::select(-"group", -"variable") resultLabelStratas <- 
tidyr::pivot_wider( data = resultLabelStratas, @@ -220,11 +242,11 @@ aggregateCovariateSummaries <- function( CovariateStDev = sqrt(sum(.data$sumSquares)/sum(.data$N) - (sum(.data$sumVal)/sum(.data$N))^2 ) ) %>% dplyr::select( - .data$groupLabel, - .data$covariateId, - .data$CovariateCount, - .data$CovariateMean, - .data$CovariateStDev + "groupLabel", + "covariateId", + "CovariateCount", + "CovariateMean", + "CovariateStDev" ) resultLabels <- tidyr::pivot_longer( @@ -236,7 +258,7 @@ aggregateCovariateSummaries <- function( resultLabels <- resultLabels %>% dplyr::mutate(group_variable = paste(.data$groupLabel, .data$variable, sep ='_')) %>% - dplyr::select(-.data$groupLabel, -.data$variable) + dplyr::select(-"groupLabel", -"variable") resultLabels <- tidyr::pivot_wider( data = resultLabels, @@ -304,7 +326,7 @@ createCovariateSubsets <- function( ParallelLogger::logInfo(paste0('calculating subset of strata ',i)) subset <- cohort %>% dplyr::filter(.data$finalStrata == finalStratas[[i]]) %>% - dplyr::select(.data$rowId) + dplyr::select("rowId") result[[i]] <- list( subset = subset, diff --git a/R/CyclopsModels.R b/R/CyclopsModels.R index 54f6065f8..16f035164 100644 --- a/R/CyclopsModels.R +++ b/R/CyclopsModels.R @@ -19,11 +19,13 @@ fitCyclopsModel <- function( trainData, - param, + modelSettings, # old:param, search='adaptive', analysisId, ...){ + param <- modelSettings$param + # check plpData is coo format: if (!FeatureExtraction::isCovariateData(trainData$covariateData)){ stop("Needs correct covariateData") @@ -36,11 +38,26 @@ fitCyclopsModel <- function( y = sapply(.data$outcomeCount, function(x) min(1,x)), time = .data$survivalTime ) - - covariates <- filterCovariateIds(param, trainData$covariateData) - start <- Sys.time() + covariates <- filterCovariateIds(param, trainData$covariateData) + if (!is.null(param$priorCoefs)) { + sourceCoefs <- param$priorCoefs %>% + dplyr::filter(abs(.data$betas)>0 & .data$covariateIds != "(Intercept)") + + newCovariates <- covariates 
%>% + dplyr::filter(.data$covariateId %in% !!sourceCoefs$covariateIds) %>% + dplyr::mutate(newCovariateId = .data$covariateId*-1) %>% + dplyr::select(-"covariateId") %>% + dplyr::rename(covariateId = .data$newCovariateId) %>% + dplyr::collect() + + Andromeda::appendToTable(covariates, newCovariates) + + } + + start <- Sys.time() + cyclopsData <- Cyclops::convertToCyclopsData( outcomes = trainData$covariateData$labels, covariates = covariates, @@ -50,6 +67,20 @@ fitCyclopsModel <- function( normalize = NULL, quiet = TRUE ) + + if (!is.null(param$priorCoefs)) { + fixedCoefficients <- c(FALSE, + rep(TRUE, nrow(sourceCoefs)), + rep(FALSE, length(cyclopsData$coefficientNames)-(nrow(sourceCoefs)+1))) + + startingCoefficients <- rep(0, length(fixedCoefficients)) + + # skip intercept index + startingCoefficients[2:(nrow(sourceCoefs)+1)] <- sourceCoefs$betas + } else { + startingCoefficients <- NULL + fixedCoefficients <- NULL + } if(settings$crossValidationInPrior){ param$priorParams$useCrossValidation <- max(trainData$folds$index)>1 @@ -69,7 +100,8 @@ fitCyclopsModel <- function( selectorType = settings$selectorType, noiseLevel = "silent", threads = settings$threads, - maxIterations = settings$maxIterations + maxIterations = settings$maxIterations, + seed = settings$seed ) fit <- tryCatch({ @@ -77,7 +109,9 @@ fitCyclopsModel <- function( Cyclops::fitCyclopsModel( cyclopsData = cyclopsData, prior = prior, - control = control + control = control, + fixedCoefficients = fixedCoefficients, + startingCoefficients = startingCoefficients )}, finally = ParallelLogger::logInfo('Done.') ) @@ -88,15 +122,21 @@ fitCyclopsModel <- function( finally = ParallelLogger::logInfo('Done.')) } + modelTrained <- createCyclopsModel( fit = fit, modelType = settings$modelType, useCrossValidation = max(trainData$folds$index)>1, cyclopsData = cyclopsData, labels = trainData$covariateData$labels, - folds = trainData$folds + folds = trainData$folds, + priorType = param$priorParams$priorType ) - + + if 
(!is.null(param$priorCoefs)) { + modelTrained$coefficients <- reparamTransferCoefs(modelTrained$coefficients) + } + # TODO get optimal lambda value ParallelLogger::logTrace('Returned from fitting to LassoLogisticRegression') comp <- Sys.time() - start @@ -116,7 +156,7 @@ fitCyclopsModel <- function( prediction$evaluationType <- 'Train' # get cv AUC if exists - cvPerFold <- c() + cvPerFold <- data.frame() if(!is.null(modelTrained$cv)){ cvPrediction <- do.call(rbind, lapply(modelTrained$cv, function(x){x$predCV})) cvPrediction$evaluationType <- 'CV' @@ -127,7 +167,17 @@ fitCyclopsModel <- function( cvPerFold <- unlist(lapply(modelTrained$cv, function(x){x$out_sample_auc})) if(length(cvPerFold)>0){ - names(cvPerFold) <- paste0('fold_auc', 1:length(cvPerFold)) + cvPerFold <- data.frame( + metric = 'AUC', + fold = 1:length(cvPerFold), + value = cvPerFold, + startingVariance = ifelse(is.null(param$priorParams$variance), 'NULL', param$priorParams$variance), + lowerLimit = ifelse(is.null(param$lowerLimit), 'NULL', param$lowerLimit), + upperLimit = ifelse(is.null(param$upperLimit), 'NULL', param$upperLimit), + tolerance = ifelse(is.null(settings$tolerance), 'NULL', settings$tolerance) + ) + } else{ + cvPerFold <- data.frame() } # remove the cv from the model: @@ -136,39 +186,41 @@ fitCyclopsModel <- function( result <- list( model = modelTrained, + + preprocessing = list( + featureEngineering = attr(trainData, "metaData")$featureEngineering,#learned mapping + tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, #learned mapping + requireDenseMatrix = F + ), + prediction = prediction, - settings = list( - plpDataSettings = attr(trainData, "metaData")$plpDataSettings, + modelDesign = PatientLevelPrediction::createModelDesign( + targetId = attr(trainData, "metaData")$targetId, # added + outcomeId = attr(trainData, "metaData")$outcomeId, # added + restrictPlpDataSettings = attr(trainData, "metaData")$restrictPlpDataSettings, # made this 
restrictPlpDataSettings covariateSettings = attr(trainData, "metaData")$covariateSettings, - featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, - tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, - covariateMap = NULL, - requireDenseMatrix = F, - populationSettings = attr(trainData, "metaData")$populationSettings, - modelSettings = list( - model = settings$modelType, - param = param, - finalModelParameters = list( - variance = modelTrained$priorVariance, - log_likelihood = modelTrained$log_likelihood - ), - extraSettings = attr(param, 'settings') - ), + populationSettings = attr(trainData, "metaData")$populationSettings, + featureEngineeringSettings = attr(trainData, "metaData")$featureEngineeringSettings, + preprocessSettings = attr(trainData$covariateData, "metaData")$preprocessSettings, + modelSettings = modelSettings, #modified splitSettings = attr(trainData, "metaData")$splitSettings, sampleSettings = attr(trainData, "metaData")$sampleSettings - - ), trainDetails = list( - analysisId = analysisId, - cdmDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, - outcomeId = attr(trainData, "metaData")$outcomeId, - cohortId = attr(trainData, "metaData")$cohortId, + analysisId = analysisId, + analysisSource = '', #TODO add from model + developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName, + developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, attrition = attr(trainData, "metaData")$attrition, - trainingTime = comp, + trainingTime = paste(as.character(abs(comp)), attr(comp,'units')), trainingDate = Sys.Date(), + modelName = settings$modelType, + finalModelParameters = list( + variance = modelTrained$priorVariance, + log_likelihood = modelTrained$log_likelihood + ), hyperParamSearch = cvPerFold ), @@ -213,20 +265,20 @@ predictCyclops <- function(plpModel, data, cohort ) { # survival cyclops use baseline hazard to convert to risk from exp(LP) to 1-S^exp(LP) 
if(attr(plpModel, 'modelType') == 'survival'){ - if(!is.null(plpModel$model$baselineHazard)){ + if(!is.null(plpModel$model$baselineSurvival)){ if(is.null(attr(cohort, 'timepoint'))){ timepoint <- attr(cohort,'metaData')$populationSettings$riskWindowEnd } else{ timepoint <- attr(cohort, 'timepoint') } - bhind <- which.min(abs(plpModel$model$baselineHazard$time-timepoint)) - #prediction$value <- 1-plpModel$model$baselineHazard$surv[bhind]^prediction$value - prediction$value <- (1-plpModel$model$baselineHazard$surv[bhind])*prediction$value + bhind <- which.min(abs(plpModel$model$baselineSurvival$time-timepoint)) + # 1- baseline survival(time)^ (exp(betas*values)) + prediction$value <- 1-plpModel$model$baselineSurvival$surv[bhind]^prediction$value metaData <- list() - metaData$baselineHazardTimepoint <- plpModel$model$baselineHazard$time[bhind] - metaData$baselineHazard <- plpModel$model$baselineHazard$surv[bhind] + metaData$baselineSurvivalTimepoint <- plpModel$model$baselineSurvival$time[bhind] + metaData$baselineSurvival <- plpModel$model$baselineSurvival$surv[bhind] metaData$offset <- 0 attr(prediction, 'metaData') <- metaData @@ -246,11 +298,11 @@ predictCyclopsType <- function(coefficients, population, covariateData, modelTyp stop("Needs correct covariateData") } - intercept <- coefficients[names(coefficients)%in%'(Intercept)'] + intercept <- coefficients$betas[coefficients$covariateId%in%'(Intercept)'] if(length(intercept)==0) intercept <- 0 - coefficients <- coefficients[!names(coefficients)%in%'(Intercept)'] - coefficients <- data.frame(beta = as.numeric(coefficients), - covariateId = as.numeric(names(coefficients)) #!@ modified + betas <- coefficients$betas[!coefficients$covariateIds%in%'(Intercept)'] + coefficients <- data.frame(beta = betas, + covariateId = coefficients$covariateIds[coefficients$covariateIds!='(Intercept)'] ) coefficients <- coefficients[coefficients$beta != 0, ] if(sum(coefficients$beta != 0)>0){ @@ -262,7 +314,7 @@ predictCyclopsType <- 
function(coefficients, population, covariateData, modelTyp dplyr::mutate(values = .data$covariateValue*.data$beta) %>% dplyr::group_by(.data$rowId) %>% dplyr::summarise(value = sum(.data$values, na.rm = TRUE)) %>% - dplyr::select(.data$rowId, .data$value) + dplyr::select("rowId", "value") prediction <- as.data.frame(prediction) prediction <- merge(population, prediction, by ="rowId", all.x = TRUE, fill = 0) @@ -294,17 +346,21 @@ predictCyclopsType <- function(coefficients, population, covariateData, modelTyp } -createCyclopsModel <- function(fit, modelType, useCrossValidation, cyclopsData, labels, folds){ +createCyclopsModel <- function(fit, modelType, useCrossValidation, cyclopsData, labels, folds, + priorType){ if (is.character(fit)) { coefficients <- c(0) + names(coefficients) <- '' status <- fit } else if (fit$return_flag == "ILLCONDITIONED") { coefficients <- c(0) + names(coefficients) <- '' status <- "ILL CONDITIONED, CANNOT FIT" ParallelLogger::logWarn(paste("GLM fitting issue: ", status)) } else if (fit$return_flag == "MAX_ITERATIONS") { coefficients <- c(0) + names(coefficients) <- '' status <- "REACHED MAXIMUM NUMBER OF ITERATIONS, CANNOT FIT" ParallelLogger::logWarn(paste("GLM fitting issue: ", status)) } else { @@ -313,27 +369,33 @@ createCyclopsModel <- function(fit, modelType, useCrossValidation, cyclopsData, ParallelLogger::logInfo(paste("GLM fit status: ", status)) } + # use a dataframe for the coefficients + betas <- as.numeric(coefficients) + betaNames <- names(coefficients) + coefficients <- data.frame(betas=betas, covariateIds=betaNames) + outcomeModel <- list( - coefficients = coefficients, priorVariance = fit$variance, log_likelihood = fit$log_likelihood, modelType = modelType, - modelStatus = status + modelStatus = status, + coefficients = coefficients ) if(modelType == "cox" || modelType == "survival") { - baselineHazard <- tryCatch({survival::survfit(fit, type = "aalen")}, + baselineSurvival <- tryCatch({survival::survfit(fit, type = 
"aalen")}, error = function(e) {ParallelLogger::logInfo(e); return(NULL)}) - if(is.null(baselineHazard)){ + if(is.null(baselineSurvival)){ ParallelLogger::logInfo('No baseline hazard function returned') } - outcomeModel$baselineHazard <- baselineHazard + outcomeModel$baselineSurvival <- baselineSurvival } class(outcomeModel) <- "plpModel" - #get CV - if(modelType == "logistic" && useCrossValidation){ - outcomeModel$cv <- getCV(cyclopsData, labels, cvVariance = fit$variance, folds = folds) + #get CV - added && status == "OK" to only run if the model fit sucsessfully + if(modelType == "logistic" && useCrossValidation && status == "OK"){ + outcomeModel$cv <- getCV(cyclopsData, labels, cvVariance = fit$variance, folds = folds, + priorType = priorType) } return(outcomeModel) @@ -366,10 +428,13 @@ getCV <- function( cyclopsData, labels, cvVariance, - folds + folds, + priorType ) { - fixed_prior <- Cyclops::createPrior("laplace", variance = cvVariance, useCrossValidation = FALSE) + fixed_prior <- Cyclops::createPrior(priorType = priorType, + variance = cvVariance, + useCrossValidation = FALSE) # add the index to the labels labels <- merge(labels, folds, by = 'rowId') @@ -402,8 +467,8 @@ getCV <- function( getVariableImportance <- function(modelTrained, trainData){ varImp <- data.frame( - covariateId = as.double(names(modelTrained$coefficients)[names(modelTrained$coefficients)!='(Intercept)']), - value = modelTrained$coefficients[names(modelTrained$coefficients)!='(Intercept)'] + covariateId = as.double(modelTrained$coefficients$covariateIds[modelTrained$coefficients$covariateIds!='(Intercept)']), + value = modelTrained$coefficients$betas[modelTrained$coefficients$covariateIds!='(Intercept)'] ) if(sum(abs(varImp$value)>0)==0){ @@ -420,7 +485,7 @@ if(sum(abs(varImp$value)>0)==0){ #dplyr::left_join(trainData$covariateData$varImp) %>% dplyr::left_join(varImp, by = 'covariateId') %>% dplyr::mutate(covariateValue = ifelse(is.na(.data$value), 0, .data$value)) %>% - 
dplyr::select(-.data$value) %>% + dplyr::select(-"value") %>% dplyr::arrange(-abs(.data$covariateValue)) %>% dplyr::collect() } @@ -446,3 +511,19 @@ filterCovariateIds <- function(param, covariateData){ } return(covariates) } + +reparamTransferCoefs <- function(inCoefs) { + transferCoefs <- inCoefs %>% + dplyr::filter(grepl("-", .data$covariateIds)) + + transferCoefs$covariateIds <- substring(transferCoefs$covariateIds, 2) + + originalCoefs <- inCoefs %>% + dplyr::filter(!grepl("-", .data$covariateIds)) + + coefs <- rbind(originalCoefs, transferCoefs) + coefs <- rowsum(coefs$betas, coefs$covariateIds) + coefs <- data.frame(betas = coefs, covariateIds = rownames(coefs), row.names = NULL) + + return(coefs) +} \ No newline at end of file diff --git a/R/CyclopsSettings.R b/R/CyclopsSettings.R index 9bc3766d4..fb688f03e 100644 --- a/R/CyclopsSettings.R +++ b/R/CyclopsSettings.R @@ -10,6 +10,7 @@ #' @param lowerLimit Numeric: Lower prior variance limit for grid-search #' @param tolerance Numeric: maximum relative change in convergence criterion from successive iterations to achieve convergence #' @param maxIterations Integer: maximum iterations of Cyclops to attempt before returning a failed-to-converge error +#' @param priorCoefs Use coefficients from a previous model as starting points for model fit (transfer learning) #' #' @examples #' model.lr <- setLassoLogisticRegression() @@ -24,7 +25,9 @@ setLassoLogisticRegression<- function( upperLimit = 20, lowerLimit = 0.01, tolerance = 2e-06, - maxIterations = 3000){ + maxIterations = 3000, + priorCoefs = NULL + ){ checkIsClass(seed, c('numeric','NULL','integer')) if(is.null(seed[1])){ @@ -48,7 +51,8 @@ setLassoLogisticRegression<- function( ), includeCovariateIds = includeCovariateIds, upperLimit = upperLimit, - lowerLimit = lowerLimit + lowerLimit = lowerLimit, + priorCoefs = priorCoefs ) attr(param, 'settings') <- list( @@ -107,8 +111,6 @@ setCoxModel <- function( maxIterations = 3000 ){ - ensure_installed("survAUC") - 
checkIsClass(seed, c('numeric','NULL','integer')) if(is.null(seed[1])){ seed <- as.integer(sample(100000000,1)) @@ -204,7 +206,7 @@ setIterativeHardThresholding<- function( stop("forceIntercept must be of type: logical") if(!is.logical(fitBestSubset)) stop("fitBestSubset must be of type: logical") - if(!class(seed)%in%c('numeric','NULL','integer')) + if(!inherits(x = seed, what = c('numeric','NULL','integer'))) stop('Invalid seed') diff --git a/R/DataSplitting.R b/R/DataSplitting.R index 0c5677759..93c7a060e 100644 --- a/R/DataSplitting.R +++ b/R/DataSplitting.R @@ -17,7 +17,7 @@ #' Create the settings for defining how the plpData are split into test/validation/train sets using -#' default splitting functions (either random stratified by outcome, time or subject splitting). +#' default splitting functions (either random stratified by outcome, time or subject splitting) #' #' @details #' Returns an object of class \code{splitSettings} that specifies the splitting function that will be called and the settings @@ -28,9 +28,9 @@ #' @param nfold (numeric) An integer > 1 specifying the number of folds used in cross validation #' @param splitSeed (numeric) A seed to use when splitting the data for reproducibility (if not set a random number will be generated) #' @param type (character) Choice of: \itemize{ -#' \item{'stratified'}{ Each data point is randomly assigned into the test or a train fold set but this is done stratified such that the outcome rate is consistent in each partition } -#' \item{'time')}{ Older data are assigned into the training set and newer data are assigned into the test set} -#' \item{'subject'}{ Data are partitioned by subject, if a subject is in the data more than once, all the data points for the subject are assigned either into the test data or into the train data (not both).} +#' \item'stratified' Each data point is randomly assigned into the test or a train fold set but this is done stratified such that the outcome rate is consistent in each 
partition +#' \item'time' Older data are assigned into the training set and newer data are assigned into the test set +#' \item'subject' Data are partitioned by subject, if a subject is in the data more than once, all the data points for the subject are assigned either into the test data or into the train data (not both). #' } #' #' @return @@ -87,17 +87,17 @@ createDefaultSplitSetting <- function(testFraction=0.25, #' #' @details #' Returns a list containing the training data (Train) and optionally the test data (Test). Train is an Andromeda object containing -#' \itemize{\item{covariates}{ a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the train data } -#' \item{covariateRef}{ a table with the covariate information} -#' \item{labels)}{ a table (rowId, outcomeCount, ...) for each data point in the train data (outcomeCount is the class label) } -#' \item{folds}{ a table (rowId, index) specifying which training fold each data point is in.} +#' \itemize{\item covariates: a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the train data +#' \item covariateRef: a table with the covariate information +#' \item labels: a table (rowId, outcomeCount, ...) for each data point in the train data (outcomeCount is the class label) +#' \item folds: a table (rowId, index) specifying which training fold each data point is in. #' } #' Test is an Andromeda object containing -#' \itemize{\item{covariates}{ a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the test data } -#' \item{covariateRef}{ a table with the covariate information} -#' \item{labels)}{ a table (rowId, outcomeCount, ...) 
for each data point in the test data (outcomeCount is the class label) } +#' \itemize{\item covariates: a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the test data +#' \item covariateRef: a table with the covariate information +#' \item labels: a table (rowId, outcomeCount, ...) for each data point in the test data (outcomeCount is the class label) #' } -#' +#' #' #' #' @param plpData An object of type \code{plpData} - the patient level prediction @@ -146,9 +146,11 @@ splitData <- function(plpData = plpData, #trainData$covariateData$covariateRef <- plpData$covariateRef attr(trainData, "metaData") <- list( outcomeId = attr(population, "metaData")$outcomeId, - cohortId = attr(population, "metaData")$cohortId, + targetId = attr(population, "metaData")$targetId, cdmDatabaseSchema = plpData$metaData$databaseDetails$cdmDatabaseSchema, - plpDataSettings = attr(population, "metaData")$plpDataSettings, + cdmDatabaseName = plpData$metaData$databaseDetails$cdmDatabaseName, + cdmDatabaseId = plpData$metaData$databaseDetails$cdmDatabaseId, + restrictPlpDataSettings = attr(population, "metaData")$restrictPlpDataSettings, covariateSettings = plpData$metaData$covariateSettings, populationSettings = attr(population, "metaData")$populationSettings, attrition = attr(population, "metaData")$attrition, @@ -183,9 +185,11 @@ splitData <- function(plpData = plpData, } attr(trainData, "metaData") <- list( outcomeId = attr(population, "metaData")$outcomeId, - cohortId = attr(population, "metaData")$cohortId, + targetId = attr(population, "metaData")$targetId, cdmDatabaseSchema = plpData$metaData$databaseDetails$cdmDatabaseSchema, - plpDataSettings = attr(population, "metaData")$plpDataSettings, + cdmDatabaseName = plpData$metaData$databaseDetails$cdmDatabaseName, + cdmDatabaseId = plpData$metaData$databaseDetails$cdmDatabaseId, + restrictPlpDataSettings = attr(population, "metaData")$restrictPlpDataSettings, covariateSettings = 
plpData$metaData$covariateSettings, populationSettings = attr(population, "metaData")$populationSettings, attrition = attr(population, "metaData")$attrition, @@ -193,10 +197,6 @@ splitData <- function(plpData = plpData, populationSize = nrow(trainData$labels) ) - # add pop size to covariateData as used in tidyCovariates - attr(trainData$covariateData, "metaData") <- list(populationSize = nrow(trainData$labels)) - class(trainData$covariateData) <- "CovariateData" - testId <- splitId[splitId$index<0,] testData <- list() class(testData) <- 'plpData' @@ -212,7 +212,6 @@ splitData <- function(plpData = plpData, data.frame(rowId = testId$rowId), sizeN = 10000000) } - class(testData$covariateData) <- "CovariateData" result <- list( Train = trainData, diff --git a/R/DatabaseMigration.R b/R/DatabaseMigration.R new file mode 100644 index 000000000..bc480ce32 --- /dev/null +++ b/R/DatabaseMigration.R @@ -0,0 +1,68 @@ +# @file DatabaseMigration.R +# +# Copyright 2023 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitatons under the License. +# +#' Migrate Data model +#' @description +#' Migrate data from current state to next state +#' +#' It is strongly advised that you have a backup of all data (either sqlite files, a backup database (in the case you +#' are using a postgres backend) or have kept the csv/zip files from your data generation. 
+#' +#' @param connectionDetails DatabaseConnector connection details object +#' @param databaseSchema String schema where database schema lives +#' @param tablePrefix (Optional) Use if a table prefix is used before table names (e.g. "cd_") +#' +#' @export +migrateDataModel <- function(connectionDetails, databaseSchema, tablePrefix = "") { + ParallelLogger::logInfo("Migrating data set") + + migrator <- getDataMigrator( + connectionDetails = connectionDetails, + databaseSchema = databaseSchema, + tablePrefix = tablePrefix + ) + migrator$executeMigrations() + migrator$finalize() + + ParallelLogger::logInfo("Updating version number") + updateVersionSql <- SqlRender::loadRenderTranslateSql( + "UpdateVersionNumber.sql", + packageName = utils::packageName(), + database_schema = databaseSchema, + table_prefix = tablePrefix, + version_number = utils::packageVersion("PatientLevelPrediction"), + dbms = "sql server" # this is the same for all dbms so just using sql server + ) + + connection <- DatabaseConnector::connect(connectionDetails = connectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + DatabaseConnector::executeSql(connection, updateVersionSql) +} + + +getDataMigrator <- function(connectionDetails, databaseSchema, tablePrefix = "") { + ensure_installed("ResultModelManager") + + ResultModelManager::DataMigrationManager$new( + connectionDetails = connectionDetails, + databaseSchema = databaseSchema, + tablePrefix = tablePrefix, + migrationPath = "migrations", + packageName = utils::packageName() + ) +} \ No newline at end of file diff --git a/R/DemographicSummary.R b/R/DemographicSummary.R index 908e5e2de..b3df14305 100644 --- a/R/DemographicSummary.R +++ b/R/DemographicSummary.R @@ -44,7 +44,7 @@ getDemographicSummary_binary <- function(prediction, evalColumn , ...){ ageGroup = paste0('Age group: ', floor(.data$ageYear/5)*5, '-',floor(.data$ageYear/5)*5+4), genId = .data$gender, genGroup = ifelse(.data$gender==8507, 'Male', 'Female')) %>% - 
dplyr::select(.data$rowId,.data$ageId,.data$ageGroup,.data$genId,.data$genGroup ) %>% + dplyr::select("rowId","ageId","ageGroup","genId","genGroup") %>% dplyr::inner_join(predictionOfInterest[,colnames(predictionOfInterest)%in%c('rowId', 'value','outcomeCount','survivalTime')], by='rowId') demographicData <- demographicData %>% @@ -88,7 +88,7 @@ getDemographicSummary_survival <- function(prediction, evalColumn, timepoint = N ageGroup = paste0('Age group: ', floor(.data$ageYear/5)*5, '-',floor(.data$ageYear/5)*5+4), genId = .data$gender, genGroup = ifelse(.data$gender==8507, 'Male', 'Female')) %>% - dplyr::select(.data$rowId,.data$ageId,.data$ageGroup,.data$genId,.data$genGroup ) %>% + dplyr::select("rowId","ageId","ageGroup","genId","genGroup" ) %>% dplyr::inner_join(predictionOfInterest[,colnames(predictionOfInterest)%in%c('rowId', 'value','outcomeCount','survivalTime')], by='rowId') @@ -111,30 +111,30 @@ getDemographicSummary_survival <- function(prediction, evalColumn, timepoint = N tempDemo <- demographicSum %>% dplyr::filter( .data$genGroup == gen & .data$ageGroup == age ) - if(nrow(tempDemo)>0){ - t1 <- tempDemo %>% dplyr::select(.data$t) - y1 <- tempDemo %>% dplyr::select(.data$y) - p1 <- tempDemo %>% dplyr::select(.data$value) + if (nrow(tempDemo) > 1 & length(unique(tempDemo$y)) > 1) { + t <- tempDemo$t + y <- tempDemo$y + value <- tempDemo$value out <- tryCatch( { summary( - survival::survfit(survival::Surv(t1$t, y1$y) ~ 1), + survival::survfit(survival::Surv(t, y) ~ 1), times = timepoint ) }, - error = function(e){ParallelLogger::logError(e); return(NULL)} + error = function(e){ParallelLogger::logError(e);return(NULL)} ) if(!is.null(out)){ demoTemp <- c( genGroup = gen, ageGroup = age, - PersonCountAtRisk = length(p1$value), - PersonCountWithOutcome = round(length(p1$value)*(1-out$surv)), + PersonCountAtRisk = length(value), + PersonCountWithOutcome = round(length(value)*(1-out$surv)), observedRisk = 1-out$surv, - averagePredictedProbability = 
mean(p1$value, na.rm = T), - StDevPredictedProbability = stats::sd(p1$value, na.rm = T) + averagePredictedProbability = mean(value, na.rm = T), + StDevPredictedProbability = stats::sd(value, na.rm = T) ) demographicData <- rbind(demographicData, demoTemp) diff --git a/R/DiagnosePlp.R b/R/DiagnosePlp.R new file mode 100644 index 000000000..dee4bba82 --- /dev/null +++ b/R/DiagnosePlp.R @@ -0,0 +1,933 @@ +# @file Diagnostics.R +# +# Copyright 2021 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' Run a list of predictions diagnoses +#' +#' @details +#' This function will run all specified prediction design diagnoses as defined using . +#' +#' @param databaseDetails The database settings created using \code{createDatabaseDetails()} +#' @param modelDesignList A list of model designs created using \code{createModelDesign()} +#' @param cohortDefinitions A list of cohort definitions for the target and outcome cohorts +#' @param logSettings The setting spexcifying the logging for the analyses created using \code{createLogSettings()} +#' @param saveDirectory Name of the folder where all the outputs will written to. 
+#' +#' @return +#' A data frame with the following columns: \tabular{ll}{ \verb{analysisId} \tab The unique identifier +#' for a set of analysis choices.\cr \verb{targetId} \tab The ID of the target cohort populations.\cr +#' \verb{outcomeId} \tab The ID of the outcomeId.\cr \verb{dataLocation} \tab The location where the plpData was saved +#' \cr \verb{the settings ids} \tab The ids for all other settings used for model development.\cr } +#' +#' @export +diagnoseMultiplePlp <- function( + databaseDetails = createDatabaseDetails(), + modelDesignList = list( + createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()), + createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression()) + ), + cohortDefinitions = NULL, + logSettings = createLogSettings( + verbosity = "DEBUG", + timeStamp = T, + logName = "diagnosePlp Log" + ), + saveDirectory = getwd() +){ + + #input checks + checkIsClass(databaseDetails, c('databaseDetails')) + checkIsClass(modelDesignList, c('list', 'modelDesign')) + checkIsClass(logSettings, 'logSettings') + checkIsClass(saveDirectory, 'character') + if(!dir.exists(saveDirectory)){ + dir.create(saveDirectory, recursive = T) + } + + if(is.null(cohortDefinitions)){ + + cohortIds <- unlist( + lapply( + X = 1:length(modelDesignList), + FUN = function(i){ + c( + modelDesignList[[i]]$targetId, + modelDesignList[[i]]$outcomeId + ) + } + ) + ) + + cohortDefinitions <- lapply( + X = cohortIds, + FUN = function(x){ + list( + id = x, + name = paste0('Cohort: ', x) + ) + } + ) + + } + + settingstable <- convertToJson(modelDesignList,cohortDefinitions) # from runMultiplePlp.R + + if(nrow(settingstable) != length(modelDesignList)){ + stop('Error in settingstable') + } + + # save the settings: TODO fix + utils::write.csv( + x = settingstable %>% dplyr::select( + "analysisId", + "targetId", + "targetName", + "outcomeId", + "outcomeName", + "dataLocation" + ), + file.path(saveDirectory,'settings.csv'), 
+ row.names = F + ) + + # group the outcomeIds per combination of data extraction settings + dataSettings <- settingstable %>% + dplyr::group_by( + .data$targetId, + .data$covariateSettings, + .data$restrictPlpDataSettings, + .data$dataLocation + ) %>% + dplyr::summarise( + outcomeIds = paste(unique(.data$outcomeId), collapse = ',') + ) + + # extract data + for(i in 1:nrow(as.data.frame(dataSettings))){ + dataExists <- length(dir(file.path(saveDirectory, dataSettings$dataLocation[i])))>0 + if(!dataExists){ + ParallelLogger::logInfo(paste('Extracting data for cohort', dataSettings$targetId[i], 'to', file.path(saveDirectory, dataSettings$dataLocation[i]))) + + databaseDetails$targetId <- dataSettings$targetId[i] + databaseDetails$outcomeIds <- strsplit(dataSettings$outcomeIds[i], ',')[[1]] + + plpDataSettings <- list( + databaseDetails = databaseDetails, + covariateSettings = ParallelLogger::convertJsonToSettings(dataSettings$covariateSettings[i]), + restrictPlpDataSettings = ParallelLogger::convertJsonToSettings(dataSettings$restrictPlpDataSettings[i]) + ) + + plpData <- tryCatch( + {do.call(getPlpData, plpDataSettings)}, + error = function(e){ParallelLogger::logInfo(e); return(NULL)} + ) + if(!is.null(plpData)){ + savePlpData(plpData, file.path(saveDirectory, dataSettings$dataLocation[i])) + } + } else{ + ParallelLogger::logInfo(paste('Data for cohort', dataSettings$targetId[i], 'exists at', file.path(saveDirectory, dataSettings$dataLocation[i]))) + } + } + + # diagnosePlp + for(i in 1:nrow(as.data.frame(settingstable))){ + modelDesign <- modelDesignList[[i]] + settings <- settingstable[i,] # just the data locations? 
+ + dataExists <- length(dir(file.path(saveDirectory, settings$dataLocation)))>0 + + if(dataExists){ + plpData <- PatientLevelPrediction::loadPlpData(file.path(saveDirectory, settings$dataLocation)) + + diagnoseExists <- file.exists(file.path(saveDirectory, settings$analysisId, 'diagnosePlp.rds')) + if(!diagnoseExists){ + + diagnosePlpSettings <- list( + plpData = plpData, + outcomeId = modelDesign$outcomeId, + analysisId = settings$analysisId, + populationSettings = modelDesign$populationSettings, + splitSettings = modelDesign$splitSettings, + sampleSettings = modelDesign$sampleSettings, + featureEngineeringSettings = modelDesign$featureEngineeringSettings, + preprocessSettings = modelDesign$preprocessSettings, + modelSettings = modelDesign$modelSettings, + logSettings = logSettings, + saveDirectory = saveDirectory + ) + + result <- tryCatch( + {do.call(diagnosePlp, diagnosePlpSettings)}, + error = function(e){ParallelLogger::logInfo(e); return(NULL)} + ) + } else{ + ParallelLogger::logInfo(paste('Diagnosis ', settings$analysisId, 'exists at', file.path(saveDirectory, settings$analysisId))) + } + } # end run per setting + + } + return(invisible(settingstable)) +} + + +#' diagnostic - Investigates the prediction problem settings - use before training a model +#' +#' @description +#' This function runs a set of prediction diagnoses to help pick a suitable T, O, TAR and determine +#' whether the prediction problem is worth executing. +#' +#' @details +#' Users can define set of Ts, Os, databases and population settings. A list of data.frames containing details such as +#' follow-up time distribution, time-to-event information, characteriszation details, time from last prior event, +#' observation time distribution. +#' +#' @param plpData An object of type \code{plpData} - the patient level prediction +#' data extracted from the CDM. Can also include an initial population as +#' plpData$popualtion. +#' @param outcomeId (integer) The ID of the outcome. 
+#' @param analysisId (integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.
+#' @param populationSettings An object of type \code{populationSettings} created using \code{createStudyPopulationSettings} that
+#' specifies how the data class labels are defined and additionally any exclusions to apply to the
+#' plpData cohort
+#' @param splitSettings An object of type \code{splitSettings} that specifies how to split the data into train/validation/test.
+#' The default settings can be created using \code{createDefaultSplitSetting}.
+#' @param sampleSettings An object of type \code{sampleSettings} that specifies any under/over sampling to be done.
+#' The default is none.
+#' @param featureEngineeringSettings An object of \code{featureEngineeringSettings} specifying any feature engineering to be learned (using the train data)
+#' @param preprocessSettings An object of \code{preprocessSettings}. This setting specifies the minimum fraction of
+#' target population who must have a covariate for it to be included in the model training
+#' and whether to normalise the covariates before training
+#' @param modelSettings An object of class \code{modelSettings} created using one of the functions:
+#' \itemize{
+#' \item setLassoLogisticRegression() A lasso logistic regression model
+#' \item setGradientBoostingMachine() A gradient boosting machine
+#' \item setAdaBoost() An ada boost model
+#' \item setRandomForest() A random forest model
+#' \item setDecisionTree() A decision tree model
+#' \item setKNN() A KNN model
+#'
+#' }
+#' @param logSettings An object of \code{logSettings} created using \code{createLogSettings}
+#' specifying how the logging is done
+#' @param saveDirectory The path to the directory where the results will be saved (if NULL uses working directory)
+#'
+#' @return
+#' An object containing the model or location where the model is saved, the data selection settings, the preprocessing
+#' and training 
settings as well as various performance measures obtained by the model. +#' +#' \item{distribution}{list for each O of a data.frame containing: i) Time to observation end distribution, ii) Time from observation start distribution, iii) Time to event distribution and iv) Time from last prior event to index distribution (only for patients in T who have O before index) } +#' \item{incident}{list for each O of incidence of O in T during TAR} +#' \item{characterization}{list for each O of Characterization of T, TnO, Tn~O} +#' +#' +#' @export +#' @examples +#' \dontrun{ +#' #******** EXAMPLE 1 ********* +#' } +diagnosePlp <- function( + plpData = NULL, + outcomeId, + analysisId, + populationSettings, + splitSettings = createDefaultSplitSetting(), + sampleSettings = createSampleSettings(), # default none + saveDirectory = NULL, + featureEngineeringSettings = createFeatureEngineeringSettings(), # default none + modelSettings = setLassoLogisticRegression(), # default to logistic regression + logSettings = createLogSettings( + verbosity = 'DEBUG', + timeStamp = T, + logName = 'diagnosePlp Log' + ), + preprocessSettings = createPreprocessSettings() +){ + + # start log + analysisPath <- file.path(saveDirectory, analysisId) + logSettings$saveDirectory <- analysisPath + logSettings$logFileName <- 'plpLog' + logger <- do.call(createLog,logSettings) + ParallelLogger::registerLogger(logger) + on.exit(closeLog(logger)) + + participantsDiag <- probastParticipants( + plpData, + outcomeId = outcomeId, + populationSettings = populationSettings + ) + + predictorDiag <- probastPredictors( + plpData, + outcomeId = outcomeId, + populationSettings = populationSettings + ) + + outcomeDiag <- probastOutcome( + plpData, + outcomeId = outcomeId, + populationSettings = populationSettings + ) + + designDiag <- probastDesign( + plpData, + outcomeId = outcomeId, + populationSettings = populationSettings + ) + + # Question: what about + # splitSettings, sampleSettings, + # FeatureEngineeringSettings, 
modelSettings + + result <- list( + summary = rbind( + participantsDiag$diagnosticParticipantsAggregate, + predictorDiag$diagnosticPredictorsAggregate, + outcomeDiag$diagnosticOutcomeAggregate, + designDiag$diagnosticDesignAggregate + ), + participants = participantsDiag$diagnosticParticipantsFull, + predictors = predictorDiag$diagnosticPredictorsFull, + outcomes = outcomeDiag$diagnosticOutcomeFull, + designs = designDiag$diagnosticDesignFull, + modelDesign = PatientLevelPrediction::createModelDesign( + targetId = attr(plpData$cohorts, "metaData")$targetId, + outcomeId = outcomeId, + restrictPlpDataSettings = plpData$metaData$restrictPlpDataSettings, + covariateSettings = plpData$metaData$covariateSettings, + populationSettings = populationSettings, + featureEngineeringSettings = featureEngineeringSettings, + preprocessSettings = preprocessSettings, + modelSettings = modelSettings, + splitSettings = splitSettings, + sampleSettings = sampleSettings + ), + databaseSchema = plpData$metaData$databaseDetails$cdmDatabaseSchema, + databaseId = plpData$metaData$databaseDetails$cdmDatabaseId + ) + + class(result) <- 'diagnosePlp' + + if(!is.null(saveDirectory)){ + if(!dir.exists(file.path(saveDirectory, analysisId))){ + dir.create(file.path(saveDirectory, analysisId), recursive = T) + } + saveLocation <- file.path(saveDirectory, analysisId, 'diagnosePlp.rds') + ParallelLogger::logInfo(paste0('Saving diagnosePlp to ', saveLocation)) + saveRDS(result, saveLocation) + } + + return(result) +} + +probastDesign <- function( + plpData, + outcomeId, + populationSettings +){ + + diagnosticAggregate <- c() + diagnosticFull <- c() + + population <- PatientLevelPrediction::createStudyPopulation( + plpData = plpData, + outcomeId = outcomeId, + populationSettings = populationSettings + ) + + probastId <- '4.1' + if(min(sum(population$outcomeCount > 0),nrow(population) - sum(population$outcomeCount > 0)) >= 1000){ + diagnosticAggregate <- rbind( + diagnosticAggregate, + c(probastId, 
'Pass') + ) + }else if((min(sum(population$outcomeCount > 0),nrow(population) - sum(population$outcomeCount > 0)) >= 100)){ + diagnosticAggregate <- rbind( + diagnosticAggregate, + c(probastId, 'Unknown') + ) + } else{ + diagnosticAggregate <- rbind( + diagnosticAggregate, + c(probastId, 'Fail') + ) + } + + if(!is.null(dim(diagnosticAggregate))){ + diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>% + dplyr::mutate_if(is.factor, as.character) + colnames(diagnosticAggregate) <- c('probastId','resultValue') + } + + return( + list + ( + diagnosticDesignFull = diagnosticFull, + diagnosticDesignAggregate = diagnosticAggregate + ) + ) + + +} + +probastParticipants <- function( + plpData, + outcomeId, + populationSettings +){ + + diagnosticAggregate <- c() + diagnosticFull <- c() + + # true due to plp + probastId <- '1.1' + diagnosticAggregate <- rbind( + diagnosticAggregate, + c(probastId, 'Pass') + ) + + # appropriate inclusions + ## 1.2.1 min prior observation + if(populationSettings$washoutPeriod != 0 ){ + var <- 'washoutPeriod' + probastId <- '1.2.1' + + result <- getDiagnostic( + probastId, + plpData, + outcomeId, + populationSettings, + var, + 0 + ) + + diagnosticAggregate <- rbind( + diagnosticAggregate, + result$diagnosticAggregate + ) + + diagnosticFull <- rbind( + diagnosticFull, + result$diagnosticFull + ) + + } + + ## 1.2.2 min time-at-risk + if(populationSettings$requireTimeAtRisk & populationSettings$minTimeAtRisk > 0){ + probastId <- '1.2.2' + var <- 'minTimeAtRisk' + + result <- getDiagnostic( + probastId, + plpData, + outcomeId, + populationSettings, + var, + 0 + ) + + diagnosticAggregate <- rbind( + diagnosticAggregate, + result$diagnosticAggregate + ) + + diagnosticFull <- rbind( + diagnosticFull, + result$diagnosticFull + ) + + } + + + ## 1.2.3 first exposure only + if(populationSettings$firstExposureOnly){ + probastId <- '1.2.3' + var <- 'firstExposureOnly' + default <- F + + result <- getDiagnostic( + probastId, + plpData, + outcomeId, + 
populationSettings, + var, + default + ) + + diagnosticAggregate <- rbind( + diagnosticAggregate, + result$diagnosticAggregate + ) + + diagnosticFull <- rbind( + diagnosticFull, + result$diagnosticFull + ) + + } + + ## 1.2.4 prior observation + if(populationSettings$removeSubjectsWithPriorOutcome & populationSettings$priorOutcomeLookback > 0){ + probastId <- '1.2.4' + var <- 'priorOutcomeLookback' + default <- 0 + + result <- getDiagnostic( + probastId, + plpData, + outcomeId, + populationSettings, + var, + default + ) + + diagnosticAggregate <- rbind( + diagnosticAggregate, + result$diagnosticAggregate + ) + + diagnosticFull <- rbind( + diagnosticFull, + result$diagnosticFull + ) + + } + + if(!is.null(dim(diagnosticAggregate))){ + diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>% + dplyr::mutate_if(is.factor, as.character) + colnames(diagnosticAggregate) <- c('probastId','resultValue') + } + + return( + list + ( + diagnosticParticipantsFull = diagnosticFull, + diagnosticParticipantsAggregate = diagnosticAggregate + ) + ) +} + + +getMaxEndDaysFromCovariates <- function(covariateSettings){ + + if(inherits(covariateSettings, 'covariateSettings')){ + covariateSettings <- list(covariateSettings) + } + + vals <- unlist(lapply(covariateSettings, function(x){x$endDays})) + + if(length(vals) == 0){ + return(0) + } else{ + return(max(vals)) + } +} + + + +probastPredictors <- function( + plpData, + outcomeId, + populationSettings +){ + + diagnosticAggregate <- c() + diagnosticFull <- c() + + # true due to plp + probastId <- '2.1' + diagnosticAggregate <- rbind( + diagnosticAggregate, + c(probastId, 'Pass') + ) + + # 2.2.1 + # cov end date < tar_start + # covariate + outcome correlation; km of outcome (close to index or not)? 
+ probastId <- '2.2' + if(populationSettings$startAnchor == 'cohort start'){ + if(populationSettings$riskWindowStart > getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings)){ + diagnosticAggregate <- rbind( + diagnosticAggregate, + c(probastId, 'Pass') + ) + } else{ + diagnosticAggregate <- rbind( + diagnosticAggregate, + c(probastId, 'Fail') + ) + } + } else{ + diagnosticAggregate <- rbind( + diagnosticAggregate, + c(probastId, 'Unknown') + ) + } + + # KM of outcome + populationSettingsFull <- populationSettings + populationSettingsFull$riskWindowEnd <- 10*365 + + population <- PatientLevelPrediction::createStudyPopulation( + plpData = plpData, + outcomeId = outcomeId, + populationSettings = populationSettings + ) + + populationFull <- PatientLevelPrediction::createStudyPopulation( + plpData = plpData, + outcomeId = outcomeId, + populationSettings = populationSettingsFull + ) + + #dayOfEvent, outcomeAtTime, observedAtStartOfDay + kmObservation <- population %>% + dplyr::group_by(.data$daysToEvent) %>% + dplyr::summarise( + outcomeAtTime = sum(!is.na(.data$daysToEvent)) + ) + + kmObservation$observedAtStartOfDay <- unlist( + lapply( + kmObservation$daysToEvent, + function(x){ + population %>% + dplyr::filter(.data$survivalTime >= x) %>% + dplyr::tally() %>% + dplyr::select("n") + } + ) + ) + + kmObservation$probastId <- probastId + kmObservation$inputType <- 'populationSettings' + + kmObservationFull <- populationFull %>% + dplyr::group_by(.data$daysToEvent) %>% + dplyr::summarise( + outcomeAtTime = sum(!is.na(.data$daysToEvent)) + ) + + kmObservationFull$observedAtStartOfDay <- unlist( + lapply( + kmObservationFull$daysToEvent, + function(x){ + populationFull %>% + dplyr::filter(.data$survivalTime >= x) %>% + dplyr::tally() %>% + dplyr::select("n") + } + ) + ) + + kmObservationFull$probastId <- probastId + kmObservationFull$inputType <- '10-year' + + diagnosticFull <- rbind(kmObservation, kmObservationFull) + + + # 2.3.1 + # cov end_date <=0 + probastId 
<- '2.3' + if(getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings) <= 0){ + + if(getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings) < 0){ + diagnosticAggregate <- rbind( + diagnosticAggregate, + c(probastId, 'Pass') + ) + } + else{ + diagnosticAggregate <- rbind( + diagnosticAggregate, + c(probastId, 'Unknown') + ) + } + } else{ + diagnosticAggregate <- rbind( + diagnosticAggregate, + c(probastId, 'Fail') + ) + } + + if(!is.null(dim(diagnosticAggregate))){ + diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>% + dplyr::mutate_if(is.factor, as.character) + colnames(diagnosticAggregate) <- c('probastId','resultValue') + } + + return( + list + ( + diagnosticPredictorsFull = diagnosticFull, + diagnosticPredictorsAggregate = diagnosticAggregate + ) + ) + +} + + + +probastOutcome <- function( + plpData, + outcomeId, + populationSettings +){ + + diagnosticAggregate <- c() + diagnosticFull <- c() + + # true due to plp + probastId <- '3.4' + diagnosticAggregate <- rbind( + diagnosticAggregate, + c(probastId, 'Pass') + ) + + # 3.5 - check the outcome definition doesn't use things before index? 
+ + # 3.6 - check tar after covariate end_days + probastId <- '3.6' + if(populationSettings$startAnchor == 'cohort start'){ + if(populationSettings$riskWindowStart > getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings)){ + diagnosticAggregate <- rbind( + diagnosticAggregate, + c(probastId, 'Pass') + ) + } else{ + diagnosticAggregate <- rbind( + diagnosticAggregate, + c(probastId, 'Fail') + ) + } + } else{ + diagnosticAggregate <- rbind( + diagnosticAggregate, + c(probastId, 'Unknown') + ) + } + + # 3.1.1 - check the outcome rate per gender/age/index year + probastId <- '3.1.1' + + # all cohort vs pop + pop <- PatientLevelPrediction::createStudyPopulation( + plpData = plpData, + outcomeId = outcomeId, + populationSettings = populationSettings + ) + + popSum <- getOutcomeSummary( + type = 'population', + population = pop + ) + + cohort <- PatientLevelPrediction::createStudyPopulation( + plpData = plpData, + outcomeId = outcomeId, + populationSettings = PatientLevelPrediction::createStudyPopulationSettings( + includeAllOutcomes = F, + firstExposureOnly = F, + washoutPeriod = 0, + removeSubjectsWithPriorOutcome = F, + priorOutcomeLookback = 0, + requireTimeAtRisk = F, + minTimeAtRisk = 0, + riskWindowStart = populationSettings$riskWindowStart, + startAnchor = populationSettings$startAnchor, + riskWindowEnd = populationSettings$riskWindowEnd, + endAnchor = populationSettings$endAnchor + ) + ) + + cohortSum <- getOutcomeSummary( + type = 'cohort', + population = cohort + ) + + + + #dayOfEvent, outcomeAtTime, observedAtStartOfDay + diagnosticFull <- rbind( + do.call(rbind, popSum), + do.call(rbind, cohortSum) + ) + + if(!is.null(dim(diagnosticAggregate))){ + diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>% + dplyr::mutate_if(is.factor, as.character) + colnames(diagnosticAggregate) <- c('probastId','resultValue') + } + + return( + list + ( + diagnosticOutcomeFull = diagnosticFull, + diagnosticOutcomeAggregate = diagnosticAggregate + ) + ) + +} + 
+getOutcomeSummary <- function( + type = 'population', + population +){ + + res <- list() + length(res) <- 4 + probastId <- '3' + + res[[1]] <- population %>% + dplyr::group_by(.data$ageYear) %>% + dplyr::summarise(outcomePercent = sum(.data$outcomeCount>0)/length(.data$outcomeCount)) %>% + dplyr::mutate( + probastId = probastId, + aggregation = 'age', + inputType = type + ) %>% + dplyr::rename(xvalue = "ageYear") + + res[[2]] <- population %>% + dplyr::group_by(.data$gender) %>% + dplyr::summarise(outcomePercent = sum(.data$outcomeCount>0)/length(.data$outcomeCount)) %>% + dplyr::mutate( + probastId = probastId, + aggregation = 'gender', + inputType = type + )%>% + dplyr::rename(xvalue = "gender") + + res[[3]] <- population %>% + dplyr::mutate( + year = substring(.data$cohortStartDate,1,4) + ) %>% + dplyr::group_by(.data$year) %>% + dplyr::summarise(outcomePercent = sum(.data$outcomeCount>0)/length(.data$outcomeCount)) %>% + dplyr::mutate( + probastId = probastId, + aggregation = 'year', + inputType = type + ) %>% + dplyr::rename(xvalue = "year") + + res[[4]] <- population %>% + dplyr::mutate( + year = substring(.data$cohortStartDate,6,7) + ) %>% + dplyr::group_by(.data$year) %>% + dplyr::summarise(outcomePercent = sum(.data$outcomeCount>0)/length(.data$outcomeCount)) %>% + dplyr::mutate( + probastId = probastId, + aggregation = 'month', + inputType = type + ) %>% + dplyr::rename(xvalue = "year") + + return(res) +} + +cos_sim <- function(a,b) +{ + return( sum(a*b)/sqrt(sum(a^2)*sum(b^2)) ) +} + +getDiagnostic <- function( + probastId, + plpData, + outcomeId, + populationSettings, + var, + defaultValue = 0 +){ + ParallelLogger::logInfo(paste0('Diagnosing impact of ',var,' in populationSettings')) + + populationSettingsCheck <- populationSettings + populationSettingsCheck[var] <- defaultValue + + pop <- PatientLevelPrediction::createStudyPopulation( + plpData = plpData, + outcomeId = outcomeId, + populationSettings = populationSettings + ) + + popCheck <- 
PatientLevelPrediction::createStudyPopulation( + plpData = plpData, + outcomeId = outcomeId, + populationSettings = populationSettingsCheck + ) + + #compare the populations: + diag <- rbind( + data.frame( + probastId = probastId, + design = paste0(var,': ', defaultValue), + metric = c( + 'N', 'outcomePercent', 'minAge', + 'meanAge', 'medianAge', 'maxAge', + 'malePercent' + ), + value = c( + nrow(popCheck), sum(popCheck$outcomeCount>0)/nrow(popCheck)*100, min(popCheck$ageYear), + mean(popCheck$ageYear), stats::median(popCheck$ageYear), max(popCheck$ageYear), + sum(popCheck$gender == 8507)/nrow(popCheck)*100 + ) + ), + data.frame( + probastId = probastId, + design = paste0(var,': ',populationSettings[var]), + metric = c( + 'N', 'outcomePercent', 'minAge', + 'meanAge', 'medianAge', 'maxAge', + 'malePercent' + ), + value = c( + nrow(pop), sum(pop$outcomeCount>0)/nrow(pop)*100, min(pop$ageYear), + mean(pop$ageYear), stats::median(pop$ageYear), max(pop$ageYear), + sum(pop$gender == 8507)/nrow(pop)*100 + ) + ) + ) + + diagSim <- cos_sim( + diag %>% + dplyr::filter(.data$design == unique(diag$design)[1]) %>% + dplyr::filter(.data$metric != 'N') %>% + dplyr::arrange(.data$metric) %>% + dplyr::select("value") + , + diag %>% + dplyr::filter(.data$design == unique(diag$design)[2]) %>% + dplyr::filter(.data$metric != 'N') %>% + dplyr::arrange(.data$metric) %>% + dplyr::select("value") + ) + + + return( + list( + diagnosticAggregate = c( + probastId, + diagSim + ), + diagnosticFull = diag + ) + ) +} + diff --git a/R/Diagnostics.R b/R/Diagnostics.R deleted file mode 100644 index 5be89f0f5..000000000 --- a/R/Diagnostics.R +++ /dev/null @@ -1,501 +0,0 @@ -# @file Diagnostics.R -# -# Copyright 2021 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' diagnostic - Investigates the prediction problem settings - use before training a model -#' -#' @description -#' This function runs a set of prediction diagnoses to help pick a suitable T, O, TAR and determine -#' whether the prediction problem is worth executing. -#' -#' @details -#' Users can define set of Ts, Os, databases and population settings. A list of data.frames containing details such as -#' follow-up time distribution, time-to-event information, characteriszation details, time from last prior event, -#' observation time distribution. -#' -#' @param plpData The data object to do the diagnostic on - if NULL you need to specify the connection settings below -#' @param cdmDatabaseName The name of the database being diagnosed -#' @param cohortName Name of the target cohort -#' @param outcomeNames Vector of outcome names -#' @param databaseDetails (only used is plpData is NULL) The database details created using \code{createDatabaseDetails} -#' @param restrictPlpDataSettings (only used is plpData is NULL) The restrictPlpDataSettings created using \code{createRestrictPlpDataSettings} -#' @param populationSettings The population setting details created using \code{createPopulationSettings} -#' @param outputFolder Location to save results for shiny app -#' @param minCellCount The minimum count that will be displayed -#' -#' @return -#' An object containing the model or location where the model is save, the data selection settings, the preprocessing -#' and training settings as well as various performance measures obtained by the model. 
-#' -#' \item{distribution}{list for each O of a data.frame containing: i) Time to observation end distribution, ii) Time from observation start distribution, iii) Time to event distribution and iv) Time from last prior event to index distribution (only for patients in T who have O before index) } -#' \item{incident}{list for each O of incidence of O in T during TAR} -#' \item{characterization}{list for each O of Characterization of T, TnO, Tn~O} -#' -#' -#' @export -#' @examples -#' \dontrun{ -#' #******** EXAMPLE 1 ********* -#' } -diagnostic <- function( - plpData = NULL, - cdmDatabaseName = 'none', - cohortName, - outcomeNames, - databaseDetails, - restrictPlpDataSettings, - populationSettings, - outputFolder = NULL, - minCellCount = 5 -){ - - if(is.null(plpData)){ - checkIsClass(databaseDetails, 'databaseDetails') - cdmDatabaseName <- attr(databaseDetails, 'cdmDatabaseName') - checkIsClass(restrictPlpDataSettings, 'restrictPlpDataSettings') - } - - if(class(populationSettings) != 'list'){ - populationSettings <- list(populationSettings) - } - lapply(populationSettings, function(x) checkIsClass(x, 'populationSettings')) - - if(!is.null(outputFolder)){ - if(!dir.exists(file.path(outputFolder))){ - dir.create(file.path(outputFolder), recursive = T) - } - } - - if(!is.null(plpData)){ - cohortId <- unique(plpData$cohorts$cohortId) - outcomeIds <- unique(plpData$outcomes$outcomeId) - } else{ - cohortId <- databaseDetails$cohortId - outcomeIds <- databaseDetails$outcomeIds - } - - #create cohort names csv: - if(file.exists(file.path(outputFolder,'namesdetails.csv'))){ - cohortNames <- utils::read.csv(file.path(outputFolder,'namesdetails.csv')) - - newNames <- data.frame(ids = c(cohortId,outcomeIds), - names = c(cohortName,outcomeNames)) - - newNames<- newNames[!newNames$ids%in%cohortNames$ids,] - if(length(newNames$ids)>0){ - cohortNames <- rbind(cohortNames, newNames) - } - - } else { - cohortNames <- data.frame(ids = c(cohortId,outcomeIds), - names = 
c(cohortName,outcomeNames)) - } - ParallelLogger::logInfo('Saving cohort names to csv') - utils::write.csv(cohortNames, file.path(outputFolder,'namesdetails.csv'), row.names = F) - - #create settings: - if(file.exists(file.path(outputFolder,'settings.csv'))){ - settings <- utils::read.csv(file.path(outputFolder,'settings.csv')) - } else{ - settings <- c() - } - maxAnalysis <- ifelse(is.null(settings$analysisId), 0, max(settings$analysisId)) - for(i in 1:length(populationSettings)){ - for( j in 1:length(outcomeIds)){ - maxAnalysis <- maxAnalysis + 1 - settingsTemp <- data.frame(analysisId = maxAnalysis, - cdmDatabaseName = cdmDatabaseName, - cohortId = cohortId, - outcomeId = outcomeIds[j], - riskWindowStart = populationSettings[[i]]$riskWindowStart, - startAnchor = populationSettings[[i]]$startAnchor, - riskWindowEnd = populationSettings[[i]]$riskWindowEnd, - endAnchor = populationSettings[[i]]$endAnchor - ) - settings <- unique(rbind(settings, settingsTemp)) - } - } - - ParallelLogger::logInfo('Saving settings to csv') - utils::write.csv(settings, file.path(outputFolder,'settings.csv'), row.names = F) - - - if(is.null(plpData)){ - # get outcome and cohort data - dont need covariates - - ParallelLogger::logInfo('Extracting data') - data <- do.call( - getPlpData, - list( - databaseDetails = databaseDetails, - covariateSettings = FeatureExtraction::createDefaultCovariateSettings(), - restrictPlpDataSettings = restrictPlpDataSettings - ) - ) - } else { - data <- plpData - } - - outcomeIds <- unique(data$outcomes$outcomeId) - - ParallelLogger::logInfo('Calculating distributions') - distribution <- getDistribution(cohort = data$cohorts, - outcomes = data$outcomes, - outputFolder = outputFolder, - databaseName = cdmDatabaseName) - - - # get survival data: - ParallelLogger::logInfo('Calculating survival data') - if(file.exists(file.path(outputFolder, 'survival.csv'))){ - surv <- utils::read.csv(file.path(outputFolder, 'survival.csv')) - } else { - surv <- c() - } - 
survTemp <- lapply(outcomeIds, function(oi) getSurvival(plpData = data, - outcomeId = oi, - cohortId = cohortId, - cdmDatabaseName = cdmDatabaseName )) - surv <- unique(rbind(surv, do.call('rbind', survTemp))) - if(!is.null(outputFolder)){ - utils::write.csv(surv, file.path(outputFolder, 'survival.csv'), row.names = F) - } - - # do characterisation - needs TAR - ParallelLogger::logInfo('Calculating proportion and characterizations') - - if(file.exists(file.path(outputFolder, 'proportion.csv'))){ - proportion <- utils::read.csv(file.path(outputFolder, 'proportion.csv')) - } else { - proportion <- c() - } - - if(file.exists(file.path(outputFolder, 'characterization.csv'))){ - characterization <- utils::read.csv(file.path(outputFolder, 'characterization.csv')) - } else { - characterization <- c() - } - for(i in 1:length(outcomeIds)){ - oi <- outcomeIds[i] - for(j in 1:length(populationSettings)){ - - population <- createStudyPopulation( - plpData = data, - outcomeId = oi, - populationSettings = populationSettings[[j]] - ) - - analysisId <- getAnalysisId( - settings = settings, - cohortId = cohortId, - outcomeId = oi, - riskWindowStart = populationSettings[[j]]$riskWindowStart, - startAnchor = populationSettings[[j]]$startAnchor, - riskWindowEnd = populationSettings[[j]]$riskWindowEnd, - endAnchor = populationSettings[[j]]$endAnchor - ) - - proportionTemp <- getProportions( - population, - analysisId = analysisId, - cdmDatabaseName = cdmDatabaseName, - cohortId = cohortId, - outcomeId = oi, - minCellCount = minCellCount - ) - - proportion <- unique(rbind(proportion, proportionTemp)) - - characterizationTemp <- covariateSummary( - covariateData = plpData$covariateData, - cohort = population %>% dplyr::select(.data$rowId), - labels = population %>% dplyr::select(.data$rowId, .data$outcomeCount) - ) - - - characterizationTemp <- characterizationTemp[,c('covariateId', - 'covariateName', - 'CovariateCount', - 'WithOutcome_CovariateCount', - 'WithNoOutcome_CovariateCount', - 
'WithOutcome_CovariateMean', - 'WithNoOutcome_CovariateMean')] - - characterizationTemp[is.na(characterizationTemp)] <- 0 - - ind <- (characterizationTemp$CovariateCount < minCellCount) - ind2 <- (characterizationTemp$WithOutcome_CovariateCount < minCellCount) | (characterizationTemp$WithNoOutcome_CovariateCount < minCellCount) - - characterizationTemp[ind,'CovariateCount'] <- -1 - characterizationTemp[ind,'WithOutcome_CovariateCount'] <- -1 - characterizationTemp[ind,'WithNoOutcome_CovariateCount'] <- -1 - characterizationTemp[ind,'WithOutcome_CovariateMean'] <- -1 - characterizationTemp[ind,'WithNoOutcome_CovariateMean'] <- -1 - - characterizationTemp[ind2,'WithOutcome_CovariateCount'] <- -1 - characterizationTemp[ind2,'WithNoOutcome_CovariateCount'] <- -1 - characterizationTemp[ind2,'WithOutcome_CovariateMean'] <- -1 - characterizationTemp[ind2,'WithNoOutcome_CovariateMean'] <- -1 - - # add analysisId - characterizationTemp$analysisId <- analysisId - characterization <- rbind(characterization, characterizationTemp) - } - - } - - if(!is.null(outputFolder)){ - utils::write.csv(proportion, file.path(outputFolder, 'proportion.csv'), row.names = F) - utils::write.csv(characterization, file.path(outputFolder, 'characterization.csv'), row.names = F) - } - - # Add all to zip file ------------------------------------------------------------------------------- - ParallelLogger::logInfo("Adding results to zip file") - zipName <- file.path(outputFolder, paste0("Results_", cdmDatabaseName, ".zip")) - files <- list.files(outputFolder, pattern = ".*\\.csv$") - oldWd <- setwd(outputFolder) - on.exit(setwd(oldWd), add = TRUE) - DatabaseConnector::createZipFile(zipFile = zipName, files = files) - ParallelLogger::logInfo("Results are ready for sharing at: ", zipName) - - result <- list(distribution = distribution, - proportion = proportion, - characterization = characterization, - survival = surv) - - return(result) -} - - -getSurvival <- function(plpData, outcomeId, cohortId, 
cdmDatabaseName ){ - - object <- plpData$outcomes %>% - dplyr::filter(.data$outcomeId == !!outcomeId) %>% - dplyr::right_join(plpData$cohorts, by ='rowId') %>% - dplyr::group_by(.data$rowId) %>% - dplyr::summarise(daysToObsEnd = min(.data$daysToObsEnd), - daysToEvent = min(.data$daysToEvent)) - - - object$censoredTime <- apply(object[,-1], 1, function(x) min(x, na.rm = T)) - object$event <- 0 - object$event[!is.na(object$daysToEvent)] <- ifelse(object$event[!is.na(object$daysToEvent)] <= object$censoredTime[!is.na(object$daysToEvent)], 1,0) - - - result <- object %>% dplyr::group_by(.data$censoredTime) %>% - dplyr::summarise(events = sum(.data$event), - censored = length(.data$event)-sum(.data$event)) - - totalCensored <- lapply(unique(object$censoredTime), function(i) sum(result %>% dplyr::filter(.data$censoredTime <= i) %>% dplyr::select(.data$censored))) - - totalCensored <- data.frame(censoredTime = unique(object$censoredTime), - totalCensored = unlist(totalCensored)) - - totalLost <- lapply(unique(object$censoredTime), function(i) sum(result %>% dplyr::filter(.data$censoredTime <= i) %>% dplyr::mutate(lost = .data$censored + .data$events) %>% dplyr::select(.data$lost))) - totalLost <- data.frame(censoredTime = unique(object$censoredTime), - nAtRisk = nrow(plpData$cohorts) - unlist(totalLost)) - - result <- result %>% - dplyr::left_join(totalCensored, by ='censoredTime') %>% - dplyr::left_join(totalLost, by ='censoredTime') - - result$outcomeId <- outcomeId - result$cohortId <- cohortId - result$cdmDatabaseName <- cdmDatabaseName - return(result) -} - - -getDistribution <- function(cohort, - outcomes, - outputFolder = NULL, - databaseName){ - - cohortId <- unique(cohort$cohortId) - outcomesIds <- unique(outcomes$outcomeId) - - if(file.exists(file.path(outputFolder, 'distribution.csv'))){ - result <- utils::read.csv(file.path(outputFolder, 'distribution.csv')) - } else{ - result <- c() - } - - for(i in 1:length(outcomesIds)){ - oi <- outcomesIds[i] - ind <- 
outcomes$outcomeId==oi & outcomes$daysToEvent >= 0 - if(sum(ind)>0){ - afterC <- stats::aggregate(x = outcomes$daysToEvent[ind], - by = list(outcomes$rowId[ind]), - FUN = min) - colnames(afterC) <- c('rowId','daysToOutcomeAfterMin') - } else { - afterC <- data.frame(rowId = -1, daysToOutcomeAfterMin = 0) - } - - - ind <- outcomes$outcomeId==oi & outcomes$daysToEvent < 0 - if(sum(ind)>0){ - beforeC <- stats::aggregate(x = abs(outcomes$daysToEvent[ind]), - by = list(outcomes$rowId[ind]), - FUN = min) - colnames(beforeC) <- c('rowId','daysToOutcomeBeforeMin') - } else { - beforeC <- data.frame(rowId = -1, daysToOutcomeBeforeMin = 0) - } - - tempResult <- merge(cohort, afterC, by='rowId', all.x = T) - tempResult <- merge(tempResult, beforeC, by='rowId', all.x = T) - - tempResult <- processDistribution(tempResult) - - tempResult$databaseName <- databaseName - tempResult$outcomeId <- oi - tempResult$targetId <- cohortId - - result <- unique(rbind(result, tempResult)) - - } - - if(!is.null(outputFolder)){ - utils::write.csv(result, file.path(outputFolder, 'distribution.csv'), row.names = F) - } - - return(result) -} - - -processDistribution <- function(distribution){ - - distribution$year <- format(as.Date(as.character(distribution$cohortStartDate), format="%Y-%m-%d"),"%Y") - distribution <- distribution[, c('year','daysFromObsStart','daysToObsEnd','daysToOutcomeAfterMin','daysToOutcomeBeforeMin')] - results <- do.call(rbind, lapply(c('all',unique(distribution$year)), function(x) getQuantiles(distribution, x) )) - return(results) -} - -getQuantiles <- function(distribution, year= 'all'){ - - if(year != 'all'){ - distribution <- distribution[distribution$year==year,] - } - quants <- data.frame( - year = year, - daysFromObsStart = stats::quantile(distribution$daysFromObsStart, seq(0,1,0.01)), - daysToObsEnd = stats::quantile(distribution$daysToObsEnd, seq(0,1,0.01)), - daysToOutcomeAfterMin = 
stats::quantile(distribution$daysToOutcomeAfterMin[!is.na(distribution$daysToOutcomeAfterMin)], seq(0,1,0.01)), - daysToOutcomeBeforeMin = stats::quantile(distribution$daysToOutcomeBeforeMin[!is.na(distribution$daysToOutcomeBeforeMin)], seq(0,1,0.01)) - ) - heading <- data.frame( - year = year, - daysFromObsStart =length(distribution$daysFromObsStart), - daysToObsEnd = length(distribution$daysToObsEnd), - daysToOutcomeAfterMin = sum(!is.na(distribution$daysToOutcomeAfterMin)), - daysToOutcomeBeforeMin = sum(!is.na(distribution$daysToOutcomeBeforeMin)) - ) - results <- rbind(N = heading, quants) - results$type = rownames(results) - rownames(results) <- NULL - return(results) -} - - - -getAnalysisId <- function(settings, - cohortId, - outcomeId, - riskWindowStart, - startAnchor, - riskWindowEnd, - endAnchor){ - - ind <- (settings$cohortId == cohortId) & (settings$outcomeId == outcomeId) & - (settings$riskWindowStart == riskWindowStart) & (settings$riskWindowEnd == riskWindowEnd) & - (settings$startAnchor == startAnchor) & (settings$endAnchor == endAnchor) - if(sum(ind)==0){ - writeLines(paste('cohortId:',cohortId, '-outcomeId:',outcomeId, - '-riskWindowStart:', riskWindowStart, '-riskWindowEnd:', riskWindowEnd, - '-startAnchor:', startAnchor, '-endAnchor:',endAnchor)) - print(settings) - stop('No analysis id found for the settings') - } else { - return(settings$analysisId[ind][1]) - } -} - - - -getProportions <- function(population, - analysisId, - cdmDatabaseName, - cohortId, - outcomeId, - minCellCount = NULL){ - - details <- attr(population, 'metaData')$populationSettings - - TAR <- paste0(details$startAnchor, ' + ', details$riskWindowStart, ' days - ', - details$endAnchor, ' + ', details$riskWindowEnd, ' days') - - - result <- population %>% dplyr::mutate(ageGroup = paste0(floor(.data$ageYear/5)*5 ,' - ', (floor(.data$ageYear/5)+1)*5-1 ), - year = substring(.data$cohortStartDate,1,4)) %>% - dplyr::group_by(.data$year, .data$ageGroup, .data$gender) %>% - 
dplyr::summarize(N = length(.data$rowId), - O = sum(.data$outcomeCount>0) - ) %>% - dplyr::select(.data$year, .data$ageGroup, .data$gender, .data$N, .data$O) - - # add all years: - allYears <- result %>% dplyr::group_by(.data$ageGroup, .data$gender) %>% - dplyr::summarize(N = sum(.data$N), - O = sum(.data$O), - year = 'all' - ) %>% dplyr::select(.data$year, .data$ageGroup, .data$gender, .data$N, .data$O) - # add all gender: - allGender <- result %>% dplyr::group_by(.data$year, .data$ageGroup) %>% - dplyr::summarize(N = sum(.data$N), - O = sum(.data$O), - gender = -1 - ) %>% dplyr::select(.data$year, .data$ageGroup, .data$gender, .data$N, .data$O) - - # add all gender: - allAge <- result %>% dplyr::group_by(.data$year, .data$gender) %>% - dplyr::summarize(N = sum(.data$N), - O = sum(.data$O), - ageGroup = 'all' - ) %>% dplyr::select(.data$year, .data$ageGroup, .data$gender, .data$N, .data$O) - - result <- rbind(result, allYears, allGender, allAge) - - result$opercent <- result$O/result$N*100 - - # censor - if(!is.null(minCellCount)){ - result$opercent[result$O < minCellCount] <- -1 - result$N[result$N0 & predictionOfInterest$value < 1 - valProb <- tryCatch(rms::val.prob(predictionOfInterest$value[indValProb], predictionOfInterest$outcomeCount[indValProb]), - error = function(e){ParallelLogger::logInfo(e); return(c(Eavg = 0, - E90 = 0, - Emax = 0))}) + valProb <- tryCatch( + calculateEStatisticsBinary(prediction = predictionOfInterest[indValProb, ]), + error = function(e) { + ParallelLogger::logInfo(e); return( + c( + Eavg = 0, + E90 = 0, + Emax = 0 + ) + ) + } + ) result <- rbind( result, c(evalType, 'Eavg', valProb['Eavg']), @@ -85,6 +94,8 @@ getEvaluationStatistics_binary <- function(prediction, evalColumn, ...){ c(evalType, 'Emax', valProb['Emax']) ) ParallelLogger::logInfo(sprintf('%-20s%.2f', 'Eavg: ', round(valProb['Eavg'], digits = 4))) + + # Removing for now as too slow... 
@@ -213,28 +224,48 @@ getEvaluationStatistics_survival <- function(prediction, evalColumn, timepoint, c(evalType, timepoint, 'C-statistic upper 95% CI', cStatistic_u95CI) ) ParallelLogger::logInfo(paste0('C-statistic: ',cStatistic, ' (',cStatistic_l95CI ,'-', cStatistic_u95CI ,')')) - - + # add e-stat - w <- tryCatch( - { - rms::val.surv( - est.surv = 1-p, - S = S, - u = timepoint, - fun = function(pr)log(-log(pr)) + + .validateSurvival <- function(p, S, timepoint) { + estimatedSurvival <- 1 - p + notMissing <- !is.na(estimatedSurvival + S[, 1] + S[, 2]) + estimatedSurvival <- estimatedSurvival[notMissing] + S <- S[notMissing, ] + .curtail <- function(x) pmin(.9999, pmax(x, .0001)) + f <- polspline::hare( + S[, 1], + S[, 2], + log(-log((.curtail(estimatedSurvival)))), + maxdim = 5 + ) + actual <- 1 - polspline::phare(timepoint, log(-log(estimatedSurvival)), f) + + return( + list( + actual = actual, + estimatedSurvival = estimatedSurvival ) - }, - error = function(e){ParallelLogger::logError(e); return(NULL)} + ) + } + + w <- tryCatch( + { + .validateSurvival( + p = p, + S = S, + timepoint = timepoint + ) + }, + error = function(e){ParallelLogger::logError(e); return(NULL)} ) - - eStatistic <- -1 - eStatistic90 <- -1 - if(!is.null(w)){ - eStatistic<-mean(abs(w$actual - w$p)) - eStatistic90<-stats::quantile((abs(w$actual - w$p)),0.9) - + + eStatistic <- eStatistic90 <- -1 + if (!is.null(w)) { + eStatistic <- mean(abs(w$actual - w$estimatedSurvival)) + eStatistic90 <- stats::quantile(abs(w$actual - w$estimatedSurvival), probs = .9) } + result <- rbind( result, c(evalType, timepoint, 'E-statistic', eStatistic), @@ -251,6 +282,28 @@ getEvaluationStatistics_survival <- function(prediction, evalColumn, timepoint, } +calculateEStatisticsBinary <- function(prediction) { + risk <- prediction$value + outcome <- prediction$outcomeCount + notna <- ! 
is.na(risk + outcome) + risk <- risk[notna] + outcome <- outcome[notna] + smoothFit <- stats::lowess(risk, outcome, iter = 0) + smoothCalibration <- stats::approx(smoothFit, xout = risk, ties = mean)$y + distance <- abs(risk - smoothCalibration) + eavg <- mean(abs(risk - smoothCalibration)) + emax <- max(distance) + e90 <- stats::quantile(distance, probs = .9) + names(e90) <- NULL + return( + c( + Eavg = eavg, + E90 = e90, + Emax = emax + ) + ) +} + #================================== # Fucntions for the summary @@ -280,13 +333,13 @@ computeAuc <- function(prediction, } aucWithCi <- function(prediction, truth){ - auc <- pROC::auc(as.factor(truth), prediction, direction="<") + auc <- pROC::auc(as.factor(truth), prediction, direction="<", quiet=TRUE) aucci <-pROC::ci(auc) return(data.frame(auc = aucci[2], auc_lb95ci = aucci[1], auc_ub95ci = aucci[3])) } aucWithoutCi <- function(prediction, truth){ - auc <- pROC::auc(as.factor(truth), prediction, direction="<") + auc <- pROC::auc(as.factor(truth), prediction, direction="<", quiet=TRUE) return(as.double(auc)) } diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R index 84ca205ce..8e3e7a64a 100644 --- a/R/ExternalValidatePlp.R +++ b/R/ExternalValidatePlp.R @@ -17,65 +17,68 @@ # limitations under the License. -externalValidatePlp <- function( - plpModel, - plpData, - databaseName = 'database 1', - population, - settings = createValidationSettings( # add covariateSummary option? - recalibrate = 'weakRecalibration' - ) -){ - - # Apply model +externalValidatePlp <- function(plpModel, + plpData, + population, + settings = list(# add covariateSummary option? 
+ recalibrate = 'weakRecalibration')) { + # Apply model #======= prediction <- tryCatch({ - predictPlp( - plpModel = plpModel, - plpData = plpData, - population = population - )}, - error = function(e){ParallelLogger::logError(e)} - ) + predictPlp(plpModel = plpModel, + plpData = plpData, + population = population) + }, + error = function(e) { + ParallelLogger::logError(e) + }) prediction$evaluationType <- 'Validation' # Recalibrate #======= - if(!is.null(settings$recalibrate)){ - for(recalMethod in settings$recalibrate){ - prediction <- tryCatch({recalibratePlp(prediction = prediction, method = recalMethod) - }, - error = function(e){ParallelLogger::logError(e)} - ) + if (!is.null(settings$recalibrate)) { + for (recalMethod in settings$recalibrate) { + prediction <- + tryCatch({ + recalibratePlp(prediction = prediction, method = recalMethod) + }, + error = function(e) { + ParallelLogger::logError(e) + }) } } - # Evaluate + # Evaluate #======= performance <- tryCatch({ evaluatePlp(prediction = prediction, typeColumn = 'evaluationType') }, - error = function(e){ParallelLogger::logError(e)} - ) + error = function(e) { + ParallelLogger::logError(e) + }) # step 6: covariate summary labels <- tryCatch({ - population %>% dplyr::select(.data$rowId, .data$outcomeCount) + population %>% dplyr::select("rowId", "outcomeCount") }, - error = function(e){ return(NULL) } - ) + error = function(e) { + return(NULL) + }) - if(settings$runCovariateSummary){ - covariateSum <- tryCatch({ - covariateSummary( - covariateData = plpData$covariateData, - cohort = population[,colnames(population) != 'outcomeCount'], - labels = labels, - variableImportance = plpModel$covariateImportance %>% dplyr::select(.data$covariateId, .data$covariateValue) - )}, - error = function(e){ParallelLogger::logInfo(e); return(NULL) } - ) + if (settings$runCovariateSummary) { + covariateSum <- tryCatch({ + covariateSummary( + covariateData = plpData$covariateData, + cohort = population[, colnames(population) != 
'outcomeCount'], + labels = labels, + variableImportance = plpModel$covariateImportance %>% dplyr::select("covariateId", "covariateValue") + ) + }, + error = function(e) { + ParallelLogger::logInfo(e) + return(NULL) + }) } else{ covariateSum <- NULL } @@ -83,10 +86,10 @@ externalValidatePlp <- function( executionSummary <- list( ExecutionDateTime = Sys.Date(), PackageVersion = list( - rVersion= R.Version()$version.string, + rVersion = R.Version()$version.string, packageVersion = utils::packageVersion("PatientLevelPrediction") ), - PlatformDetails= list( + PlatformDetails = list( platform = R.Version()$platform, cores = Sys.getenv('NUMBER_OF_PROCESSORS'), RAM = memuse::Sys.meminfo()[1] @@ -95,16 +98,21 @@ externalValidatePlp <- function( model = list( model = 'external validation of model', - settings = plpModel$settings, + modelDesign = plpModel$modelDesign, + # was settings validationDetails = list( - analysisId = '', #TODO add from model - analysisSource = '', #TODO add from model - developmentDatabase = plpModel$trainDetails$cdmDatabaseSchema, - cdmDatabaseSchema = databaseName, + analysisId = '', + #TODO add from model + analysisSource = '', + #TODO add from model + developmentDatabase = plpModel$trainDetails$developmentDatabase, + developmentDatabaseId = plpModel$trainDetails$developmentDatabaseId, + validationDatabase = plpData$metaData$databaseDetails$cdmDatabaseSchema, + validationDatabaseId = plpData$metaData$databaseDetails$cdmDatabaseId, populationSettings = attr(population, 'metaData')$populationSettings, - plpDataSettings = attr(plpData, 'metaData')$restrictPlpDataSettings, + restrictPlpDataSettings = plpData$metaData$restrictPlpDataSettings, outcomeId = attr(population, 'metaData')$outcomeId, - cohortId = attr(plpData, 'metaData')$databaseDetails$cohortId, + targetId = plpData$metaData$databaseDetails$targetId, attrition = attr(population, 'metaData')$attrition, validationDate = Sys.Date() # is this needed? 
) @@ -132,61 +140,62 @@ externalValidatePlp <- function( #' @description #' This function extracts data using a user specified connection and cdm_schema, applied the model and then calcualtes the performance #' @details -#' Users need to input a trained model (the output of runPlp()) and new database connections. The function will return a list of length equal to the +#' Users need to input a trained model (the output of runPlp()) and new database connections. The function will return a list of length equal to the #' number of cdm_schemas input with the performance on the new data -#' +#' #' @param plpModel The model object returned by runPlp() containing the trained model #' @param validationDatabaseDetails A list of objects of class \code{databaseDetails} created using \code{createDatabaseDetails} #' @param validationRestrictPlpDataSettings A list of population restriction settings created by \code{createRestrictPlpDataSettings()} #' @param settings A settings object of class \code{validationSettings} created using \code{createValidationSettings} -#' @param logSettings An object of \code{logSettings} created using \code{createLogSettings} -#' specifying how the logging is done +#' @param logSettings An object of \code{logSettings} created using \code{createLogSettings} +#' specifying how the logging is done #' @param outputFolder The directory to save the validation results to (subfolders are created per database in validationDatabaseDetails) -#' +#' #' @return -#' A list containing the performance for each validation_schema +#' A list containing the performance for each validation_schema #' #' #' @export -externalValidateDbPlp <- function( - plpModel, - validationDatabaseDetails = createDatabaseDetails(), - validationRestrictPlpDataSettings = createRestrictPlpDataSettings(), - settings = createValidationSettings( - recalibrate = 'weakRecalibration' - ), - logSettings = createLogSettings(verbosity = 'INFO', logName = 'validatePLP'), - outputFolder = getwd() -){ - 
- # Input checks +externalValidateDbPlp <- function(plpModel, + validationDatabaseDetails = createDatabaseDetails(), + validationRestrictPlpDataSettings = createRestrictPlpDataSettings(), + settings = createValidationSettings(recalibrate = 'weakRecalibration'), + logSettings = createLogSettings(verbosity = 'INFO', logName = 'validatePLP'), + outputFolder = getwd()) { + # Input checks #======= checkIsClass(plpModel, 'plpModel') # check the class and make a list if a single database setting - if(class(validationDatabaseDetails) == 'list'){ - lapply(validationDatabaseDetails, function(x) checkIsClass(x, 'databaseDetails')) + if (inherits(validationDatabaseDetails, 'list')) { + lapply(validationDatabaseDetails, function(x) + checkIsClass(x, 'databaseDetails')) } else{ checkIsClass(validationDatabaseDetails, 'databaseDetails') validationDatabaseDetails <- list(validationDatabaseDetails) } - checkIsClass(validationRestrictPlpDataSettings, 'restrictPlpDataSettings') + checkIsClass(validationRestrictPlpDataSettings, + 'restrictPlpDataSettings') checkIsClass(settings, 'validationSettings') - + # create results list with the names of the databases to validate across result <- list() length(result) <- length(validationDatabaseDetails) - names(result) <- unlist(lapply(validationDatabaseDetails, function(x) attr(x, 'cdmDatabaseName'))) + names(result) <- + unlist(lapply(validationDatabaseDetails, function(x) + attr(x, 'cdmDatabaseName'))) - for(databaseDetails in validationDatabaseDetails){ - + for (databaseDetails in validationDatabaseDetails) { databaseName <- attr(databaseDetails, 'cdmDatabaseName') # initiate log - logSettings$saveDirectory <- file.path(outputFolder, databaseName, plpModel$trainDetails$analysisId) + logSettings$saveDirectory <- + file.path(outputFolder, + databaseName, + plpModel$trainDetails$analysisId) logSettings$logFileName <- 'validationLog' - logger <- do.call(createLog,logSettings) + logger <- do.call(createLog, logSettings) 
ParallelLogger::registerLogger(logger) ParallelLogger::logInfo(paste('Validating model on', databaseName)) @@ -194,87 +203,113 @@ externalValidateDbPlp <- function( # Step 1: get data #======= - getPlpDataSettings <- list( - databaseDetails = databaseDetails, - restrictPlpDataSettings = validationRestrictPlpDataSettings - ) - if(is.null(getPlpDataSettings$databaseDetails$cohortId)){ - ParallelLogger::logInfo("cohortId not in databaseSettings so using model's") - getPlpDataSettings$databaseDetails$cohortId <- plpModel$trainDetails$cohortId + getPlpDataSettings <- list(databaseDetails = databaseDetails, + restrictPlpDataSettings = validationRestrictPlpDataSettings) + if (is.null(getPlpDataSettings$databaseDetails$targetId)) { + ParallelLogger::logInfo("targetId not in databaseSettings so using model's") + getPlpDataSettings$databaseDetails$targetId <- + plpModel$modelDesign$targetId } - if(is.null(getPlpDataSettings$databaseDetails$outcomeIds)){ - ParallelLogger::logInfo("cohortId not in databaseSettings so using model's") - getPlpDataSettings$databaseDetails$outcomeIds <- plpModel$trainDetails$outcomeId + if (is.null(getPlpDataSettings$databaseDetails$outcomeIds)) { + ParallelLogger::logInfo("outcomeId not in databaseSettings so using model's") + getPlpDataSettings$databaseDetails$outcomeIds <- + plpModel$modelDesign$outcomeId } - if(is.null(getPlpDataSettings$restrictPlpDataSettings$firstExposureOnly)){ + if (is.null(getPlpDataSettings$restrictPlpDataSettings$firstExposureOnly)) { ParallelLogger::logInfo("firstExposureOnly not in restrictPlpDataSettings so using model's") - getPlpDataSettings$restrictPlpDataSettings$firstExposureOnly <- plpModel$settings$plpDataSettings$firstExposureOnly + getPlpDataSettings$restrictPlpDataSettings$firstExposureOnly <- + plpModel$modelDesign$restrictPlpDataSettings$firstExposureOnly } - if(is.null(getPlpDataSettings$restrictPlpDataSettings$washoutPeriod)){ + if (is.null(getPlpDataSettings$restrictPlpDataSettings$washoutPeriod)) { 
ParallelLogger::logInfo("washoutPeriod not in restrictPlpDataSettings so using model's") - getPlpDataSettings$restrictPlpDataSettings$washoutPeriod <- plpModel$settings$plpDataSettings$washoutPeriod + getPlpDataSettings$restrictPlpDataSettings$washoutPeriod <- + plpModel$modelDesign$restrictPlpDataSettings$washoutPeriod } - # we need to update this to restrict to model covariates and update custom features - getPlpDataSettings$covariateSettings <- plpModel$settings$covariateSettings + # TODO: we need to update this to restrict to model covariates and update custom features + getPlpDataSettings$covariateSettings <- + plpModel$modelDesign$covariateSettings plpData <- tryCatch({ do.call(getPlpData, getPlpDataSettings) }, - error = function(e){ParallelLogger::logError(e); return(NULL)} - ) + error = function(e) { + ParallelLogger::logError(e) + return(NULL) + }) - if(is.null(plpData)){ + if (is.null(plpData)) { closeLog(logger) } - # Step 2: create population + # Step 2: create population #======= population <- tryCatch({ do.call( - createStudyPopulation, + createStudyPopulation, list( plpData = plpData, outcomeId = getPlpDataSettings$databaseDetails$outcomeIds, - populationSettings = plpModel$settings$populationSettings + populationSettings = plpModel$modelDesign$populationSettings ) ) }, - error = function(e){ParallelLogger::logError(e); return(NULL)} - ) + error = function(e) { + ParallelLogger::logError(e) + return(NULL) + }) - if(is.null(population)){ + if (is.null(population)) { closeLog(logger) } # Step 3: Apply model to plpData and population #======= - + result[[databaseName]] <- tryCatch({ - externalValidatePlp( - plpModel, - plpData, - databaseName = databaseName, - population, - settings = settings - )}, - error = function(e){ParallelLogger::logInfo(e); return(NULL)} - ) + externalValidatePlp(plpModel, + plpData, + #databaseName = databaseName, + population, + settings = settings) + }, + error = function(e) { + ParallelLogger::logInfo(e) + return(NULL) + }) 
- if(is.null(result[[databaseName]])){ + if (is.null(result[[databaseName]])) { closeLog(logger) } else{ + if (!dir.exists(file.path( + outputFolder, + databaseName, + plpModel$trainDetails$analysisId + ))) { + dir.create( + file.path( + outputFolder, + databaseName, + plpModel$trainDetails$analysisId + ), + recursive = T + ) + } - if(!dir.exists(file.path(outputFolder, databaseName, plpModel$trainDetails$analysisId))){ - dir.create(file.path(outputFolder, databaseName, plpModel$trainDetails$analysisId), recursive = T) - } - - savePlpResult(result[[databaseName]], dirPath = file.path(outputFolder, databaseName, plpModel$trainDetails$analysisId, 'validationResult')) + savePlpResult( + result[[databaseName]], + dirPath = file.path( + outputFolder, + databaseName, + plpModel$trainDetails$analysisId, + 'validationResult' + ) + ) } } - + # Now return results return(invisible(result)) } @@ -286,29 +321,247 @@ externalValidateDbPlp <- function( #' This function creates the settings required by externalValidatePlp #' @details #' Users need to specify whether they want to sample or recalibate when performing external validation -#' -#' @param recalibrate A vector of characters specifying the recalibration method to apply +#' +#' @param recalibrate A vector of characters specifying the recalibration method to apply #' @param runCovariateSummary Whether to run the covariate summary for the validation data #' @return #' A setting object of class \code{validationSettings} containing a list of settings for externalValidatePlp #' #' @export -createValidationSettings <- function( - recalibrate = NULL, - runCovariateSummary = T -){ - - checkIsClass(recalibrate, c('character','NULL')) - if(!is.null(recalibrate)){ - if(sum(recalibrate %in% c('recalibrationInTheLarge', 'weakRecalibration'))!=length(recalibrate)){ - ParallelLogger::logError('Incorrect recalibrate options used. 
Must be recalibrationInTheLarge or weakRecalibration') +createValidationSettings <- function(recalibrate = NULL, + runCovariateSummary = T) { + checkIsClass(recalibrate, c('character', 'NULL')) + if (!is.null(recalibrate)) { + if (sum(recalibrate %in% c('recalibrationInTheLarge', 'weakRecalibration')) != + length(recalibrate)) { + ParallelLogger::logError( + 'Incorrect recalibrate options used. Must be recalibrationInTheLarge or weakRecalibration' + ) } } - - result = list( - recalibrate = recalibrate, - runCovariateSummary = runCovariateSummary - ) + + result <- list(recalibrate = recalibrate, + runCovariateSummary = runCovariateSummary) class(result) <- 'validationSettings' return(result) } + +#' createValidationDesign - Define the validation design for external validation +#' +#' @param targetId The targetId of the target cohort to validate on +#' @param outcomeId The outcomeId of the outcome cohort to validate on +#' @param populationSettings A list of population restriction settings created by \code{createPopulationSettings} +#' @param restrictPlpDataSettings A list of plpData restriction settings created by \code{createRestrictPlpDataSettings} +#' @param plpModelList A list of plpModels objects created by \code{runPlp} or a path to such objects +#' @param recalibrate A vector of characters specifying the recalibration method to apply, +#' @param runCovariateSummary whether to run the covariate summary for the validation data +#' @export +createValidationDesign <- + function(targetId, + outcomeId, + populationSettings, + restrictPlpDataSettings, + plpModelList, + recalibrate = NULL, + runCovariateSummary = TRUE) { + checkIsClass(targetId, c("numeric", "integer")) + checkIsClass(outcomeId, c("numeric", "integer")) + checkIsClass(populationSettings, c("populationSettings")) + checkIsClass(restrictPlpDataSettings, "restrictPlpDataSettings") + checkIsClass(plpModelList, "list") + lapply(plpModelList, function(x) + checkIsClass(x, c("plpModel", "character"))) + 
checkIsClass(recalibrate, c("character", "NULL")) + checkIsClass(runCovariateSummary, "logical") + + design <- list( + targetId = targetId, + outcomeId = outcomeId, + populationSettings = populationSettings, + plpModelList = plpModelList, + restrictPlpDataSettings = restrictPlpDataSettings, + recalibrate = recalibrate, + runCovariateSummary = runCovariateSummary + ) + class(design) <- "validationDesign" + return(design) + } + + +#' externalValidatePlp - Validate model performance on new data +#' +#' @param validationDesignList A list of objects created with \code{createValidationDesign} +#' @param databaseDetails A list of objects of class +#' \code{databaseDetails} created using \code{createDatabaseDetails} +#' @param logSettings An object of \code{logSettings} created +#' using \code{createLogSettings} +#' @param outputFolder The directory to save the validation results to +#' (subfolders are created per database in validationDatabaseDetails) +#' @export +validateExternal <- function(validationDesignList, + databaseDetails, + logSettings, + outputFolder) { + # Input checks + #======= + if (inherits(validationDesignList, 'list')) { + lapply(validationDesignList, function(x) + checkIsClass(x, 'validationDesign')) + } else { + checkIsClass(validationDesignList, 'validationDesign') + validationDesignList <- list(validationDesignList) + } + + # check the class and make a list if a single database setting + if (inherits(databaseDetails, 'list')) { + lapply(databaseDetails, function(x) + checkIsClass(x, 'databaseDetails')) + } else { + checkIsClass(databaseDetails, 'databaseDetails') + databaseDetails <- list(databaseDetails) + } + + # create results list with the names of the databases to validate across + result <- list() + length(result) <- length(databaseDetails) + names(result) <- + unlist(lapply(databaseDetails, function(x) + attr(x, 'cdmDatabaseName'))) + + # Need to keep track of incremental analysisId's for each database + databaseNames <- 
unlist(lapply(databaseDetails, function(x) + x$cdmDatabaseName)) + analysisInfo <- list() + for (name in databaseNames) { + analysisInfo[name] <- 1 + } + for (design in validationDesignList) { + for (database in databaseDetails) { + databaseName <- database$cdmDatabaseName + # initiate log + logSettings$saveDirectory <- file.path(outputFolder, + database$cdmDatabaseName) + logSettings$logFileName <- 'validationLog' + logger <- do.call(createLog, logSettings) + ParallelLogger::registerLogger(logger) + on.exit(logger$close()) + + ParallelLogger::logInfo(paste('Validating model on', database$cdmDatabaseName)) + + database$targetId <- design$targetId + + database$outcomeIds <- design$outcomeId + + allCovSettings <- + lapply(design$plpModelList, function(plpModel) + plpModel$modelDesign$covariateSettings) + # compare all to first covSettings, if not the same stop + if (!Reduce(function(x, y) + x && + identical(y, allCovSettings[[1]]), + allCovSettings[-1], + init = TRUE)) { + stop("covariateSettings are not the same across models which is not supported yet") + } + plpData <- tryCatch({ + do.call( + getPlpData, + list( + databaseDetails = database, + restrictPlpDataSettings = design$restrictPlpDataSettings, + covariateSettings = design$plpModelList[[1]]$modelDesign$covariateSettings + ) + ) + }, + error = function(e) { + ParallelLogger::logError(e) + return(NULL) + }) + plpDataName <- + paste0("targetId_", design$targetId, "_L", "1") # Is the 1 for how many targetIds in file ? 
+ plpDataLocation <- + file.path(outputFolder, databaseName, plpDataName) + if (!dir.exists(file.path(outputFolder, databaseName))) { + dir.create(file.path(outputFolder, databaseName), recursive = TRUE) + } + savePlpData(plpData, file = plpDataLocation) + + # create study population + population <- tryCatch({ + do.call( + createStudyPopulation, + list( + plpData = plpData, + outcomeId = design$outcomeId, + populationSettings = design$populationSettings + ) + ) + }, + error = function(e) { + ParallelLogger::logError(e) + return(NULL) + }) + + results <- lapply(design$plpModelList, function(model) { + analysisName <- paste0("Analysis_", analysisInfo[databaseName]) + validateModel( + plpModel = model, + plpData = plpData, + population = population, + recalibrate = design$recalibrate, + runCovariateSummary = design$runCovariateSummary, + outputFolder = outputFolder, + databaseName = databaseName, + analysisName = analysisName + ) + analysisInfo[[databaseName]] <<- analysisInfo[[databaseName]] + 1 + }) + } + } + for (database in databaseDetails) { + databaseName <- database$cdmDatabaseName + sqliteLocation <- + file.path(outputFolder, 'sqlite') + tryCatch({ + insertResultsToSqlite( + resultLocation = file.path(outputFolder, databaseName), + cohortDefinitions = NULL, + databaseList = createDatabaseList( + cdmDatabaseSchemas = database$cdmDatabaseSchema, + cdmDatabaseNames = database$cdmDatabaseName, + databaseRefIds = database$cdmDatabaseId + ), + sqliteLocation = sqliteLocation + ) + }, + error = function(e) ParallelLogger::logInfo(e) + ) + } + return(invisible(results)) +} + +validateModel <- + function(plpModel, + plpData, + population, + recalibrate, + runCovariateSummary, + outputFolder, + databaseName, + analysisName) { + result <- externalValidatePlp( + plpModel = plpModel, + plpData = plpData, + population = population, + settings = list(recalibrate = recalibrate, + runCovariateSummary = runCovariateSummary) + ) + savePlpResult(result, + dirPath = file.path( + 
outputFolder, + databaseName, + analysisName, + 'validationResult' + )) + return(result) +} diff --git a/R/ExtractData.R b/R/ExtractData.R index e7a0d8d8c..e754eb802 100644 --- a/R/ExtractData.R +++ b/R/ExtractData.R @@ -85,7 +85,8 @@ createRestrictPlpDataSettings <- function( #' instance. Requires read permissions to this database. On SQL #' Server, this should specifiy both the database and the schema, #' so for example 'cdm_instance.dbo'. -#' @param cdmDatabaseName A string with a shareable name of the database (this will be shown to OHDSI researchers if the results get transported) +#' @param cdmDatabaseName A string with the name of the database - this is used in the shiny app and when externally validating models to name the result list and to specify the folder name when saving validation results (defaults to cdmDatabaseSchema if not specified) +#' @param cdmDatabaseId A string with a unique identifier for the database and version - this is stored in the plp object for future reference and used by the shiny app (defaults to cdmDatabaseSchema if not specified) #' @param tempEmulationSchema For dmbs like Oracle only: the name of the database schema where you #' want all temporary tables to be managed. Requires #' create/insert permissions to this database. @@ -101,10 +102,11 @@ createRestrictPlpDataSettings <- function( #' @param outcomeTable The tablename that contains the outcome cohorts. Expectation is #' outcomeTable has format of COHORT table: COHORT_DEFINITION_ID, #' SUBJECT_ID, COHORT_START_DATE, COHORT_END_DATE. -#' @param cohortId An integer specifying the cohort id for the target cohort +#' @param targetId An integer specifying the cohort id for the target cohort #' @param outcomeIds A single integer or vector of integers specifying the cohort ids for the outcome cohorts -#' @param cdmVersion Define the OMOP CDM version used: currently support "4" and -#' "5". +#' @param cdmVersion Define the OMOP CDM version used: currently support "4" and "5". 
+#' @param cohortId (depreciated: use targetId) old input for the target cohort id +#' #' @return #' A list with the the database specific settings (this is used by the runMultiplePlp function and the skeleton packages) #' @@ -113,25 +115,55 @@ createDatabaseDetails <- function( connectionDetails, cdmDatabaseSchema, cdmDatabaseName, + cdmDatabaseId, # added for strategus tempEmulationSchema = cdmDatabaseSchema, cohortDatabaseSchema = cdmDatabaseSchema, cohortTable = "cohort", outcomeDatabaseSchema = cdmDatabaseSchema, outcomeTable = "cohort", - cohortId = NULL, + targetId = NULL, outcomeIds = NULL, - cdmVersion = 5 + cdmVersion = 5, + cohortId = NULL ){ + + if(is.null(targetId)){ + if(!is.null(cohortId)){ + ParallelLogger::logWarn('cohortId has been depreciated. Please use targetId.') + targetId <- cohortId + } + } + + if(missing(cdmDatabaseName)){ + ParallelLogger::logInfo('No cdm database name entered so using cdmDatabaseSchema') + cdmDatabaseName <- removeInvalidString(cdmDatabaseSchema) + } + if(missing(cdmDatabaseId)){ + ParallelLogger::logInfo('No cdm database id entered so using cdmDatabaseSchema - if cdmDatabaseSchema is the same for multiple different databases, please use cdmDatabaseId to specify a unique identifier for the database and version') + cdmDatabaseId <- removeInvalidString(cdmDatabaseSchema) + } + + if(length(cdmDatabaseId) == 0 ){ + stop('cdmDatabaseId must be a string with length > 0') + } + + # check to make sure cdmDatabaseId is not an int as that will cause issues + if(!inherits(cdmDatabaseId, 'character')){ + ParallelLogger::logInfo('cdmDatabaseId is not a string - this will cause issues when inserting into a result database so casting it') + cdmDatabaseId <- as.character(cdmDatabaseId) + } + result <- list( connectionDetails = connectionDetails, cdmDatabaseSchema = cdmDatabaseSchema, cdmDatabaseName = cdmDatabaseName, + cdmDatabaseId = cdmDatabaseId, tempEmulationSchema = tempEmulationSchema, cohortDatabaseSchema = cohortDatabaseSchema, 
cohortTable = cohortTable, outcomeDatabaseSchema = outcomeDatabaseSchema, outcomeTable = outcomeTable, - cohortId = cohortId , + targetId = targetId, outcomeIds = outcomeIds, cdmVersion = cdmVersion ) @@ -188,10 +220,10 @@ getPlpData <- function( checkIsClass(databaseDetails, 'databaseDetails') checkIsClass(restrictPlpDataSettings, 'restrictPlpDataSettings') - if(is.null(databaseDetails$cohortId)) - stop('User must input cohortId') - if(length(databaseDetails$cohortId)>1) - stop('Currently only supports one cohortId at a time') + if(is.null(databaseDetails$targetId)) + stop('User must input targetId') + if(length(databaseDetails$targetId)>1) + stop('Currently only supports one targetId at a time') if(is.null(databaseDetails$outcomeIds)) stop('User must input outcomeIds') #ToDo: add other checks the inputs are valid @@ -202,16 +234,22 @@ getPlpData <- function( ParallelLogger::logTrace("\nConstructing the at risk cohort") if(!is.null(restrictPlpDataSettings$sampleSize)) writeLines(paste("\n Sampling ",restrictPlpDataSettings$sampleSize, " people")) - renderedSql <- SqlRender::loadRenderTranslateSql( - "CreateCohorts.sql", - packageName = "PatientLevelPrediction", - dbms = dbms, - tempEmulationSchema = databaseDetails$tempEmulationSchema, + + pathToSql <- system.file( + paste("sql/", "sql_server", + sep = ""), + 'CreateCohorts.sql', + package = "PatientLevelPrediction" + ) + + renderedSql <- readChar(pathToSql, file.info(pathToSql)$size) + renderedSql <- SqlRender::render( + sql = renderedSql, cdm_database_schema = databaseDetails$cdmDatabaseSchema, cohort_database_schema = databaseDetails$cohortDatabaseSchema, cohort_table = databaseDetails$cohortTable, cdm_version = databaseDetails$cdmVersion, - cohort_id = databaseDetails$cohortId, + target_id = databaseDetails$targetId, study_start_date = restrictPlpDataSettings$studyStartDate, study_end_date = restrictPlpDataSettings$studyEndDate, first_only = restrictPlpDataSettings$firstExposureOnly, @@ -219,36 +257,57 @@ 
getPlpData <- function( use_sample = !is.null(restrictPlpDataSettings$sampleSize), sample_number = restrictPlpDataSettings$sampleSize ) + renderedSql <- SqlRender::translate( + sql = renderedSql, + targetDialect = dbms, + tempEmulationSchema = databaseDetails$tempEmulationSchema + ) + DatabaseConnector::executeSql(connection, renderedSql) ParallelLogger::logTrace("Fetching cohorts from server") start <- Sys.time() - cohortSql <- SqlRender::loadRenderTranslateSql( - "GetCohorts.sql", - packageName = "PatientLevelPrediction", - dbms = dbms, - tempEmulationSchema = databaseDetails$tempEmulationSchema, + + pathToSql <- system.file( + paste("sql/", "sql_server", + sep = ""), + "GetCohorts.sql", + package = "PatientLevelPrediction" + ) + + cohortSql <- readChar(pathToSql, file.info(pathToSql)$size) + + cohortSql <- SqlRender::render( + sql = cohortSql, cdm_version = databaseDetails$cdmVersion ) + + cohortSql <- SqlRender::translate( + sql = cohortSql, + targetDialect = dbms, + tempEmulationSchema = databaseDetails$tempEmulationSchema + ) cohorts <- DatabaseConnector::querySql(connection, cohortSql) colnames(cohorts) <- SqlRender::snakeCaseToCamelCase(colnames(cohorts)) - metaData.cohort <- list(cohortId = databaseDetails$cohortId) + metaData.cohort <- list(targetId = databaseDetails$targetId) - if(nrow(cohorts)==0) + if(nrow(cohorts)==0){ stop('Target population is empty') + } delta <- Sys.time() - start ParallelLogger::logTrace(paste("Loading cohorts took", signif(delta, 3), attr(delta, "units"))) - - #covariateSettings$useCovariateCohortIdIs1 <- TRUE - covariateData <- FeatureExtraction::getDbCovariateData(connection = connection, - oracleTempSchema = databaseDetails$tempEmulationSchema, - cdmDatabaseSchema = databaseDetails$cdmDatabaseSchema, - cdmVersion = databaseDetails$cdmVersion, - cohortTable = "#cohort_person", - cohortTableIsTemp = TRUE, - rowIdField = "row_id", - covariateSettings = covariateSettings) + + covariateData <- 
FeatureExtraction::getDbCovariateData( + connection = connection, + oracleTempSchema = databaseDetails$tempEmulationSchema, + cdmDatabaseSchema = databaseDetails$cdmDatabaseSchema, + cdmVersion = databaseDetails$cdmVersion, + cohortTable = "#cohort_person", + cohortTableIsTemp = TRUE, + rowIdField = "row_id", + covariateSettings = covariateSettings + ) # add indexes for tidyCov + covariate summary Andromeda::createIndex(covariateData$covariates, c('rowId'), indexName = 'covariates_rowId') @@ -260,23 +319,38 @@ getPlpData <- function( if(max(databaseDetails$outcomeIds)!=-999){ ParallelLogger::logTrace("Fetching outcomes from server") start <- Sys.time() - outcomeSql <- SqlRender::loadRenderTranslateSql( - "GetOutcomes.sql", - packageName = "PatientLevelPrediction", - dbms = dbms, - tempEmulationSchema = databaseDetails$tempEmulationSchema, + + pathToSql <- system.file( + paste("sql/", "sql_server", + sep = ""), + "GetOutcomes.sql", + package = "PatientLevelPrediction" + ) + + outcomeSql <- readChar(pathToSql, file.info(pathToSql)$size) + + outcomeSql <- SqlRender::render( + sql = outcomeSql, cdm_database_schema = databaseDetails$cdmDatabaseSchema, outcome_database_schema = databaseDetails$outcomeDatabaseSchema, outcome_table = databaseDetails$outcomeTable, outcome_ids = databaseDetails$outcomeIds, cdm_version = databaseDetails$cdmVersion ) + + outcomeSql <- SqlRender::translate( + sql = outcomeSql, + targetDialect = dbms, + tempEmulationSchema = databaseDetails$tempEmulationSchema + ) + outcomes <- DatabaseConnector::querySql(connection, outcomeSql) colnames(outcomes) <- SqlRender::snakeCaseToCamelCase(colnames(outcomes)) metaData.outcome <- data.frame(outcomeIds = databaseDetails$outcomeIds) attr(outcomes, "metaData") <- metaData.outcome - if(nrow(outcomes)==0) + if(nrow(outcomes)==0){ stop('No Outcomes') + } metaData.cohort$attrition <- getCounts2(cohorts,outcomes, "Original cohorts") attr(cohorts, "metaData") <- metaData.cohort @@ -288,17 +362,22 @@ getPlpData <- 
function( } - - # Remove temp tables: - renderedSql <- SqlRender::loadRenderTranslateSql( - "RemoveCohortTempTables.sql", - packageName = "PatientLevelPrediction", - dbms = dbms, + pathToSql <- system.file( + paste("sql/", "sql_server", + sep = ""), + "RemoveCohortTempTables.sql", + package = "PatientLevelPrediction" + ) + + removeSql <- readChar(pathToSql, file.info(pathToSql)$size) + removeSql <- SqlRender::translate( + sql = removeSql, + targetDialect = dbms, tempEmulationSchema = databaseDetails$tempEmulationSchema ) - DatabaseConnector::executeSql(connection, renderedSql, progressBar = FALSE, reportOverallTime = FALSE) + DatabaseConnector::executeSql(connection, removeSql, progressBar = FALSE, reportOverallTime = FALSE) #DatabaseConnector::disconnect(connection) metaData <- covariateData$metaData @@ -336,7 +415,7 @@ getPlpData <- function( print.plpData <- function(x, ...) { writeLines("plpData object") writeLines("") - writeLines(paste("At risk concept ID:", attr(x$cohorts, "metaData")$cohortId)) + writeLines(paste("At risk concept ID:", attr(x$cohorts, "metaData")$targetId)) writeLines(paste("Outcome concept ID(s):", paste(attr(x$outcomes, "metaData")$outcomeIds, collapse = ","))) } @@ -366,7 +445,7 @@ summary.plpData <- function(object,...){ print.summary.plpData <- function(x, ...) { writeLines("plpData object summary") writeLines("") - writeLines(paste("At risk cohort concept ID:", x$metaData$cohortId)) + writeLines(paste("At risk cohort concept ID:", x$metaData$targetId)) writeLines(paste("Outcome concept ID(s):", x$metaData$outcomeIds, collapse = ",")) writeLines("") writeLines(paste("People:", paste(x$people))) diff --git a/R/FeatureEngineering.R b/R/FeatureEngineering.R index 24b2d28b3..a5f4bc1ca 100644 --- a/R/FeatureEngineering.R +++ b/R/FeatureEngineering.R @@ -16,6 +16,30 @@ # limitations under the License. 
+featureEngineer <- function(data, featureEngineeringSettings){ + + ParallelLogger::logInfo('Starting Feature Engineering') + + # if a single setting, make it a list + if(inherits(featureEngineeringSettings, 'featureEngineeringSettings')){ + featureEngineeringSettings <- list(featureEngineeringSettings) + } + + for(featureEngineeringSetting in featureEngineeringSettings){ + fun <- attr(featureEngineeringSetting, "fun") + args <- list(trainData = data, + featureEngineeringSettings = featureEngineeringSetting) + ParallelLogger::logInfo(paste0('Applying ',fun)) + data <- do.call(eval(parse(text = fun)), args) + } + + attr(data, 'metaData')$featureEngineeringSettings <- featureEngineeringSettings + + ParallelLogger::logInfo('Done Feature Engineering') + + return(data) + +} #' Create the settings for defining any feature engineering that will be done #' @@ -23,7 +47,7 @@ #' Returns an object of class \code{featureEngineeringSettings} that specifies the sampling function that will be called and the settings #' #' @param type (character) Choice of: \itemize{ -#' \item{'none'}{ No feature engineering - this is the default } +#' \item'none' No feature engineering - this is the default #' } #' #' @return @@ -54,7 +78,11 @@ createFeatureEngineeringSettings <- function(type = 'none'){ #' @export createUnivariateFeatureSelection <- function(k = 100){ - checkIsClass(k, c('numeric','integer')) + if (inherits(k, 'numeric')) { + k <- as.integer(k) + } + + checkIsClass(k, 'integer') checkHigherEqual(k, 0) featureEngineeringSettings <- list(k = k) @@ -95,6 +123,296 @@ createRandomForestFeatureSelection <- function(ntrees = 2000, maxDepth = 17){ return(featureEngineeringSettings) } +#' Create the settings for adding a spline for continuous variables +#' +#' @details +#' Returns an object of class \code{featureEngineeringSettings} that specifies the sampling function that will be called and the settings +#' +#' @param continousCovariateId The covariateId to apply splines to +#' @param 
knots Either number of knots of vector of split values +#' @param analysisId The analysisId to use for the spline covariates +#' +#' @return +#' An object of class \code{featureEngineeringSettings} +#' @export +createSplineSettings <- function( + continousCovariateId, + knots, + analysisId = 683 + ){ + + checkIsClass(continousCovariateId, c('numeric','integer')) + checkIsClass(knots, c('numeric','integer')) + + featureEngineeringSettings <- list( + continousCovariateId = continousCovariateId, + knots = knots, + analysisId = analysisId + ) + + attr(featureEngineeringSettings, "fun") <- "splineCovariates" + class(featureEngineeringSettings) <- "featureEngineeringSettings" + + return(featureEngineeringSettings) +} + +splineCovariates <- function( + trainData, + featureEngineeringSettings, + knots = NULL + ){ + + ParallelLogger::logInfo('Starting splineCovariates') + + if(is.null(knots)){ + + if (length(featureEngineeringSettings$knots) == 1) { + measurements <- trainData$covariateData$covariates %>% + dplyr::filter(.data$covariateId == !!featureEngineeringSettings$continousCovariateId) %>% + as.data.frame() + knots <- measurements$covariateValue %>% + stats::quantile(seq(0.01, 0.99, length.out = featureEngineeringSettings$knots)) + } else { + knots <- featureEngineeringSettings$knots + } + + } + + # apply the spline mapping + trainData <- splineMap( + data = trainData, + covariateId = featureEngineeringSettings$continousCovariateId, + analysisId = featureEngineeringSettings$analysisId, + knots = knots + ) + + featureEngineering <- list( + funct = 'splineCovariates', + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + knots = knots + ) + ) + + # add the feature engineering in + attr(trainData, 'metaData')$featureEngineering = listAppend( + attr(trainData, 'metaData')$featureEngineering, + featureEngineering + ) + ParallelLogger::logInfo('Finished splineCovariates') + + return(trainData) +} + +# create the spline map to add spline columns 
+splineMap <- function( + data, + covariateId, + analysisId, + knots +){ + + ParallelLogger::logInfo('Starting splineMap') + measurements <- data$covariateData$covariates %>% + dplyr::filter(.data$covariateId == !!covariateId) %>% + as.data.frame() + + designMatrix <- splines::bs( + x = measurements$covariateValue,#knots[1]:knots[length(knots)], + knots = knots[2:(length(knots) - 1)], + Boundary.knots = knots[c(1, length(knots))] + ) + + data$covariateData$covariates <- data$covariateData$covariates %>% + dplyr::filter(.data$covariateId != !!covariateId) + + # get the covariate name + details <- data$covariateData$covariateRef %>% + dplyr::filter(.data$covariateId == !!covariateId) %>% + as.data.frame() + covariateName <- details$covariateName + + data$covariateData$covariateRef <- data$covariateData$covariateRef %>% + dplyr::filter(.data$covariateId != !!covariateId) + + # remove last 3 numbers as this was old analysis id + covariateId <- floor(covariateId/1000) + + # add the spline columns + for(i in 1:ncol(designMatrix)){ + Andromeda::appendToTable( + tbl = data$covariateData$covariates, + data = data.frame( + rowId = measurements$rowId, + covariateId = covariateId*10000+i*1000+analysisId, + covariateValue = designMatrix[,i] + ) + ) + } + + # add the covariates to the ref table + Andromeda::appendToTable( + tbl = data$covariateData$covariateRef, + data = data.frame( + covariateId = covariateId*10000+(1:(ncol(designMatrix)))*1000+analysisId, + covariateName = paste( + paste0(covariateName," spline component "), + 1:ncol(designMatrix) + ), + conceptId = 0, + analysisId = analysisId + ) + ) + + # add analysisRef for the first time a spline is added + analysisRef <- data$covariateData$analysisRef %>% as.data.frame() + if(!analysisId %in% analysisRef$analysisId){ + Andromeda::appendToTable( + tbl = data$covariateData$analysisRef, + data = data.frame( + analysisId = analysisId, + analysisName = 'splines', + domainId = 'feature engineering', + startDay = 0, + endDay = 
0, + isBinary = 'N', + missingMeansZero = 'N' + ) + ) + } + ParallelLogger::logInfo('Finished splineMap') + return(data) +} + + + +#' Create the settings for adding a spline for continuous variables +#' +#' @details +#' Returns an object of class \code{featureEngineeringSettings} that specifies how to do stratified imputation +#' +#' @param covariateId The covariateId that needs imputed values +#' @param ageSplits A vector of age splits in years to create age groups +#' +#' @return +#' An object of class \code{featureEngineeringSettings} +#' @export +createStratifiedImputationSettings <- function( + covariateId, + ageSplits = NULL +){ + + checkIsClass(covariateId, c('numeric','integer')) + checkIsClass(ageSplits, c('numeric','integer')) + + featureEngineeringSettings <- list( + covariateId = covariateId, + ageSplits = ageSplits + ) + + attr(featureEngineeringSettings, "fun") <- "stratifiedImputeCovariates" + class(featureEngineeringSettings) <- "featureEngineeringSettings" + + return(featureEngineeringSettings) +} + +stratifiedImputeCovariates <- function( + trainData, + featureEngineeringSettings, + stratifiedMeans = NULL +){ + + if(is.null(stratifiedMeans)){ + + stratifiedMeans <- calculateStratifiedMeans( + trainData = trainData, + featureEngineeringSettings = featureEngineeringSettings + ) + + } + + trainData <- imputeMissingMeans( + trainData = trainData, + covariateId = featureEngineeringSettings$covariateId, + ageSplits = featureEngineeringSettings$ageSplits, + stratifiedMeans = stratifiedMeans + ) + + return(trainData) +} + +calculateStratifiedMeans <- function( + trainData, + featureEngineeringSettings +){ + if(is.null(featureEngineeringSettings$ageSplits)){ + trainData$cohorts$ageGroup <- floor(trainData$cohorts$ageYear/5) + } else{ + trainData$cohorts$ageGroup <- rep(0, length(trainData$cohorts$ageYear)) + for(i in 1:length(featureEngineeringSettings$ageSplits)){ + trainData$cohorts$ageGroup[trainData$cohorts$ageYear > 
featureEngineeringSettings$ageSplits[i]] <- i + } + } + + trainData$covariateData$cohorts <- trainData$cohorts[,c('rowId', 'ageGroup', 'gender')] + + stratifiedMeans <- trainData$covariateData$covariates %>% + dplyr::filter(.data$covariateId == !!featureEngineeringSettings$covariateId) %>% + dplyr::inner_join( + y = trainData$covariateData$cohorts, + by = c('rowId') + ) %>% + dplyr::group_by(.data$ageGroup, .data$gender) %>% + dplyr::summarise(covariateValue = mean(.data$covariateValue, na.rm = TRUE)) %>% + as.data.frame() + + return(stratifiedMeans) +} + +imputeMissingMeans <- function( + trainData, + covariateId, + ageSplits, + stratifiedMeans +){ + + if(is.null(ageSplits)){ + trainData$cohorts$ageGroup <- floor(trainData$cohorts$ageYear/5) + } else{ + trainData$cohorts$ageGroup <- rep(0, length(trainData$cohorts$ageYear)) + for(i in 1:length(ageSplits)){ + trainData$cohorts$ageGroup[trainData$cohorts$ageYear > ageSplits[i]] <- i + } + } + + rowIdsWithValues <- trainData$covariateData$covariates %>% + dplyr::filter(.data$covariateId == !! 
covariateId) %>% + dplyr::select('rowId') %>% + dplyr::pull() + rowIdsWithMissingValues <- trainData$cohorts$rowId[!trainData$cohorts$rowId %in% rowIdsWithValues] + + + imputedData <- trainData$cohorts %>% + dplyr::filter(.data$rowId %in% rowIdsWithMissingValues) %>% + dplyr::select('rowId', 'ageGroup', 'gender') %>% + dplyr::left_join( + y = stratifiedMeans, + by = c('ageGroup', 'gender') + ) %>% + dplyr::mutate( + covariateId = !!covariateId, + covariateValue = .data$covariateValue + ) %>% + dplyr::select('rowId', 'covariateId', 'covariateValue') + + Andromeda::appendToTable( + tbl = trainData$covariateData$covariates, + data = imputedData + ) + + return(trainData) +} + univariateFeatureSelection <- function( trainData, featureEngineeringSettings, @@ -122,13 +440,15 @@ univariateFeatureSelection <- function( SelectKBest <- sklearn$feature_selection$SelectKBest chi2 <- sklearn$feature_selection$chi2 - kbest <- SelectKBest(chi2, k = featureEngineeringSettings$k)$fit(X, y) + kbest <- SelectKBest(chi2, k = featureEngineeringSettings$k)$fit(X, y$outcomeCount) kbest$scores_ <- np$nan_to_num(kbest$scores_) - threshold <- -np$sort(-kbest$scores_)[(featureEngineeringSettings$k-1)] - - inc <- kbest$scores_ >= threshold + + # taken from sklearn code, matches the application during transform call + k <- featureEngineeringSettings$k + mask <- np$zeros(length(kbest$scores_), dtype='bool') + mask[np$argsort(kbest$scores_, kind="mergesort")+1][(length(kbest$scores_)-k+1):length(kbest$scores_)] <- TRUE - covariateIdsInclude <- covariateMap[inc,]$covariateId + covariateIdsInclude <- covariateMap[mask,]$covariateId } trainData$covariateData$covariates <- trainData$covariateData$covariates %>% @@ -137,7 +457,7 @@ univariateFeatureSelection <- function( trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% dplyr::filter(.data$covariateId %in% covariateIdsInclude) - featureEngeering <- list( + featureEngineering <- list( funct = 'univariateFeatureSelection', 
settings = list( featureEngineeringSettings = featureEngineeringSettings, @@ -147,7 +467,7 @@ univariateFeatureSelection <- function( attr(trainData, 'metaData')$featureEngineering = listAppend( attr(trainData, 'metaData')$featureEngineering, - featureEngeering + featureEngineering ) return(trainData) @@ -184,7 +504,7 @@ randomForestFeatureSelection <- function( max_depth = featureEngineeringSettings$max_depth #17 rf = sklearn$ensemble$RandomForestClassifier( - max_features = 'auto', + max_features = 'sqrt', n_estimators = as.integer(ntrees), max_depth = as.integer(max_depth), min_samples_split = as.integer(2), @@ -226,25 +546,4 @@ randomForestFeatureSelection <- function( -featureEngineer <- function(data, featureEngineeringSettings){ - - ParallelLogger::logInfo('Starting Feature Engineering') - - # if a single setting, make it a list - if(class(featureEngineeringSettings) == 'featureEngineeringSettings'){ - featureEngineeringSettings <- list(featureEngineeringSettings) - } - - for(featureEngineeringSetting in featureEngineeringSettings){ - fun <- attr(featureEngineeringSetting, "fun") - args <- list(trainData = data, - featureEngineeringSettings = featureEngineeringSetting) - ParallelLogger::logInfo(paste0('Applying ',fun)) - data <- do.call(eval(parse(text = fun)), args) - } - - ParallelLogger::logInfo('Done Feature Engineering') - - return(data) - -} + diff --git a/R/FeatureImportance.R b/R/FeatureImportance.R index 7d2b934a6..6c73b1223 100644 --- a/R/FeatureImportance.R +++ b/R/FeatureImportance.R @@ -49,26 +49,37 @@ pfi <- function(plpResult, population, plpData, repeats = 1, if(is.null(covariates)){ covariates <- plpResult$model$covariateImportance %>% dplyr::filter(.data$covariateValue != 0) %>% - dplyr::select(.data$covariateId) %>% + dplyr::select("covariateId") %>% dplyr::pull() } # add code to format covariateData based on plpModel - # do feature engineering/selection - plpData$covariateData <- do.call( - applyFeatureengineering, - list(covariateData = 
plpData$covariateData, - settings = plpResult$model$settings$featureEngineering + + if(!is.null(plpResult$model$preprocessing$featureEngineering)){ + # do feature engineering/selection + ParallelLogger::logInfo('Running FE in model') + plpData <- do.call( + applyFeatureengineering, + list(plpData = plpData, + settings = plpResult$model$preprocessing$featureEngineering + ) ) - ) + } else{ + ParallelLogger::logInfo('No FE in model') + } - # do preprocessing - plpData$covariateData <- do.call( - applyTidyCovariateData, - list(covariateData = plpData$covariateData, - preprocessSettings = plpResult$model$settings$tidyCovariates + if(!is.null(plpResult$model$preprocessing$tidyCovariates)){ + # do preprocessing + ParallelLogger::logInfo('Applying data tidying done in model') + plpData$covariateData <- do.call( + applyTidyCovariateData, + list(covariateData = plpData$covariateData, + preprocessSettings = plpResult$model$preprocessing$tidyCovariates + ) ) - ) + } else{ + ParallelLogger::logInfo('No data tidying done in model') + } # apply prediction function pred <- do.call( @@ -94,23 +105,20 @@ pfi <- function(plpResult, population, plpData, repeats = 1, ParallelLogger::logInfo(paste0('Using all ', cores)) ParallelLogger::logInfo(paste0('Set cores input to use fewer...')) } - + getVpiSettings <- function(i) { + result <- list(plpModel = plpResult$model, + population = population, + plpDataLocation = plpDataLocation, + covariateId = covariates[i], + repeats = repeats) + return(result) + } + if (cores > 1) { cluster <- ParallelLogger::makeCluster(numberOfThreads = cores) ParallelLogger::clusterRequire(cluster, c("PatientLevelPrediction", "Andromeda")) - - getVpiSettings <- function(i){ - result <-list(plpModel = plpResult$model, - population = population, - plpDataLocation = plpDataLocation, - covariateId = covariates[i], - repeats = repeats) - return(result) - } vpiSettings <- lapply(1:length(covariates), getVpiSettings) - - #lapply(vpiSettings, function(x) 
do.call(permutePerf, x)) aucP <- ParallelLogger::clusterApply(cluster = cluster, x = vpiSettings, fun = permutePerf, @@ -118,13 +126,15 @@ pfi <- function(plpResult, population, plpData, repeats = 1, progressBar = TRUE) ParallelLogger::stopCluster(cluster) + } else { + ParallelLogger::logInfo("Running in serial") + aucP <- lapply(1:length(covariates), function(i) { + permutePerf(getVpiSettings(i)) + }) + } aucP <- do.call(c, aucP) - - # do this in parellel - varImp <- data.frame(covariateId = covariates, - pfi = auc-aucP) - + pfi = auc - aucP) return(varImp) } @@ -178,7 +188,7 @@ permute <- function(plpDataLocation,cId,population){ #get analysisId aId <- plpData$covariateData$covariateRef %>% dplyr::filter(.data$covariateId == !!cId) %>% - dplyr::select(.data$analysisId) %>% dplyr::collect() + dplyr::select("analysisId") %>% dplyr::collect() # if analysis id is not 3 (age group), 4 (race) or 5 (ethnicity) if(!aId$analysisId %in% c(3,4,5)){ @@ -189,7 +199,7 @@ permute <- function(plpDataLocation,cId,population){ # find a new random selection of people and give them the covariate and value newPlp <- sample(population$rowId,nSamp) - newData <- tibble::as_tibble(cbind(rowId = newPlp,coi[,-1])) + newData <- dplyr::as_tibble(cbind(rowId = newPlp,coi[,-1])) # swap old covariate data with new plpData$covariateData$covariates <- plpData$covariateData$covariates %>% dplyr::filter(.data$covariateId != !!cId) %>% dplyr::collect() @@ -204,20 +214,20 @@ permute <- function(plpDataLocation,cId,population){ # sample the pop to replace swapPlp <- sample(population$rowId,nSamp) - haveCidDataSwapped <- tibble::as_tibble(cbind(rowId = swapPlp,haveCidData[,-1])) + haveCidDataSwapped <- dplyr::as_tibble(cbind(rowId = swapPlp,haveCidData[,-1])) # find the swapped people to switch connectedCovs <- plpData$covariateData$covariateRef %>% dplyr::filter(.data$analysisId == !!aId$analysisId) %>% dplyr::group_by(.data$covariateId) %>% - dplyr::select(.data$covariateId) %>% + 
dplyr::select("covariateId") %>% dplyr::collect() plpToSwap <- plpData$covariateData$covariates %>% dplyr::filter(.data$covariateId %in% !!connectedCovs$covariateId) %>% dplyr::filter(.data$rowId %in% swapPlp) %>% dplyr::collect() - swappedForCid <- tibble::as_tibble(cbind(rowId = haveCidData$rowId[1:nrow(plpToSwap)],plpToSwap[,-1])) + swappedForCid <- dplyr::as_tibble(cbind(rowId = haveCidData$rowId[1:nrow(plpToSwap)],plpToSwap[,-1])) # swap old covariate data with new diff --git a/R/Fit.R b/R/Fit.R index a0a788f3b..30ef99710 100644 --- a/R/Fit.R +++ b/R/Fit.R @@ -29,24 +29,23 @@ #' data extracted from the CDM. #' @param modelSettings An object of class \code{modelSettings} created using one of the function: #' \itemize{ -#' \item{logisticRegressionModel()}{ A lasso logistic regression model} -#' \item{GBMclassifier()}{ A gradient boosting machine} -#' \item{RFclassifier()}{ A random forest model} -#' \item{GLMclassifier ()}{ A generalised linear model} -#' \item{KNNclassifier()}{ A KNN model} +#' \item setLassoLogisticRegression() A lasso logistic regression model +#' \item setGradientBoostingMachine() A gradient boosting machine +#' \item setRandomForest() A random forest model +#' \item setKNN() A KNN model #' } #' @param search The search strategy for the hyper-parameter selection (currently not used) #' @param analysisId The id of the analysis +#' @param analysisPath The path of the analysis #' @return #' An object of class \code{plpModel} containing: #' #' \item{model}{The trained prediction model} -#' \item{modelLoc}{The path to where the model is saved (if saved)} -#' \item{trainAuc}{The AUC obtained on the training set} -#' \item{trainCalibration}{The calibration obtained on the training set} -#' \item{modelSettings}{A list specifiying the model, preprocessing, outcomeId and cohortId} -#' \item{metaData}{The model meta data} -#' \item{trainingTime}{The time taken to train the classifier} +#' \item{preprocessing}{The preprocessing required when applying 
the model} +#' \item{prediction}{The cohort data.frame with the predicted risk column added} +#' \item{modelDesign}{A list specifiying the modelDesign settings used to fit the model} +#' \item{trainDetails}{The model meta data} +#' \item{covariateImportance}{The covariate importance for the model} #' #' #' @export @@ -54,7 +53,8 @@ fitPlp <- function( trainData, modelSettings, search = "grid", - analysisId + analysisId, + analysisPath ) { @@ -75,13 +75,18 @@ fitPlp <- function( fun <- eval(parse(text = modelSettings$fitFunction)) args <- list( trainData = trainData, - param = modelSettings$param, + modelSettings, # old: param = modelSettings$param, # make this model settings? search = search, - analysisId = analysisId + analysisId = analysisId, + analysisPath = analysisPath ) plpModel <- do.call(fun, args) ParallelLogger::logTrace('Returned from classifier function') + # adding trainDetails databaseId to all classifiers + # TODO - move other details into fit + plpModel$trainDetails$developmentDatabaseId = attr(trainData, "metaData")$cdmDatabaseId + class(plpModel) <- 'plpModel' return(plpModel) diff --git a/R/Formatting.R b/R/Formatting.R index 1aa584741..1e545b818 100644 --- a/R/Formatting.R +++ b/R/Formatting.R @@ -86,12 +86,12 @@ toSparseM <- function(plpData, cohort = NULL, map=NULL){ ParallelLogger::logInfo(paste0('toSparseM non temporal used')) - checkRam(newcovariateData, 0.9) # estimates size of RAM required and makes sure it is less that 90% + checkRam(newcovariateData) # estimates size of RAM required and prints it data <- Matrix::sparseMatrix( - i = newcovariateData$covariates %>% dplyr::select(.data$rowId) %>% dplyr::pull(), - j = newcovariateData$covariates %>% dplyr::select(.data$columnId) %>% dplyr::pull(), - x = newcovariateData$covariates %>% dplyr::select(.data$covariateValue) %>% dplyr::pull(), + i = newcovariateData$covariates %>% dplyr::select("rowId") %>% dplyr::pull(), + j = newcovariateData$covariates %>% dplyr::select("columnId") %>% 
dplyr::pull(), + x = newcovariateData$covariates %>% dplyr::select("covariateValue") %>% dplyr::pull(), dims=c(maxX,maxY) ) @@ -108,9 +108,13 @@ toSparseM <- function(plpData, cohort = NULL, map=NULL){ return(result) } -# this functions takes covariate data and a cohort/population and remaps -# the covaiate and row ids -# restricts to pop and saves/creates mapping +#' Map covariate and row Ids so they start from 1 +#' @description this functions takes covariate data and a cohort/population and remaps +#' the covariate and row ids, restricts to pop and saves/creates mapping +#' @param covariateData a covariateData object +#' @param cohort if specified rowIds restricted to the ones in cohort +#' @param mapping A pre defined mapping to use +#' @export MapIds <- function( covariateData, cohort = NULL, @@ -123,7 +127,7 @@ MapIds <- function( # change the rowIds in cohort (if exists) if(!is.null(cohort)){ rowMap <- data.frame( - rowId = cohort %>% dplyr::select(.data$rowId) + rowId = cohort %>% dplyr::select("rowId") ) rowMap$xId <- 1:nrow(rowMap) } else{ @@ -166,17 +170,16 @@ MapIds <- function( newCovariateData$rowMap <- rowMap newCovariateData$covariates <- newCovariateData$covariates %>% dplyr::inner_join(newCovariateData$rowMap, by = 'rowId') %>% - dplyr::select(- .data$rowId) %>% - dplyr::rename(rowId = .data$xId) + dplyr::select(- "rowId") %>% + dplyr::rename(rowId = "xId") if(!is.null(cohort)){ # change the rowId in labels newCovariateData$cohort <- cohort %>% dplyr::inner_join(rowMap, by = 'rowId') %>% - #dplyr::select(- .data$rowId) %>% dplyr::rename( - originalRowId = .data$rowId, - rowId = .data$xId + originalRowId = "rowId", + rowId = "xId" ) %>% dplyr::arrange(.data$rowId) # make sure it is ordered lowest to highest } @@ -188,22 +191,12 @@ MapIds <- function( return(newCovariateData) } -checkRam <- function(covariateData, maxPercent){ - - ensure_installed('memuse') +checkRam <- function(covariateData){ nrowV <- covariateData$covariates %>% 
dplyr::summarise(size = dplyr::n()) %>% dplyr::collect() estRamB <- (nrowV$size/1000000*24000984) - ramFree <- memuse::Sys.meminfo() - ramFree <- as.double(ramFree$freeram) - - if(0.9*ramFree1) + if(length(nthread)>1){ stop(paste('nthreads must be length 1')) - if(!class(seed)%in%c('numeric','NULL', 'integer')) + } + if(!inherits(x = seed, what = c('numeric','NULL', 'integer'))){ stop('Invalid seed') - if(!class(ntrees) %in% c("numeric", "integer")) - stop('ntrees must be a numeric value >0 ') - if(sum(ntrees < 1)>0) - stop('ntrees must be greater that 0 or -1') - if(!class(maxDepth) %in% c("numeric", "integer")) - stop('maxDepth must be a numeric value >0') - if(sum(maxDepth < 1)>0) + } + if(!inherits(x = ntrees, what = c("numeric", "integer"))){ + stop('ntrees must be a numeric value ') + } + if(sum(ntrees < 1) > 0 ){ + stop('ntrees must be greater than 0 or -1') + } + if(!inherits(x = maxDepth, what = c("numeric", "integer"))){ + stop('maxDepth must be a numeric value') + } + if(sum(maxDepth < 1) > 0){ stop('maxDepth must be greater that 0') - if(!class(minChildWeight) %in% c("numeric", "integer")) - stop('minChildWeight must be a numeric value >0') - if(sum(minChildWeight < 0)>0) + } + if(!inherits(x = minChildWeight, what = c("numeric", "integer"))){ + stop('minChildWeight must be a numeric value') + } + if(sum(minChildWeight < 0) > 0){ stop('minChildWeight must be greater that 0') - if(class(learnRate)!='numeric') - stop('learnRate must be a numeric value >0 and <= 1') - if(sum(learnRate <= 0)>0) + } + if(!inherits(x = learnRate, what = 'numeric')){ + stop('learnRate must be a numeric value') + } + if(sum(learnRate <= 0) > 0){ stop('learnRate must be greater that 0') - if(sum(learnRate > 1)>0) + } + if(sum(learnRate > 1) > 0){ stop('learnRate must be less that or equal to 1') - if(!class(earlyStopRound) %in% c("numeric", "integer", "NULL")) + } + if(!inherits(x = earlyStopRound, what = c("numeric", "integer", "NULL"))){ stop('incorrect class for 
earlyStopRound') - if (sum(lambda < 0)>0) + } + if (!inherits(x = lambda, what = c("numeric", "integer"))){ + stop('lambda must be a numeric value') + } + if (sum(lambda < 0) > 0){ stop('lambda must be 0 or greater') - if (!class(lambda) %in% c("numeric", "integer")) - stop('lambda must be a numeric value >= 0') - if (sum(alpha < 0)>0) + } + if(!inherits(x = alpha, what = c("numeric", "integer"))){ + stop('alpha must be a numeric value') + } + if (sum(alpha < 0) > 0){ stop('alpha must be 0 or greater') - if (!class(alpha) %in% c("numeric", "integer")) - stop('alpha must be a numeric value >= 0') - if (sum(scalePosWeight < 0)>0) - stop('scalePosWeight must be 0 or greater') - if (!class(scalePosWeight) %in% c("numeric", "integer")) + } + if (!inherits(x = scalePosWeight, what = c("numeric", "integer"))){ stop('scalePosWeight must be a numeric value >= 0') + } + if (sum(scalePosWeight < 0) > 0){ + stop('scalePosWeight must be 0 or greater') + } + + paramGrid <- list( + ntrees = ntrees, + earlyStopRound = earlyStopRound, + maxDepth = maxDepth, + minChildWeight = minChildWeight, + learnRate = learnRate, + lambda = lambda, + alpha = alpha, + scalePosWeight = scalePosWeight + ) - - param <- split( - expand.grid( - ntrees=ntrees, - earlyStopRound=earlyStopRound, - maxDepth=maxDepth, - minChildWeight=minChildWeight, - learnRate=learnRate, - lambda=lambda, - alpha=alpha, - scalePosWeight=scalePosWeight - ), - 1:(length(ntrees)*length(maxDepth)*length(minChildWeight)*length(learnRate)* - length(earlyStopRound)*length(lambda)*length(alpha)*length(scalePosWeight)) - ) + param <- listCartesian(paramGrid) attr(param, 'settings') <- list( - modeType = 'Xgboost', + modelType = 'Xgboost', seed = seed[[1]], modelName = "Gradient Boosting Machine", threads = nthread[1], @@ -131,8 +146,8 @@ varImpXgboost <- function( varImp <- merge(covariateMap, varImp, by.x='columnId', by.y='Feature') varImp <- varImp %>% dplyr::mutate(included = 1) %>% - dplyr::rename(covariateValue = .data$Gain) 
%>% - dplyr::select(.data$covariateId, .data$covariateValue, .data$included) + dplyr::rename(covariateValue = "Gain") %>% + dplyr::select("covariateId", "covariateValue", "included") return(varImp) @@ -144,13 +159,13 @@ predictXgboost <- function( cohort ){ - if(class(data) == 'plpData'){ + if(inherits(data , 'plpData')){ # convert matrixObjects <- toSparseM( plpData = data, cohort = cohort, map = plpModel$covariateImportance %>% - dplyr::select(.data$columnId, .data$covariateId) + dplyr::select("columnId", "covariateId") ) # use the include?? @@ -162,7 +177,7 @@ predictXgboost <- function( newData <- data } - if(class(plpModel) == 'plpModel'){ + if(inherits(plpModel, 'plpModel')){ model <- plpModel$model } else{ model <- plpModel @@ -175,8 +190,8 @@ predictXgboost <- function( # fix the rowIds to be the old ones? # now use the originalRowId and remove the matrix rowId prediction <- prediction %>% - dplyr::select(-.data$rowId) %>% - dplyr::rename(rowId = .data$originalRowId) + dplyr::select(-"rowId") %>% + dplyr::rename(rowId = "originalRowId") attr(prediction, "metaData") <- list(modelType = attr(plpModel, "modelType")) @@ -213,7 +228,7 @@ fitXgboost <- function( outcomes <- sum(labels$outcomeCount>0) N <- nrow(labels) outcomeProportion <- outcomes/N - + set.seed(settings$seed) model <- xgboost::xgb.train( data = train, params = list( diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R index c18e8e153..d73a4e9f0 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -1,14 +1,25 @@ # fix issue with nrow - temp fix for me locally nrow <- function(x){UseMethod("nrow",x)} +#' @exportS3Method NULL nrow.default <- base::nrow +#' @exportS3Method NULL nrow.tbl <- function(x){x %>% dplyr::tally() %>% dplyr::pull()} +removeInvalidString <- function(string){ + modString <- gsub('_', ' ', string) + modString <- gsub('\\.', ' ', modString) + modString <- gsub("[[:punct:]]", "", modString) + modString <- gsub(' ', '_', modString) + return(modString) +} + + # Borrowed 
from devtools: https://github.com/hadley/devtools/blob/ba7a5a4abd8258c52cb156e7b26bb4bf47a79f0b/R/utils.r#L44 -is_installed <- function (pkg, version = 0) { +is_installed <- function (pkg) { installed_version <- tryCatch(utils::packageVersion(pkg), error = function(e) NA) - !is.na(installed_version) && installed_version >= version + !is.na(installed_version) } # Borrowed and adapted from devtools: https://github.com/hadley/devtools/blob/ba7a5a4abd8258c52cb156e7b26bb4bf47a79f0b/R/utils.r#L74 @@ -18,7 +29,7 @@ ensure_installed <- function(pkg) { if (interactive()) { message(msg, "\nWould you like to install it?") if (utils::menu(c("Yes", "No")) == 1) { - if(pkg%in%c('BigKnn', "IterativeHardThresholding")){ + if(pkg%in%c('BigKnn', "IterativeHardThresholding", "ShinyAppBuilder", "ResultModelManager")){ # add code to check for devtools... dvtCheck <- tryCatch(utils::packageVersion('devtools'), @@ -40,10 +51,13 @@ ensure_installed <- function(pkg) { } } +#' Create a temporary model location +#' +#' @export createTempModelLoc <- function(){ repeat{ - loc <- paste(tempdir(), paste0('python_models_',sample(10002323,1)), sep = '\\') - #loc <- file.path(tempdir(), paste0('python_models_',sample(10002323,1))) + ##loc <- paste(tempdir(), paste0('python_models_',sample(10002323,1)), sep = '\\') + loc <- file.path(tempdir(), paste0('python_models_',sample(10002323,1))) if(!dir.exists(loc)){ return(loc) } @@ -82,45 +96,48 @@ listAppend <- function(a, b){ #' #' @param envname A string for the name of the virtual environment (default is 'PLP') #' @param envtype An option for specifying the environment as'conda' or 'python'. 
If NULL then the default is 'conda' for windows users and 'python' for non-windows users +#' @param condaPythonVersion String, Python version to use when creating a conda environment #' #' @export -configurePython <- function(envname='PLP', envtype=NULL){ +configurePython <- function(envname='PLP', envtype=NULL, condaPythonVersion="3.11"){ if(is.null(envtype)){ if(getOs()=='windows'){ - envtype=='conda' + envtype <- "conda" } else { - envtype=='python' + envtype <- "python" } } if(envtype=='conda'){ pEnvironments <- reticulate::conda_list() if(length(pEnvironments) > 0 && envname %in% pEnvironments$name){ - warning(paste0('Conda environment ', envname,' exists. You can use removePython() to remove if you want to fresh config')) + location <- '' + warning(paste0('Conda environment ', envname,' exists. You can use reticulate::conda_remove() to remove if you want to fresh config')) } else { ParallelLogger::logInfo(paste0('Creating virtual conda environment called ', envname)) - location <- reticulate::conda_create(envname=envname, packages = "python", conda = "auto") + location <- reticulate::conda_create(envname=envname, packages = paste0("python==", condaPythonVersion), conda = "auto") } - packages <- c('numpy','scipy','scikit-learn', 'pandas','pydotplus','joblib', 'sklearn-json') + packages <- c('numpy','scipy','scikit-learn', 'pandas','pydotplus','joblib') ParallelLogger::logInfo(paste0('Adding python dependancies to ', envname)) reticulate::conda_install(envname=envname, packages = packages, forge = TRUE, pip = FALSE, pip_ignore_installed = TRUE, conda = "auto") } else { pEnvironments <- reticulate::virtualenv_list() if(length(pEnvironments) > 0 && envname %in% pEnvironments){ - warning(paste0('Python environment ', envname,' exists. 
You can use removePython() to remove if you want to fresh config')) + location <- '' + warning(paste0('Python environment ', envname,' exists.')) } else { ParallelLogger::logInfo(paste0('Creating virtual python environment called ', envname)) location <- reticulate::virtualenv_create(envname=envname) } - packages <- c('numpy', 'scikit-learn','scipy', 'pandas','pydotplus','sklearn-json') + packages <- c('numpy', 'scikit-learn','scipy', 'pandas','pydotplus') ParallelLogger::logInfo(paste0('Adding python dependancies to ', envname)) reticulate::virtualenv_install(envname=envname, packages = packages, ignore_installed = TRUE) } - return(location) + return(invisible(location)) } #' Use the virtual environment created using configurePython() @@ -176,3 +193,100 @@ getOs <- function(){ tolower(os) } + +# Borrowed and adapted from Hmisc: https://github.com/harrelfe/Hmisc/blob/39011dae3af3c943e67401ed6000644014707e8b/R/cut2.s +cut2 <- function(x, g, m = 150, digits = 3) { + + method <- 1 ## 20may02 + x.unique <- sort(unique(c(x[!is.na(x)]))) + min.dif <- min(diff(x.unique))/2 + min.dif.factor <- 1 + + oldopt <- options('digits') + options(digits=digits) + on.exit(options(oldopt)) + + xlab <- attr(x, 'label') + + nnm <- sum(!is.na(x)) + if(missing(g)) g <- max(1,floor(nnm/m)) + if(g < 1) + stop('g must be >=1, m must be positive') + + options(digits=15) + n <- table(x) + xx <- as.double(names(n)) + options(digits = digits) + cum <- cumsum(n) + m <- length(xx) + + y <- as.integer(ifelse(is.na(x),NA,1)) + labs <- character(g) + cuts <- stats::approx(cum, xx, xout=(1:g)*nnm/g, + method='constant', rule=2, f=1)$y + cuts[length(cuts)] <- max(xx) + lower <- xx[1] + upper <- 1e45 + up <- low <- double(g) + i <- 0 + for(j in 1:g) { + cj <- if(method==1 || j==1) cuts[j] else { + if(i==0) + stop('program logic error') + # Not used unique values found in table(x) + s <- if(is.na(lower)) FALSE else xx >= lower + cum.used <- if(all(s)) 0 else max(cum[!s]) + if(j==m) max(xx) else 
if(sum(s)<2) max(xx) else + stats::approx(cum[s]-cum.used, xx[s], xout=(nnm-cum.used)/(g-j+1), + method='constant', rule=2, f=1)$y + } + + if(cj==upper) next + + i <- i + 1 + upper <- cj + # assign elements to group i + # y contains the group number in the end + y[x >= (lower-min.dif.factor*min.dif)] <- i + low[i] <- lower + lower <- if(j==g) upper else min(xx[xx > upper]) + + if(is.na(lower)) lower <- upper + + up[i] <- lower + } + + low <- low[1:i] + up <- up[1:i] + # Are the bounds different? + variation <- logical(i) + for(ii in 1:i) { + r <- range(x[y==ii], na.rm=TRUE) + variation[ii] <- diff(r) > 0 + } + flow <- do.call(format,c(list(low), digits = 3)) + fup <- do.call(format,c(list(up), digits = 3)) + bb <- c(rep(')',i-1),']') + labs <- ifelse(low==up | (!variation), flow, + paste('[',flow,',',fup,bb,sep='')) + ss <- y==0 & !is.na(y) + if(any(ss)) + stop(paste('categorization error in cut2. Values of x not appearing in any interval:\n', + paste(format(x[ss],digits=12),collapse=' '), + '\nLower endpoints:', + paste(format(low,digits=12), collapse=' '), + '\nUpper endpoints:', + paste(format(up,digits=12),collapse=' '))) + + y <- structure(y, class='factor', levels=labs) + + attr(y,'class') <- "factor" + if(length(xlab)){ + #label(y) <- xlab # what is label? 
+ # think the below does the same as the line above + class(y) <- 'labelled' + attr(y, 'label') <- xlab + } + + return(y) +} diff --git a/R/ImportFromCsv.R b/R/ImportFromCsv.R new file mode 100644 index 000000000..61536acd3 --- /dev/null +++ b/R/ImportFromCsv.R @@ -0,0 +1,584 @@ +#' Function to insert results into a database from csvs +#' @description +#' This function converts a folder with csv results into plp objects and loads them into a plp result database +#' +#' @details +#' The user needs to have plp csv results in a single folder and an existing plp result database +#' +#' @param csvFolder The location to the csv folder with the plp results +#' @param connectionDetails A connection details for the plp results database that the csv results will be inserted into +#' @param databaseSchemaSettings A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables to insert the csv results into +#' @param modelSaveLocation The location to save any models from the csv folder - this should be the same location you picked when inserting other models into the database +#' @param csvTableAppend A string that appends the csv file names +#' +#' @return +#' Returns a data.frame indicating whether the results were inported into the database +#' +#' @export +insertCsvToDatabase <- function( + csvFolder, + connectionDetails, + databaseSchemaSettings, + modelSaveLocation, + csvTableAppend = '' +){ + + ensure_installed('readr') + + ParallelLogger::logInfo('Starting input checks') + + csvFileNames <- tryCatch({ + dir(csvFolder, pattern = 'csv') + }, + error = function(e){ParallelLogger::logInfo(e); return(NULL)} + ) + if(is.null(csvFileNames)){ + return(invisible(NULL)) + } + + if(!missing(csvTableAppend)){ + csvFileNamesNoAppend <- sub(csvTableAppend, '', csvFileNames) + } else{ + csvFileNamesNoAppend <- csvFileNames + } + + # check all tables are in folder + # settings/resultsDataModelSpecification.csv table_name + resultNames <- 
paste0(unique( + readr::read_csv( + system.file( + 'settings', + 'resultsDataModelSpecification.csv', + package = "PatientLevelPrediction" + ) + )$table_name + ), '.csv') + if(sum(csvFileNamesNoAppend %in% resultNames) != length(resultNames)){ + missingTables <- paste(resultNames[!resultNames %in% csvFileNamesNoAppend], collapse = ',') + ParallelLogger::logInfo(paste0('CSV folder missing these tables: ', missingTables)) + return(invisible(NULL)) + } + + alltables <- getTableNamesPlp( + connectionDetails = connectionDetails, + databaseSchema = databaseSchemaSettings$resultSchema + ) + + if(!tolower(paste0(databaseSchemaSettings$tablePrefix,'PERFORMANCES')) %in% alltables){ + ParallelLogger::logInfo( + paste0( + 'performance table: ',paste0(toupper(databaseSchemaSettings$tablePrefix),'PERFORMANCES'),' not found, result database only contains ', + paste(alltables, collapse = ',') + ) + ) + return(invisible(NULL)) + } + + ParallelLogger::logInfo('Input checks passed') + ParallelLogger::logInfo('Extracting cohort definitions') + # create cohortDefinitions: + cohortDefinitions <- extractCohortDefinitionsCSV( + csvFolder = csvFolder + ) + + ParallelLogger::logInfo('Extracting database details') + # create databaseList + databaseList <- extractDatabaseListCSV( + csvFolder = csvFolder + ) + + ParallelLogger::logInfo('Extracting performance ids') + performanceIds <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('performances', csvFileNames)]))$performance_id + + if(length(performanceIds) > 0 ){ + for(performanceId in performanceIds){ + ParallelLogger::logInfo( + paste0( + 'Converting and inserting performance id', + performanceId + ) + ) + # convert to runPlp + runPlp <- extractObjectFromCsv( + performanceId = performanceId, + csvFolder = csvFolder + ) + + # load into database + addRunPlpToDatabase( + runPlp = runPlp, + connectionDetails = connectionDetails, + databaseSchemaSettings = databaseSchemaSettings, + cohortDefinitions = cohortDefinitions, + 
modelSaveLocation = modelSaveLocation, + databaseList = databaseList + ) + } + } + + diagnosticIds <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('diagnostics', csvFileNames)]))$diagnostic_id + + if(length(diagnosticIds) > 0){ + for(diagnosticId in diagnosticIds){ + ParallelLogger::logInfo( + paste0( + 'Converting and inserting diagnostic id', + diagnosticId + ) + ) + diagnosePlp <- extractDiagnosticFromCsv( + diagnosticId = diagnosticId, + csvFolder = csvFolder + ) + if(!is.null(diagnosePlp)){ + tryCatch( + { + addDiagnosePlpToDatabase( + diagnosePlp = diagnosePlp, + connectionDetails = connectionDetails, + databaseSchemaSettings = databaseSchemaSettings, + cohortDefinitions = cohortDefinitions, + databaseList = databaseList + ) + }, error = function(e){ParallelLogger::logError(e)} + ) + } + + } + } + + + return(TRUE) + +} + + + + + +extractCohortDefinitionsCSV <- function( + csvFolder +){ + + # cohorts: cohort_id, cohort_definition_id, cohort_name + # cohort_definition: cohort_definition_id cohort_name description json sql_command + + cohortDefinitionName <- dir(csvFolder, pattern = 'cohort_definition.csv') + cohort_definition <- readr::read_csv(file.path(csvFolder, cohortDefinitionName)) + + result <- data.frame( + cohortId = cohort_definition$cohort_definition_id, + cohortName = cohort_definition$cohort_name, + json = cohort_definition$json, + sql = cohort_definition$sql_command + ) + + return(result) +} + +extractDatabaseListCSV <- function( + csvFolder +){ + # database_meta_data: database_id cdm_source_name cdm_source_abbreviation + # database_details: database_id database_meta_data_id + databaseMetaDataName <- dir(csvFolder, pattern = 'database_meta_data.csv') + databaseMetaData <- readr::read_csv(file.path(csvFolder, databaseMetaDataName)) + + databaseList <- createDatabaseList( + cdmDatabaseSchemas = databaseMetaData$cdm_source_name, + cdmDatabaseNames = databaseMetaData$cdm_source_abbreviation, + databaseRefIds = databaseMetaData$database_id + 
) + + return(databaseList) +} + + +getModelDesignSettingTable <- function(modeldesignsRow){ + result <- data.frame( + tableName = c('cohorts', 'cohorts', + 'population_settings', 'plp_data_settings', + 'model_settings', 'covariate_settings', 'sample_settings', + 'split_settings', 'feature_engineering_settings', + 'tidy_covariates_settings'), + idColumn = c('cohort_id', 'cohort_id', + 'population_setting_id', 'plp_data_setting_id', + 'model_setting_id', 'covariate_setting_id', 'sample_setting_id', + 'split_setting_id', 'feature_engineering_setting_id', + 'tidy_covariates_setting_id'), + jsonColumn = c('cohort_definition_id', 'cohort_definition_id', + 'population_settings_json', 'plp_data_settings_json', + 'model_settings_json', 'covariate_settings_json', 'sample_settings_json', + 'split_settings_json', 'feature_engineering_settings_json', + 'tidy_covariates_settings_json'), + convertJson = c(rep(F,2), rep(T, 8)), + value = c(modeldesignsRow$target_id, modeldesignsRow$outcome_id, + modeldesignsRow$population_setting_id, modeldesignsRow$plp_data_setting_id, + modeldesignsRow$model_setting_id, modeldesignsRow$covariate_setting_id, modeldesignsRow$sample_setting_id, + modeldesignsRow$split_setting_id, modeldesignsRow$feature_engineering_setting_id , + modeldesignsRow$tidy_covariates_setting_id), + modelDesignInput = c('targetId', 'outcomeId', + 'populationSettings', 'restrictPlpDataSettings', + 'modelSettings', 'covariateSettings', 'sampleSettings', + 'splitSettings', 'featureEngineeringSettings', + 'preprocessSettings') + ) + return(result) +} + +getModelDesignCsv <- function( + modelDesignSettingTable, + csvFolder = csvFolder +) { + + csvFileNames <- dir(csvFolder, pattern = '.csv') + + result <- list() + for(i in 1:nrow(modelDesignSettingTable)){ + table <- readr::read_csv(file.path(csvFolder, csvFileNames[grep(modelDesignSettingTable$tableName[i], csvFileNames)])) + ind <- table[modelDesignSettingTable$idColumn[i]] == modelDesignSettingTable$value[i] + result[[i]] 
<- table[ind,][modelDesignSettingTable$jsonColumn[i]] + if(modelDesignSettingTable$convertJson[i]){ + result[[i]] <- ParallelLogger::convertJsonToSettings(as.character(result[[i]])) + } else{ + # ids need to be integer + result[[i]] <- as.double(result[[i]]) + } + } + names(result) <- modelDesignSettingTable$modelDesignInput + + modelDesign <- do.call(what = PatientLevelPrediction::createModelDesign, args = result) + + return(modelDesign) +} + +getPerformanceEvaluationCsv <- function( + performanceId, + csvFolder +){ + + csvFileNames <- dir(csvFolder, pattern = '.csv') + + result <- list( + + evaluationStatistics = tryCatch( + { + res <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('evaluation_statistics', csvFileNames)])) %>% + dplyr::filter(.data$performance_id == !!performanceId) %>% + dplyr::select(-"performance_id"); + colnames(res) <- SqlRender::snakeCaseToCamelCase( colnames(res)); + res + }, + error = function(e){ParallelLogger::logInfo(e); return(NULL)} + ), + + thresholdSummary = tryCatch({ + res <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('threshold_summary', csvFileNames)])) %>% + dplyr::filter(.data$performance_id == !!performanceId) %>% + dplyr::select(-"performance_id"); + colnames(res) <- SqlRender::snakeCaseToCamelCase( colnames(res)); + res + }, + error = function(e){ParallelLogger::logInfo(e); return(NULL)} + ), + + calibrationSummary = tryCatch({ + res <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('calibration_summary', csvFileNames)])) %>% + dplyr::filter(.data$performance_id == !!performanceId) %>% + dplyr::select(-"performance_id"); + colnames(res) <- SqlRender::snakeCaseToCamelCase( colnames(res)); + res + }, + error = function(e){ParallelLogger::logInfo(e); return(NULL)} + ), + + demographicSummary = tryCatch({ + res <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('demographic_summary', csvFileNames)])) %>% + dplyr::filter(.data$performance_id == !!performanceId) %>% + 
dplyr::select(-"performance_id"); + colnames(res) <- SqlRender::snakeCaseToCamelCase( colnames(res)); + res + }, + error = function(e){ParallelLogger::logInfo(e); return(NULL)} + ), + + predictionDistribution = tryCatch({ + res <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('prediction_distribution', csvFileNames)])) %>% + dplyr::filter(.data$performance_id == !!performanceId) %>% + dplyr::select(-"performance_id"); + colnames(res) <- SqlRender::snakeCaseToCamelCase( colnames(res)); + res + }, + error = function(e){ParallelLogger::logInfo(e); return(NULL)} + ) + ) + + return(result) + +} + +extractObjectFromCsv <- function( + performanceId, + csvFolder +){ + + csvFileNames <- dir(csvFolder, pattern = '.csv') + + # get the model design + # performance_id model_design_id development_database_id validation_database_id target_id outcome_id tar_id plp_data_setting_id population_setting_id model_development execution_date_time plp_version + performances <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('performances', csvFileNames)])) + poi <- performances[performances$performance_id == performanceId,,] + + modelDesignId <- poi$model_design_id + modeldesigns <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('model_designs', csvFileNames)])) + # model_design_id target_id outcome_id tar_id plp_data_setting_id population_setting_id model_setting_id covariate_setting_id sample_setting_id split_setting_id feature_engineering_setting_id tidy_covariates_setting_id + modeldesigns <- modeldesigns[modeldesigns$model_design_id == modelDesignId,,] + + modelDesignSettingTable <- getModelDesignSettingTable( + modeldesignsRow = modeldesigns + ) + + modelDesign <- getModelDesignCsv( + modelDesignSettingTable = modelDesignSettingTable, + csvFolder = csvFolder + ) + + covariateSummary <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('covariate_summary', csvFileNames)])) %>% + dplyr::filter(.data$performance_id == !!poi$performance_id) %>% + 
dplyr::select(-"performance_id") + colnames(covariateSummary) <- SqlRender::snakeCaseToCamelCase(colnames(covariateSummary)) + + performanceEvaluation <- getPerformanceEvaluationCsv( + performanceId = poi$performance_id, + csvFolder = csvFolder + ) + + modelMissing <- F + if(poi$model_development == 1){ + + modelsName <- dir(csvFolder, pattern = 'models.csv') + models <- readr::read_csv(file.path(csvFolder, modelsName)) + models <- models %>% + dplyr::filter(.data$model_design_id == !!poi$model_design_id ) %>% + dplyr::filter(.data$database_id == !!poi$development_database_id) + + modelLoc <- strsplit(x = models$plp_model_file, split = '/')[[1]][length(strsplit(x = models$plp_model_file, split = '/')[[1]])] + plpModel <- tryCatch({ + PatientLevelPrediction::loadPlpModel(file.path(csvFolder, 'models', modelLoc)) + }, + error = function(e){ParallelLogger::logInfo(e); return(NULL)} + ) + + resultClass <- 'runPlp' + + if(is.null(modelLoc)){ + ParallelLogger::logInfo('Models missing from csv folder - just adding performance') + modelMissing <- T + } + + } + + if(poi$model_development == 0 | modelMissing){ + + # database_details: database_id database_meta_data_id + databaseMetaDataName <- dir(csvFolder, pattern = 'database_meta_data.csv') + databaseMetaData <- readr::read_csv(file.path(csvFolder, databaseMetaDataName)) + databaseDetailsName <- dir(csvFolder, pattern = 'database_details.csv') + databaseDetails <- readr::read_csv(file.path(csvFolder, databaseDetailsName)) + databases <- merge(databaseDetails, databaseMetaData, by.x = 'database_meta_data_id', by.y = 'database_id') + + dev <- databases[databases$database_id == poi$development_database_id,,] + val <- databases[databases$database_id == poi$validation_database_id,,] + + developmentDatabase <- dev$cdm_source_name + developmentDatabaseId <- dev$database_meta_data_id + validationDatabase <- val$cdm_source_name + validationDatabaseId <- val$database_meta_data_id + + attritionName <- dir(csvFolder, pattern = 
'attrition.csv') + attrition <- readr::read_csv(file.path(csvFolder, attritionName)) %>% + dplyr::filter(.data$performance_id == !!poi$performance_id) %>% + dplyr::select(-"performance_id") + colnames(attrition) <- SqlRender::snakeCaseToCamelCase(colnames(attrition)) + + cohortsName <- dir(csvFolder, pattern = 'cohorts.csv') + cohorts <- readr::read_csv(file.path(csvFolder, cohortsName)) + plpDataSetName <- dir(csvFolder, pattern = 'plp_data_settings.csv') + plpDataSet <- readr::read_csv(file.path(csvFolder, plpDataSetName)) + popSetName <- dir(csvFolder, pattern = 'population_settings.csv') + popSet <- readr::read_csv(file.path(csvFolder, popSetName)) + + # get the model + plpModel <- list( + model = 'external validation of model', + modelDesign = modelDesign, + validationDetails = list( + analysisId = '', + analysisSource = '', + developmentDatabase = developmentDatabase, + developmentDatabaseId = developmentDatabaseId, + validationDatabase = validationDatabase, + validationDatabaseId = validationDatabaseId, + + populationSettings = ParallelLogger::convertJsonToSettings( + as.character( + popSet %>% + dplyr::filter(.data$population_setting_id == !!poi$population_setting_id) %>% + dplyr::select("population_settings_json") + ) + ), + restrictPlpDataSettings = ParallelLogger::convertJsonToSettings( + as.character( + plpDataSet %>% + dplyr::filter(.data$plp_data_setting_id == !!poi$plp_data_setting_id) %>% + dplyr::select("plp_data_settings_json") + ) + ), + + outcomeId = as.double( + cohorts %>% + dplyr::filter(.data$cohort_id == !!poi$outcome_id) %>% + dplyr::select("cohort_definition_id") + ), + targetId = as.double( + cohorts %>% + dplyr::filter(.data$cohort_id == !!poi$target_id) %>% + dplyr::select("cohort_definition_id") + ), + + attrition = attrition + ) + ) + attr(plpModel, "predictionFunction") <- 'none' + attr(plpModel, "saveType") <- 'RtoJson' + class(plpModel) <- 'plpModel' + + resultClass <- 'externalValidatePlp' + } + + + result <- list( + 
executionSummary = list( + PackageVersion = list( + packageVersion = poi$plp_version + ), + #TotalExecutionElapsedTime = , + ExecutionDateTime = poi$execution_date_time + ), + model = plpModel, + performanceEvaluation = performanceEvaluation, + covariateSummary = covariateSummary, + analysisRef = list( + analysisId = '' + ) + ) + class(result) <- resultClass + + # return the object + return(result) + +} + +extractDiagnosticFromCsv <- function( + diagnosticId, + csvFolder +){ + + # diagnostic_id model_design_id database_id execution_date_time + csvFileNames <- dir(csvFolder, pattern = '.csv') + + # get the model design + # performance_id model_design_id development_database_id validation_database_id target_id outcome_id tar_id plp_data_setting_id population_setting_id model_development execution_date_time plp_version + diagnostics <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('diagnostics', csvFileNames)])) + if(length(diagnostics) == 0){ + ParallelLogger::logInfo('No diagnostics in csv results') + return(NULL) + } + doi <- diagnostics[diagnostics$diagnostic_id == diagnosticId,,] + if(nrow(doi) == 0){ + ParallelLogger::logInfo('No diagnostics in csv results with specified diagnosticId') + return(NULL) + } + + modelDesignId <- doi$model_design_id + modeldesigns <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('model_designs', csvFileNames)])) + # model_design_id target_id outcome_id tar_id plp_data_setting_id population_setting_id model_setting_id covariate_setting_id sample_setting_id split_setting_id feature_engineering_setting_id tidy_covariates_setting_id + modeldesigns <- modeldesigns[modeldesigns$model_design_id == modelDesignId,,] + + modelDesignSettingTable <- getModelDesignSettingTable( + modeldesignsRow = modeldesigns + ) + + modelDesign <- getModelDesignCsv( + modelDesignSettingTable = modelDesignSettingTable, + csvFolder = csvFolder + ) + + databaseMetaDataName <- dir(csvFolder, pattern = 'database_meta_data.csv') + databaseMetaData 
<- readr::read_csv(file.path(csvFolder, databaseMetaDataName)) + databaseDetailsName <- dir(csvFolder, pattern = 'database_details.csv') + databaseDetails <- readr::read_csv(file.path(csvFolder, databaseDetailsName)) + databases <- merge(databaseDetails, databaseMetaData, by.x = 'database_meta_data_id', by.y = 'database_id') + + db <- databases[databases$database_id == doi$database_id] + + databaseSchema <- db$cdm_source_name + databaseId <- db$database_meta_data_id + + outcomesName <- dir(csvFolder, pattern = 'diagnostic_outcomes.csv') + outcomes <- readr::read_csv(file.path(csvFolder, outcomesName)) %>% + dplyr::filter(.data$diagnostic_id == !! diagnosticId) %>% + dplyr::select(-"diagnostic_id") + colnames(outcomes) <- SqlRender::snakeCaseToCamelCase(colnames(outcomes)) + + predictorsName <- dir(csvFolder, pattern = 'diagnostic_predictors.csv') + predictors <- readr::read_csv(file.path(csvFolder, predictorsName)) %>% + dplyr::filter(.data$diagnostic_id == !! diagnosticId) %>% + dplyr::select(-"diagnostic_id") + colnames(predictors) <- SqlRender::snakeCaseToCamelCase(colnames(predictors)) + + participantsName <- dir(csvFolder, pattern = 'diagnostic_participants.csv') + participants <- readr::read_csv(file.path(csvFolder, participantsName)) %>% + dplyr::filter(.data$diagnostic_id == !! diagnosticId) %>% + dplyr::select(-"diagnostic_id") + colnames(participants) <- SqlRender::snakeCaseToCamelCase(colnames(participants)) + + summaryName <- dir(csvFolder, pattern = 'diagnostic_summary.csv') + summary <- readr::read_csv(file.path(csvFolder, summaryName)) %>% + dplyr::filter(.data$diagnostic_id == !! 
diagnosticId) %>% + dplyr::select(-"diagnostic_id") + colnames(summary) <- SqlRender::snakeCaseToCamelCase(colnames(summary)) + + result <- list( + summary = summary, + participants = participants, + predictors = predictors, + outcomes = outcomes, + designs = NULL, + modelDesign = modelDesign, + databaseSchema = databaseSchema, + databaseId = databaseId + ) + + class(result) <- 'diagnosePlp' + + return(result) +} + + +getTableNamesPlp <- function( + connectionDetails, + databaseSchema +){ + + # check some plp tables exists in databaseSchemaSettings + conn <- DatabaseConnector::connect(connectionDetails) + on.exit(DatabaseConnector::disconnect(conn)) + + result <- DatabaseConnector::getTableNames( + connection = conn, + databaseSchema = databaseSchema + ) + + return(tolower(result)) +} diff --git a/R/JsonHelpers.R b/R/JsonHelpers.R deleted file mode 100644 index 5ae3339f3..000000000 --- a/R/JsonHelpers.R +++ /dev/null @@ -1,78 +0,0 @@ -# code to parse the modelDesign list into json - -addAttributes <- function(x){ - attributeValues <- attributes(x) - if('names' %in% names(attributeValues)){ - attributeValues$names <- NULL - } - - if(length(attributeValues)>0){ - names(attributeValues) <- paste0('attr_', names(attributeValues)) - x <- c(x, attributeValues) - } - return(x) -} - -prepareToJson <- function(md){ - md$restrictPlpDataSettings <- addAttributes(md$restrictPlpDataSettings) - md$populationSettings <- addAttributes(md$populationSettings) - md$preprocessSettings <- addAttributes(md$preprocessSettings) - md$executeSettings <- addAttributes(md$executeSettings) - - md$modelSettings$param <- addAttributes(md$modelSettings$param) - md$modelSettings <- addAttributes(md$modelSettings) - - if(class(md$covariateSettings) == 'covariateSettings'){ - md$covariateSettings <- list(md$covariateSettings) - } - md$covariateSettings <- lapply(md$covariateSettings, function(x) addAttributes(x)) - - if(class(md$sampleSettings) == 'sampleSettings'){ - md$sampleSettings <- 
list(md$sampleSettings) - } - md$sampleSettings <- lapply(md$sampleSettings, function(x) addAttributes(x)) - - if(class(md$featureEngineeringSettings) == 'featureEngineeringSettings'){ - md$featureEngineeringSettings<- list(md$featureEngineeringSettings) - } - md$featureEngineeringSettings <- lapply(md$featureEngineeringSettings, function(x) addAttributes(x)) - - md <- addAttributes(md) - return(md) -} - - - -extractAttributes <- function(x){ - - ind <- grep('attr_', names(x)) - - if(length(ind)>0){ - attributeValues <- x[ind] - x <- x[-ind] - names(attributeValues) <- gsub(pattern = 'attr_',replacement = '',x = names(attributeValues)) - attributeValues$names <- names(x) - attributes(x) <- attributeValues - } - - return(x) -} - -prepareToRlist <- function(md){ - md$restrictPlpDataSettings <- extractAttributes(md$restrictPlpDataSettings) - md$populationSettings <- extractAttributes(md$populationSettings) - md$preprocessSettings <- extractAttributes(md$preprocessSettings) - md$executeSettings <- extractAttributes(md$executeSettings) - - md$modelSettings$param <- extractAttributes(md$modelSettings$param) - md$modelSettings <- extractAttributes(md$modelSettings) - - md$covariateSettings <- lapply(md$covariateSettings, function(x) extractAttributes(x)) - - md$sampleSettings <- lapply(md$sampleSettings, function(x) extractAttributes(x)) - - md$featureEngineeringSettings <- lapply(md$featureEngineeringSettings, function(x) extractAttributes(x)) - - md <- extractAttributes(md) - return(md) -} \ No newline at end of file diff --git a/R/KNN.R b/R/KNN.R index 84a74db34..834b26eaa 100644 --- a/R/KNN.R +++ b/R/KNN.R @@ -48,6 +48,7 @@ setKNN <- function(k=1000, indexFolder=file.path(getwd(),'knn'), threads = 1 ){ ) attr(param, 'settings') <- list( + modelType = 'knn', modelName = 'K Nearest Neighbors' ) @@ -63,7 +64,9 @@ setKNN <- function(k=1000, indexFolder=file.path(getwd(),'knn'), threads = 1 ){ return(result) } -fitKNN <- function(trainData, param, search = 'none', 
analysisId ){ +fitKNN <- function(trainData, modelSettings, search = 'none', analysisId, ...){ + + param <- modelSettings$param if (!FeatureExtraction::isCovariateData(trainData$covariateData)){ stop("Needs correct covariateData") @@ -104,52 +107,55 @@ fitKNN <- function(trainData, param, search = 'none', analysisId ){ cohort = trainData$labels, plpModel = list( model = indexFolder, - settings = list( - modelSettings = list( - model = 'knn', - param = list( - k = k, - indexFolder = indexFolder, - threads = param$threads + trainDetails = list( + finalModelParameters = list( + k = k, + threads = param$threads ) - ) - ) ) ) + ) + prediction$evaluationType <- 'Train' result <- list( model = indexFolder, + preprocessing = list( + featureEngineering = attr(trainData, "metaData")$featureEngineering,#learned mapping + tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, #learned mapping + requireDenseMatrix = F + ), + prediction = prediction, - settings = list( - plpDataSettings = attr(trainData, "metaData")$plpDataSettings, + modelDesign = PatientLevelPrediction::createModelDesign( + targetId = attr(trainData, "metaData")$targetId, + outcomeId = attr(trainData, "metaData")$outcomeId, + restrictPlpDataSettings = attr(trainData, "metaData")$restrictPlpDataSettings, covariateSettings = attr(trainData, "metaData")$covariateSettings, - featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, - tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, - requireDenseMatrix = F, populationSettings = attr(trainData, "metaData")$populationSettings, - modelSettings = list( - model = 'KNN', - param = param, - finalModelParameters = list(), - extraSettings = attr(param, 'settings') - ), + featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings, + preprocessSettings = attr(trainData$covariateData, "metaData")$preprocessSettings, + modelSetting = 
modelSettings, splitSettings = attr(trainData, "metaData")$splitSettings, sampleSettings = attr(trainData, "metaData")$sampleSettings ), trainDetails = list( analysisId = analysisId, - cdmDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, - outcomeId = attr(trainData, "metaData")$outcomeId, - cohortId = attr(trainData, "metaData")$cohortId, + developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName, + developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, attrition = attr(trainData, "metaData")$attrition, - trainingTime = comp, + trainingTime = paste(as.character(abs(comp)), attr(comp,'units')), trainingDate = Sys.Date(), - hyperParamSearch =c() + modelName = 'KNN', + hyperParamSearch = data.frame(), + finalModelParameters = list( + k = k, + threads = param$threads + ) ), covariateImportance = variableImportance @@ -174,9 +180,9 @@ predictKnn <- function( covariates = data$covariateData$covariates, cohort = cohort[,!colnames(cohort)%in%'cohortStartDate'], indexFolder = plpModel$model, - k = plpModel$settings$modelSettings$param$k, + k = plpModel$trainDetails$finalModelParameters$k, weighted = TRUE, - threads = plpModel$settings$modelSettings$param$threads + threads = plpModel$trainDetails$finalModelParameters$threads ) # can add: threads = 1 in the future @@ -187,7 +193,7 @@ predictKnn <- function( all.x=T, fill=0) prediction$value[is.na(prediction$value)] <- 0 - attr(prediction, "metaData") <- 'binary' + attr(prediction, "metaData")$modelType <- 'binary' return(prediction) diff --git a/R/LearningCurve.R b/R/LearningCurve.R index 0ccf78aeb..863d7e778 100644 --- a/R/LearningCurve.R +++ b/R/LearningCurve.R @@ -40,7 +40,7 @@ #' \code{trainFractions}. Note, providing \code{trainEvents} will override #' your input to \code{trainFractions}. 
The format should be as follows: #' \itemize{ -#' \item{ \code{c(500, 1000, 1500) } - a list of training events} +#' \item \code{c(500, 1000, 1500) } - a list of training events #' } #' @param featureEngineeringSettings An object of \code{featureEngineeringSettings} specifying any feature engineering to be learned (using the train data) #' @param preprocessSettings An object of \code{preprocessSettings}. This setting specifies the minimum fraction of @@ -48,17 +48,12 @@ #' and whether to normalise the covariates before training #' @param modelSettings An object of class \code{modelSettings} created using one of the function: #' \itemize{ -#' \item{setLassoLogisticRegression()}{ A lasso logistic regression model} -#' \item{setGradientBoostingMachine()}{ A gradient boosting machine} -#' \item{setAdaBoost()}{ An ada boost model} -#' \item{setRandomForest()}{ A random forest model} -#' \item{setDecisionTree()}{ A decision tree model} -#' \item{setCovNN())}{ A convolutional neural network model} -#' \item{setCIReNN()}{ A recurrent neural network model} -#' \item{setMLP()}{ A neural network model} -#' \item{setDeepNN()}{ A deep neural network model} -#' \item{setKNN()}{ A KNN model} -#' +#' \item \code{setLassoLogisticRegression()} A lasso logistic regression model +#' \item \code{setGradientBoostingMachine()} A gradient boosting machine +#' \item \code{setAdaBoost()} An ada boost model +#' \item \code{setRandomForest()} A random forest model +#' \item \code{setDecisionTree()} A decision tree model +#' \item \code{setKNN()} A KNN model #' } #' @param logSettings An object of \code{logSettings} created using \code{createLogSettings} #' specifying how the logging is done @@ -98,7 +93,7 @@ createLearningCurve <- function( populationSettings = createStudyPopulationSettings(), splitSettings = createDefaultSplitSetting(), trainFractions = c(0.25, 0.50, 0.75), - trainEvents = c(500, 1000, 1500), + trainEvents = NULL, sampleSettings = createSampleSettings(), 
featureEngineeringSettings = createFeatureEngineeringSettings(), preprocessSettings = createPreprocessSettings( @@ -188,7 +183,7 @@ createLearningCurve <- function( nRuns <- length(trainFractions) settings = list( - plpData = plpData, + plpData = quote(plpData), outcomeId = outcomeId, analysisId = analysisId, populationSettings = populationSettings, @@ -238,7 +233,7 @@ createLearningCurve <- function( lcWrapper <- function(settings){ plpData <- PatientLevelPrediction::loadPlpData(settings$plpData) - settings$plpData <- plpData + settings$plpData <- quote(plpData) result <- tryCatch({do.call(runPlp, settings)}, warning = function(war) { ParallelLogger::logInfo(paste0('a warning: ', war)) @@ -282,7 +277,7 @@ getTrainFractions <- function( trainFractionsTemp <- samplesRequired/nrow(population) # filter out no. of events that would exceed the available training set size - binaryMask <- trainFractionsTemp <= (1.0 - splitSettings$testFraction) + binaryMask <- trainFractionsTemp <= (1.0 - splitSettings$test) # override any input to trainFractions with event-based training fractions trainFractions <- trainFractionsTemp[binaryMask] @@ -307,7 +302,7 @@ learningCurveHelper <- function(result, trainFractions){ result$name <- paste(result$evaluation, result$metric, sep='_') - result <- result %>% dplyr::select(.data$name, .data$value) + result <- result %>% dplyr::select("name", "value") result <- rbind( c('executionTime', executeTime), @@ -381,9 +376,9 @@ plotLearningCurve <- function(learningCurve, # create a data.frame with evalautionType, AUROC tidyLearningCurve <- learningCurve %>% dplyr::rename( - Occurrences = .data$Train_outcomeCount, - Observations = .data$Train_populationSize ) %>% - dplyr::select(.data$trainFraction, .data$Occurrences, .data$Observations, .data$Test_AUROC, .data$Train_AUROC) + Occurrences = "Train_outcomeCount", + Observations = "Train_populationSize" ) %>% + dplyr::select("trainFraction", "Occurrences", "Observations", "Test_AUROC", "Train_AUROC") 
for(i in 1:ncol(tidyLearningCurve)){ tidyLearningCurve[,i] <- as.double(as.character(tidyLearningCurve[,i])) @@ -407,9 +402,9 @@ plotLearningCurve <- function(learningCurve, # tidy up dataframe tidyLearningCurve <- learningCurve %>% dplyr::rename( - Occurrences = .data$Train_outcomeCount, - Observations = .data$Train_populationSize ) %>% - dplyr::select(.data$trainFraction, .data$Occurrences, .data$Observations, .data$Test_AUPRC, .data$Train_AUPRC) + Occurrences = "Train_outcomeCount", + Observations = "Train_populationSize" ) %>% + dplyr::select("trainFraction", "Occurrences", "Observations", "Test_AUPRC", "Train_AUPRC") for(i in 1:ncol(tidyLearningCurve)){ tidyLearningCurve[,i] <- as.double(as.character(tidyLearningCurve[,i])) @@ -432,9 +427,9 @@ plotLearningCurve <- function(learningCurve, # tidy up dataframe tidyLearningCurve <- learningCurve %>% dplyr::rename( - Occurrences = .data$Train_outcomeCount, - Observations = .data$Train_populationSize ) %>% - dplyr::select(.data$trainFraction, .data$Occurrences, .data$Observations, .data$`Test_brier score scaled`, .data$`Train_brier score scaled`) + Occurrences = "Train_outcomeCount", + Observations = "Train_populationSize" ) %>% + dplyr::select("trainFraction", "Occurrences", "Observations", "Test_brier score scaled", "Train_brier score scaled") for(i in 1:ncol(tidyLearningCurve)){ tidyLearningCurve[,i] <- as.double(as.character(tidyLearningCurve[,i])) @@ -470,8 +465,8 @@ plotLearningCurve <- function(learningCurve, # create plot object plot <- tidyLearningCurve %>% - ggplot2::ggplot(ggplot2::aes_string(x = abscissa, y= 'value', - col = "Dataset")) + + ggplot2::ggplot(ggplot2::aes(x = .data[[abscissa]], y = .data[['value']], + col = .data[["Dataset"]])) + ggplot2::geom_line() + ggplot2::coord_cartesian(ylim = yAxisRange, expand = FALSE) + ggplot2::labs(title = plotTitle, subtitle = plotSubtitle, diff --git a/R/LightGBM.R b/R/LightGBM.R new file mode 100644 index 000000000..69f1b6c6e --- /dev/null +++ b/R/LightGBM.R 
@@ -0,0 +1,236 @@ +# @file LightGBM.R +# Copyright 2023 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' Create setting for gradient boosting machine model using lightGBM (https://github.com/microsoft/LightGBM/tree/master/R-package). +#' +#' @param nthread The number of computer threads to use (how many cores do you have?) +#' @param earlyStopRound If the performance does not increase over earlyStopRound number of trees then training stops (this prevents overfitting) +#' @param numIterations Number of boosting iterations. +#' @param numLeaves This hyperparameter sets the maximum number of leaves. Increasing this parameter can lead to higher model complexity and potential overfitting. +#' @param maxDepth This hyperparameter sets the maximum depth . Increasing this parameter can also lead to higher model complexity and potential overfitting. +#' @param minDataInLeaf This hyperparameter sets the minimum number of data points that must be present in a leaf node. Increasing this parameter can help to reduce overfitting +#' @param learningRate This hyperparameter controls the step size at each iteration of the gradient descent algorithm. Lower values can lead to slower convergence but may result in better performance. +#' @param lambdaL1 This hyperparameter controls L1 regularization, which can help to reduce overfitting by encouraging sparse models. 
+#' @param lambdaL2 This hyperparameter controls L2 regularization, which can also help to reduce overfitting by discouraging large weights in the model. +#' @param scalePosWeight Controls weight of positive class in loss - useful for imbalanced classes +#' @param isUnbalance This parameter cannot be used at the same time with scalePosWeight, choose only one of them. While enabling this should increase the overall performance metric of your model, it will also result in poor estimates of the individual class probabilities. +#' @param seed An option to add a seed when training the final model +#' +#' @examples +#' model.lightgbm <- setLightGBM( +#' numLeaves = c(20, 31, 50), maxDepth = c(-1, 5, 10), +#' minDataInLeaf = c(10, 20, 30), learningRate = c(0.05, 0.1, 0.3) +#' ) +#' +#' @export +setLightGBM <- function(nthread = 20, + earlyStopRound = 25, + numIterations = c(100), + numLeaves = c(31), + maxDepth = c(5, 10), + minDataInLeaf = c(20), + learningRate = c(0.05, 0.1, 0.3), + lambdaL1 = c(0), + lambdaL2 = c(0), + scalePosWeight = 1, + isUnbalance = FALSE, + seed = sample(10000000, 1)) { + ensure_installed("lightgbm") + checkIsClass(seed, c("numeric", "integer")) + + if (length(nthread) > 1) { + stop("nthread must be length 1") + } + if (!inherits(x = seed, what = c("numeric", "integer"))) { + stop("Invalid seed") + } + if(sum(numIterations < 1) > 0){ + stop('numIterations must be greater that 0') + } + if(sum(numLeaves < 2) > 0){ + stop('numLeaves must be greater that 1') + } + if(sum(numLeaves > 131072) > 0){ + stop('numLeaves must be less that or equal 131072') + } + if(sum(learningRate <= 0) > 0){ + stop('learningRate must be greater that 0') + } + if (sum(lambdaL1 < 0) > 0){ + stop('lambdaL1 must be 0 or greater') + } + if (sum(lambdaL2 < 0) > 0){ + stop('lambdaL2 must be 0 or greater') + } + if (sum(scalePosWeight < 0) > 0){ + stop('scalePosWeight must be 0 or greater') + } + if (isUnbalance == TRUE & sum(scalePosWeight != 1) > 0){ + stop('isUnbalance cannot 
be used at the same time with scale_pos_weight != 1, choose only one of them') + } + + paramGrid <- list( + earlyStopRound = earlyStopRound, + numIterations = numIterations, + numLeaves = numLeaves, + maxDepth = maxDepth, + minDataInLeaf = minDataInLeaf, + learningRate = learningRate, + lambdaL1 = lambdaL1, + lambdaL2 = lambdaL2, + isUnbalance = isUnbalance, + scalePosWeight = scalePosWeight + ) + + param <- listCartesian(paramGrid) + + attr(param, "settings") <- list( + modelType = "LightGBM", + seed = seed[[1]], + modelName = "LightGBM", + threads = nthread[1], + varImpRFunction = "varImpLightGBM", + trainRFunction = "fitLightGBM", + predictRFunction = "predictLightGBM" + ) + + attr(param, "saveType") <- "lightgbm" + + result <- list( + fitFunction = "fitRclassifier", + param = param + ) + + class(result) <- "modelSettings" + + return(result) +} + + + +varImpLightGBM <- function(model, + covariateMap) { + varImp <- lightgbm::lgb.importance(model, percentage = T) %>% dplyr::select("Feature", "Gain") + + varImp <- data.frame( + covariateId = gsub(".*_","",varImp$Feature), + covariateValue = varImp$Gain, + included = 1 + ) + + varImp <- merge(covariateMap, varImp, by.x = "columnId", by.y = "covariateId") + varImp <- varImp %>% + dplyr::select("covariateId", "covariateValue", "included") + + return(varImp) +} + +predictLightGBM <- function(plpModel, + data, + cohort) { + if (inherits(data, "plpData")) { + # convert + matrixObjects <- toSparseM( + plpData = data, + cohort = cohort, + map = plpModel$covariateImportance %>% + dplyr::select("columnId", "covariateId") + ) + + # use the include?? 
+ + newData <- matrixObjects$dataMatrix + cohort <- matrixObjects$labels + } else { + newData <- data + } + + if (inherits(plpModel, "plpModel")) { + model <- plpModel$model + } else { + model <- plpModel + } + + pred <- data.frame(value = stats::predict(model, newData)) + prediction <- cohort + prediction$value <- pred$value + + prediction <- prediction %>% + dplyr::select(-"rowId") %>% + dplyr::rename(rowId = "originalRowId") + + attr(prediction, "metaData") <- list(modelType = attr(plpModel, "modelType")) + + return(prediction) +} + +fitLightGBM <- function(dataMatrix, + labels, + hyperParameters, + settings) { + if (!is.null(hyperParameters$earlyStopRound)) { + trainInd <- sample(nrow(dataMatrix), nrow(dataMatrix) * 0.9) + train <- lightgbm::lgb.Dataset( + data = dataMatrix[trainInd, , drop = F], + label = labels$outcomeCount[trainInd] + ) + test <- lightgbm::lgb.Dataset( + data = dataMatrix[-trainInd, , drop = F], + label = labels$outcomeCount[-trainInd] + ) + watchlist <- list(train = train, test = test) + } else { + train <- lightgbm::lgb.Dataset( + data = dataMatrix, + label = labels$outcomeCount, + free_raw_data = FALSE, + ) + watchlist <- list() + } + + outcomes <- sum(labels$outcomeCount > 0) + N <- nrow(labels) + outcomeProportion <- outcomes / N + set.seed(settings$seed) + model <- lightgbm::lgb.train( + data = train, + params = list( + objective = "binary", + boost = "gbdt", + metric = "auc", + num_iterations = hyperParameters$numIterations, + num_leaves = hyperParameters$numLeaves, + max_depth = hyperParameters$maxDepth, + learning_rate = hyperParameters$learningRate, + feature_pre_filter=FALSE, + min_data_in_leaf = hyperParameters$minDataInLeaf, + scale_pos_weight = hyperParameters$scalePosWeight, + lambda_l1 = hyperParameters$lambdaL1, + lambda_l2 = hyperParameters$lambdaL2, + seed = settings$seed, + is_unbalance = hyperParameters$isUnbalance, + max_bin = 255, + num_threads = settings$threads + ), + verbose = 1, + early_stopping_rounds = 
hyperParameters$earlyStopRound, + valids = watchlist + # categorical_feature = 'auto' # future work + ) + + return(model) +} diff --git a/R/Logging.R b/R/Logging.R index 8d1ca16f3..81cd54225 100644 --- a/R/Logging.R +++ b/R/Logging.R @@ -22,12 +22,12 @@ #' #' @param verbosity Sets the level of the verbosity. If the log level is at or higher in priority than the logger threshold, a message will print. The levels are: #' \itemize{ -#' \item{DEBUG}{Highest verbosity showing all debug statements} -#' \item{TRACE}{Showing information about start and end of steps} -#' \item{INFO}{Show informative information (Default)} -#' \item{WARN}{Show warning messages} -#' \item{ERROR}{Show error messages} -#' \item{FATAL}{Be silent except for fatal errors} +#' \item DEBUG Highest verbosity showing all debug statements +#' \item TRACE Showing information about start and end of steps +#' \item INFO Show informative information (Default) +#' \item WARN Show warning messages +#' \item ERROR Show error messages +#' \item FATAL Be silent except for fatal errors #' } #' @param timeStamp If TRUE a timestamp will be added to each logging statement. Automatically switched on for TRACE level. 
#' @param logName A string reference for the logger diff --git a/R/ParamChecks.R b/R/ParamChecks.R index 1fa5772de..542e7e803 100644 --- a/R/ParamChecks.R +++ b/R/ParamChecks.R @@ -74,7 +74,7 @@ checkNotNull <- function(parameter) { checkIsClass<- function(parameter,classes) { name = deparse(substitute(parameter)) - if (!class(parameter)%in%classes) { + if (!inherits(x = parameter, what = classes)) { ParallelLogger::logError(paste0(name, ' should be of class:', classes)) stop(paste0(name, ' is wrong class')) } diff --git a/R/PatientLevelPrediction.R b/R/PatientLevelPrediction.R index 4f78d4a0f..848c42705 100644 --- a/R/PatientLevelPrediction.R +++ b/R/PatientLevelPrediction.R @@ -20,12 +20,11 @@ #' #' @description A package for running predictions using data in the OMOP CDM #' -#' @docType package #' @name PatientLevelPrediction +#' @keywords internal #' @importFrom dplyr %>% #' @importFrom rlang .data -#' @import FeatureExtraction -NULL +"_PACKAGE" #' A simulation profile #' @docType data diff --git a/R/Plotting.R b/R/Plotting.R index 1fb1dfbf1..f2fe6bf0d 100644 --- a/R/Plotting.R +++ b/R/Plotting.R @@ -62,7 +62,7 @@ outcomeSurvivalPlot <- function( if(missing(outcomeId)){ stop('outcomeId missing') } - if(class(plpData)!='plpData'){ + if(!inherits(x = plpData, what = 'plpData')){ stop('Incorrect plpData object') } if(!outcomeId%in%unique(plpData$outcomes$outcomeId)){ @@ -83,7 +83,7 @@ outcomeSurvivalPlot <- function( population$daysToEvent[is.na(population$daysToEvent)] <- population$survivalTime[is.na(population$daysToEvent)] survivalFit <- survival::survfit( - survival::Surv(daysToEvent, outcomeCount)~cohortId, + survival::Surv(daysToEvent, outcomeCount)~targetId, #riskDecile, population, conf.int = TRUE @@ -293,11 +293,11 @@ plotSparseRoc <- function( plots <- list() length(plots) <- length(evalTypes) - for(i in 1:length(evalTypes)){ + for (i in 1:length(evalTypes)){ evalType <- evalTypes[i] x <- plpResult$performanceEvaluation$thresholdSummary %>% 
dplyr::filter(.data[[typeColumn]] == evalType) %>% - dplyr::select(.data$falsePositiveRate, .data$sensitivity) + dplyr::select("falsePositiveRate", "sensitivity") #x <- thresholdSummary[,c('falsePositiveRate','sensitivity')] x <- x[order(x$falsePositiveRate, x$sensitivity),] @@ -316,14 +316,14 @@ plotSparseRoc <- function( ) ) + ggplot2::geom_polygon(fill = "blue", alpha = 0.2) + - ggplot2::geom_line(size=1) + + ggplot2::geom_line(linewidth = 1) + ggplot2::geom_abline(intercept = 0, slope = 1,linetype = 2) + - ggplot2::scale_x_continuous("1 - specificity", limits=c(0,1)) + - ggplot2::scale_y_continuous("Sensitivity", limits=c(0,1)) + + ggplot2::scale_x_continuous("1 - specificity", limits = c(0,1)) + + ggplot2::scale_y_continuous("Sensitivity", limits = c(0,1)) + ggplot2::ggtitle(evalType) } - plot <- gridExtra::marrangeGrob(plots, nrow=length(plots), ncol=1) + plot <- gridExtra::marrangeGrob(plots, nrow =length(plots), ncol = 1) if (!is.null(saveLocation)){ if(!dir.exists(saveLocation)){ @@ -370,11 +370,11 @@ plotPredictedPDF <- function( x <- plpResult$performanceEvaluation$thresholdSummary %>% dplyr::filter(.data[[typeColumn]] == evalType) %>% dplyr::select( - .data$predictionThreshold, - .data$truePositiveCount, - .data$trueNegativeCount, - .data$falsePositiveCount, - .data$falseNegativeCount + "predictionThreshold", + "truePositiveCount", + "trueNegativeCount", + "falsePositiveCount", + "falseNegativeCount" ) x<- x[order(x$predictionThreshold,-x$truePositiveCount, -x$falsePositiveCount),] @@ -460,11 +460,11 @@ plotPreferencePDF <- function( x <- plpResult$performanceEvaluation$thresholdSummary %>% dplyr::filter(.data[[typeColumn]] == evalType) %>% dplyr::select( - .data$preferenceThreshold, - .data$truePositiveCount, - .data$trueNegativeCount, - .data$falsePositiveCount, - .data$falseNegativeCount + "preferenceThreshold", + "truePositiveCount", + "trueNegativeCount", + "falsePositiveCount", + "falseNegativeCount" ) x<- 
x[order(x$preferenceThreshold,-x$truePositiveCount, x$trueNegativeCount),] @@ -551,27 +551,27 @@ plotPrecisionRecall <- function( N <- max(plpResult$performanceEvaluation$thresholdSummary %>% dplyr::filter(.data[[typeColumn]] == evalType) %>% - dplyr::select(.data$falseCount) %>% + dplyr::select("falseCount") %>% dplyr::pull(), na.rm = T) O <- max(plpResult$performanceEvaluation$thresholdSummary %>% dplyr::filter(.data[[typeColumn]] == evalType) %>% - dplyr::select(.data$trueCount) %>% + dplyr::select("trueCount") %>% dplyr::pull(), na.rm = T) inc <- O/(O + N) x <- plpResult$performanceEvaluation$thresholdSummary %>% dplyr::filter(.data[[typeColumn]] == evalType) %>% - dplyr::select(.data$positivePredictiveValue, .data$sensitivity) + dplyr::select("positivePredictiveValue", "sensitivity") plots[[i]] <- ggplot2::ggplot(x, ggplot2::aes(.data$sensitivity, .data$positivePredictiveValue)) + - ggplot2::geom_line(size=1) + + ggplot2::geom_line(linewidth=1) + ggplot2::scale_x_continuous("Recall")+#, limits=c(0,1)) + ggplot2::scale_y_continuous("Precision") + #, limits=c(0,1)) ggplot2::geom_hline(yintercept = inc, linetype="dashed", - color = "red", size=1) + + color = "red", linewidth = 1) + ggplot2::ggtitle(evalType) } @@ -622,7 +622,7 @@ plotF1Measure <- function( x <- plpResult$performanceEvaluation$thresholdSummary %>% dplyr::filter(.data[[typeColumn]] == evalType) %>% - dplyr::select(.data$predictionThreshold, .data$f1Score) + dplyr::select("predictionThreshold", "f1Score") if(sum(is.nan(x$f1Score))>0){ x <- x[!is.nan(x$f1Score),] @@ -630,8 +630,8 @@ plotF1Measure <- function( } plots[[i]] <- ggplot2::ggplot(x, ggplot2::aes(.data$predictionThreshold, .data$f1Score)) + - ggplot2::geom_line(size=1) + - ggplot2::geom_point(size=1) + + ggplot2::geom_line(linewidth = 1) + + ggplot2::geom_point(size = 1) + ggplot2::scale_x_continuous("predictionThreshold")+#, limits=c(0,1)) + ggplot2::scale_y_continuous("F1Score") +#, limits=c(0,1)) ggplot2::ggtitle(evalType) @@ -688,11 
+688,11 @@ plotDemographicSummary <- function( x <- plpResult$performanceEvaluation$demographicSummary %>% dplyr::filter(.data[[typeColumn]] == evalType) %>% dplyr::select( - .data$ageGroup, - .data$genGroup, - .data$averagePredictedProbability, - .data$PersonCountAtRisk, - .data$PersonCountWithOutcome + "ageGroup", + "genGroup", + "averagePredictedProbability", + "PersonCountAtRisk", + "PersonCountWithOutcome" ) # remove -1 values: @@ -729,10 +729,10 @@ plotDemographicSummary <- function( ci <- plpResult$performanceEvaluation$demographicSummary %>% dplyr::filter(.data[[typeColumn]] == evalType) %>% dplyr::select( - .data$ageGroup, - .data$genGroup, - .data$averagePredictedProbability, - .data$StDevPredictedProbability + "ageGroup", + "genGroup", + "averagePredictedProbability", + "StDevPredictedProbability" ) ci$StDevPredictedProbability[is.na(ci$StDevPredictedProbability)] <- 1 @@ -769,7 +769,7 @@ plotDemographicSummary <- function( guide = ggplot2::guide_legend(title = NULL), labels = c("Expected", "Observed")) + - ggplot2::guides(linetype=FALSE) + + ggplot2::guides( linetype = "none") + # change from FALSE due to warning ggplot2::ggtitle(evalType) } @@ -821,7 +821,7 @@ plotSparseCalibration <- function( evalType <- evalTypes[i] x <- plpResult$performanceEvaluation$calibrationSummary %>% dplyr::filter(.data[[typeColumn]] == evalType) %>% - dplyr::select(.data$averagePredictedProbability, .data$observedIncidence) + dplyr::select("averagePredictedProbability", "observedIncidence") maxVal <- max(x$averagePredictedProbability,x$observedIncidence) model <- stats::lm(observedIncidence~averagePredictedProbability, data=x) @@ -844,9 +844,9 @@ plotSparseCalibration <- function( )) + ggplot2::geom_ribbon(ggplot2::aes(ymin=.data$lci,ymax=.data$uci, x=x), fill="blue", alpha=0.2) + - ggplot2::geom_point(size=1, color='darkblue') + + ggplot2::geom_point(size = 1, color='darkblue') + ggplot2::coord_cartesian(ylim = c(0, maxVal), xlim =c(0,maxVal)) + - 
ggplot2::geom_abline(intercept = 0, slope = 1, linetype = 2, size=1, + ggplot2::geom_abline(intercept = 0, slope = 1, linetype = 2, linewidth = 1, show.legend = TRUE) + ggplot2::geom_abline(intercept = res['Intercept'], slope = res['Gradient'], linetype = 1,show.legend = TRUE, @@ -903,7 +903,7 @@ plotSparseCalibration2 <- function( evalType <- evalTypes[i] x <- plpResult$performanceEvaluation$calibrationSummary %>% dplyr::filter(.data[[typeColumn]] == evalType) %>% - dplyr::select(.data$averagePredictedProbability, .data$observedIncidence, .data$PersonCountAtRisk) + dplyr::select("averagePredictedProbability", "observedIncidence", "PersonCountAtRisk") cis <- apply(x, 1, function(x) stats::binom.test(round(x[2]*x[3]), x[3], alternative = c("two.sided"), conf.level = 0.95)$conf.int) x$lci <- cis[1,] @@ -919,7 +919,7 @@ plotSparseCalibration2 <- function( ggplot2::geom_point(size=2, color='black') + ggplot2::geom_errorbar(limits) + ggplot2::geom_line(colour='darkgrey') + - ggplot2::geom_abline(intercept = 0, slope = 1, linetype = 5, size=0.4, + ggplot2::geom_abline(intercept = 0, slope = 1, linetype = 5, linewidth=0.4, show.legend = TRUE) + ggplot2::scale_x_continuous("Average Predicted Probability") + ggplot2::scale_y_continuous("Observed Fraction With Outcome") + @@ -993,7 +993,7 @@ plotSmoothCalibration <- function(plpResult, if('prediction'%in%names(plpResult)) { x <- plpResult$performanceEvaluation$calibrationSummary %>% dplyr::filter(.data[[typeColumn]] == evalType) %>% - dplyr::select(.data$averagePredictedProbability, .data$observedIncidence) + dplyr::select("averagePredictedProbability", "observedIncidence") prediction <- plpResult$prediction %>% dplyr::filter(.data$evaluationType == evalType) @@ -1042,39 +1042,28 @@ plotSmoothCalibration <- function(plpResult, } # loess smoothData <- data.frame(y, p) - fit <- stats::loess(y ~ p, degree = 2) - predictedFit <- stats::predict(fit, se = TRUE) - smoothData <- smoothData %>% - dplyr::mutate( - calibration = 
predictedFit$fit, - se = predictedFit$se, - lci = .data$calibration - stats::qt(.975, predictedFit$df) * .data$se, - uci = .data$calibration + stats::qt(.975, predictedFit$df) * .data$se - ) - - xlim <- ylim <- c(0, 1) + # xlim <- ylim <- c(0, 1) smoothPlot <- plotSmoothCalibrationLoess(data = smoothData, span = span) + ggplot2::coord_cartesian( - xlim = xlim, - ylim = ylim + xlim = c(0, maxes), + ylim = c(0, maxes) ) } else { # Restricted cubic splines smoothData <- data.frame(y, p) - smoothPlot <- plotSmoothCalibrationRcs(data = smoothData, nKnots = nKnots) + smoothPlot <- plotSmoothCalibrationRcs(data = smoothData, numberOfKnots = nKnots) if (is.character(smoothPlot)) { plots[[i]] <- smoothPlot failedEvalType[evalTypes[i]] <- TRUE next } - xlim <- ylim <- c(0, 1) + smoothPlot <- smoothPlot + ggplot2::coord_cartesian( - xlim = xlim, - ylim = ylim + xlim = c(0, maxes), + ylim = c(0, maxes) ) - } # construct the plot grid if (scatter) { @@ -1112,7 +1101,7 @@ plotSmoothCalibration <- function(plpResult, strip.text = ggplot2::element_blank() ) + ggplot2::labs(x = "Predicted Probability") + - ggplot2::coord_cartesian(xlim = xlim) + ggplot2::coord_cartesian(xlim = c(0, maxes)) } else { # use calibrationSummary @@ -1259,15 +1248,14 @@ plotSmoothCalibrationLoess <- function(data, span = 0.75) { fill = "blue", alpha = 0.2 ) + - ggplot2::geom_segment( - ggplot2::aes( - x = 0, - xend = 1, - y = 0, - yend = 1, - color = "Ideal", - linetype = "Ideal" - ) + ggplot2::annotate( + geom = "segment", + x = 0, + xend = 1, + y = 0, + yend = 1, + color = "red", + linetype = "dashed" ) + ggplot2::scale_linetype_manual( name = "Models", @@ -1291,36 +1279,92 @@ plotSmoothCalibrationLoess <- function(data, span = 0.75) { return(plot) } -plotSmoothCalibrationRcs <- function(data, nKnots) { +plotSmoothCalibrationRcs <- function(data, numberOfKnots) { + data <- data %>% + dplyr::filter(!is.na(.data$y) & !is.na(.data$p)) p <- data$p - for (k in nKnots:3) { - formSmooth <- paste0('y ~ 
rms::rcs(p, ', k, ')') - smoothFit <- suppressWarnings(rms::lrm(stats::as.formula(formSmooth), data = data, x = TRUE, y = TRUE)) - smoothFitFail <- smoothFit$fail - if (smoothFitFail) { + + .defineKnots <- function(predictedProbabilities, numberOfKnots) { + if (numberOfKnots == 3) { + lowestQuantile <- .1 + highestQuantile <- .9 + } else if (numberOfKnots > 3 & numberOfKnots <= 6) { + lowestQuantile <- .05 + highestQuantile <- .95 + } else if (numberOfKnots == 7) { + lowestQuantile <- .025 + highestQuantile <- .975 + } else { + # use mgcv defaults + return(numberOfKnots) + } + knotQuantiles <- seq( + lowestQuantile, + highestQuantile, + length.out = numberOfKnots + ) + + knotLocation <- stats::quantile( + x = predictedProbabilities, + probs = knotQuantiles, + na.rm = TRUE + ) + + return(knotLocation) + } + + for (k in numberOfKnots:3) { + if (k > 7) { + smoothFit <- tryCatch( + expr = { + mgcv::gam( + y ~ s(p, bs = 'cr', k = k, m = 2), + data = data, + family = stats::binomial() + ) + }, + error = function(e) { + return("Failed") + } + ) + } else { + smoothFit <- tryCatch( + expr = { + mgcv::gam( + y ~ s(p, bs = 'cr', k = k, m = 2), + data = data, + knots = list(p = .defineKnots(p, k)), + family = stats::binomial() + ) + }, + error = function(e) { + return("Failed") + } + ) + } + if (is.character(smoothFit)) { if (k > 3) { ParallelLogger::logInfo(paste0("Setting number of Knots to ", k, " led to estimation problems. 
Switching to nKnots = ", k-1)) } else { ParallelLogger::logInfo(paste0('Unable to fit model')) - plot <- "Failed" } + } else { + break } } - # If the fit failed for all nKnots return "Failed" - if (smoothFitFail) { - return(plot) + if (is.character(smoothFit)){ + return("Failed") } - xRange <- seq(0, p[length(p)], length.out = 1000) - pred <- stats::predict(smoothFit, xRange, se.fit = T, type = "lp") - predXRange <- stats::plogis(pred$linear.predictors) - ciSmooth <- data.frame( - lci = stats::plogis(pred$linear.predictors - 1.96 * pred$se.fit), - uci = stats::plogis(pred$linear.predictors + 1.96 * pred$se.fit) + xRange <- seq(min(p), max(p), length.out = 1e3) + predictWithSe <- stats::predict(smoothFit, newdata = data.frame(p = xRange), se.fit = TRUE) + smoothData <- data.frame( + xRange = xRange, + predXRange = stats::plogis(predictWithSe$fit), + lci = stats::plogis(predictWithSe$fit - 1.96 * predictWithSe$se.fit), + uci = stats::plogis(predictWithSe$fit + 1.96 * predictWithSe$se.fit) ) - - smoothData <- cbind(xRange, predXRange, ciSmooth) plot <- ggplot2::ggplot( data = smoothData, ggplot2::aes( @@ -1340,17 +1384,17 @@ plotSmoothCalibrationRcs <- function(data, nKnots) { ymax = .data$uci ), fill = "blue", - alpha = 0.2 + alpha = 0.2, + show.legend = FALSE ) + - ggplot2::geom_segment( - ggplot2::aes( - x = 0, - xend = 1, - y = 0, - yend = 1, + ggplot2::geom_abline( + mapping = ggplot2::aes( + slope = 1, + intercept = 0, color = "Ideal", linetype = "Ideal" - ) + ), + show.legend = FALSE ) + ggplot2::scale_color_manual( name = "Models", @@ -1361,7 +1405,7 @@ plotSmoothCalibrationRcs <- function(data, nKnots) { values = c(rcs = "solid", Ideal = "dashed") ) + ggplot2::labs(x = "", y = "Observed Probability") - + return(plot) } @@ -1397,43 +1441,44 @@ plotPredictionDistribution <- function( plots <- list() length(plots) <- length(evalTypes) - for(i in 1:length(evalTypes)){ + for (i in 1:length(evalTypes)) { evalType <- evalTypes[i] x <- 
plpResult$performanceEvaluation$predictionDistribution %>% dplyr::filter(.data[[typeColumn]] == evalType) - non05 <- x$P05PredictedProbability[x$class==0] - non95 <- x$P95PredictedProbability[x$class==0] - one05 <- x$P05PredictedProbability[x$class==1] - one95 <- x$P95PredictedProbability[x$class==1] - - plots[[i]] <- ggplot2::ggplot(x, ggplot2::aes(x=as.factor(.data$class), - ymin=.data$MinPredictedProbability, - lower=.data$P25PredictedProbability, - middle=.data$MedianPredictedProbability, - upper=.data$P75PredictedProbability, - ymax=.data$MaxPredictedProbability, - color=as.factor(.data$class))) + + non05 <- x$P05PredictedProbability[x$class == 0] + non95 <- x$P95PredictedProbability[x$class == 0] + one05 <- x$P05PredictedProbability[x$class == 1] + one95 <- x$P95PredictedProbability[x$class == 1] + + plots[[i]] <- ggplot2::ggplot(x, + ggplot2::aes(x = as.factor(class), + ymin = .data$MinPredictedProbability, + lower = .data$P25PredictedProbability, + middle = .data$MedianPredictedProbability, + upper = .data$P75PredictedProbability, + ymax = .data$MaxPredictedProbability, + color = as.factor(.data$class))) + ggplot2::coord_flip() + - ggplot2::geom_boxplot(stat="identity") + + ggplot2::geom_boxplot(stat = "identity") + ggplot2::scale_x_discrete("Class") + ggplot2::scale_y_continuous("Predicted Probability") + - ggplot2::theme(legend.position="none") + - ggplot2::geom_segment(ggplot2::aes(x = 0.9, y = non05, - xend = 1.1, yend = non05), color='red') + - ggplot2::geom_segment(ggplot2::aes(x = 0.9, y = non95, - xend = 1.1, yend = non95), color='red') + - ggplot2::geom_segment(ggplot2::aes(x = 1.9, y = one05, - xend = 2.1, yend = one05)) + - ggplot2::geom_segment(ggplot2::aes(x = 1.9, y = one95, - xend = 2.1, yend = one95)) + - ggplot2::ggtitle(evalType) + ggplot2::theme(legend.position = "none") + + ggplot2::annotate("segment", x = 0.9, xend = 1.1, y = non05, yend = non05, + color = "red") + + ggplot2::annotate("segment", x = 0.9, xend = 1.1, y = non95, yend = 
non95, + color = "red") + + ggplot2::annotate("segment", x = 1.9, xend = 2.1, y = one05, yend = one05, + color = "#00BFC4") + + ggplot2::annotate("segment", x = 1.9, xend = 2.1, y = one95, yend = one95, + color = "#00BFC4") + + ggplot2::ggtitle(evalType) } - plot <- gridExtra::marrangeGrob(plots, nrow=length(plots), ncol=1) + plot <- gridExtra::marrangeGrob(plots, nrow = length(plots), ncol = 1) - if (!is.null(saveLocation)){ - if(!dir.exists(saveLocation)){ + if (!is.null(saveLocation)) { + if (!dir.exists(saveLocation)) { dir.create(saveLocation, recursive = T) } ggplot2::ggsave(file.path(saveLocation, fileName), plot, width = 5, height = 4.5, dpi = 400) diff --git a/R/PopulationSettings.R b/R/PopulationSettings.R index 63d9685be..098a784f7 100644 --- a/R/PopulationSettings.R +++ b/R/PopulationSettings.R @@ -134,7 +134,8 @@ createStudyPopulationSettings <- function( startAnchor = startAnchor, riskWindowEnd = riskWindowEnd, endAnchor = endAnchor, - restrictTarToCohortEnd = restrictTarToCohortEnd) + restrictTarToCohortEnd = restrictTarToCohortEnd + ) class(result) <- 'populationSettings' return(result) @@ -190,7 +191,7 @@ createStudyPopulation <- function( restrictTarToCohortEnd <- populationSettings$restrictTarToCohortEnd # parameter checks - if(!class(plpData)%in%c('plpData')){ + if(!inherits(x = plpData, what = c('plpData'))){ ParallelLogger::logError('Check plpData format') stop('Wrong plpData input') } @@ -214,13 +215,16 @@ createStudyPopulation <- function( if (is.null(population)) { population <- plpData$cohorts + } else { + population <- plpData$cohorts %>% + dplyr::filter(.data$rowId %in% (population %>% dplyr::pull(.data$rowId))) } - # save the metadata (should have the cohortId, outcomeId, plpDataSettings and population settings) + # save the metadata (should have the ?targetId, outcomeId, plpDataSettings and population settings) metaData <- attr(population, "metaData") - metaData$plpDataSettings <- plpData$metaData$restrictPlpDataSettings + 
metaData$restrictPlpDataSettings <- plpData$metaData$restrictPlpDataSettings metaData$outcomeId <- outcomeId - metaData$populationSettings <- populationSettings + metaData$populationSettings <- populationSettings # this will overwrite an existing setting # set the existing attrition if(is.null(metaData$attrition)){ @@ -258,10 +262,10 @@ createStudyPopulation <- function( # get the outcomes during TAR - outcomeTAR <- population %>% - dplyr::inner_join(plpData$outcomes, by ='rowId') %>% - dplyr::filter(.data$outcomeId == get('oId')) %>% - dplyr::select(.data$rowId, .data$daysToEvent, .data$tarStart, .data$tarEnd) %>% + outcomeTAR <- plpData$outcomes %>% + dplyr::filter(.data$outcomeId == get("oId")) %>% + dplyr::inner_join(population, by = "rowId") %>% + dplyr::select("rowId", "daysToEvent", "tarStart", "tarEnd") %>% dplyr::filter(.data$daysToEvent >= .data$tarStart & .data$daysToEvent <= .data$tarEnd) # prevent warnings when no results left @@ -270,11 +274,11 @@ createStudyPopulation <- function( dplyr::group_by(.data$rowId) %>% dplyr::summarise(first = min(.data$daysToEvent), ocount = length(unique(.data$daysToEvent))) %>% - dplyr::select(.data$rowId, .data$first, .data$ocount) + dplyr::select("rowId", "first", "ocount") } else { outcomeTAR <- outcomeTAR %>% dplyr::mutate(first = 0, ocount = 0) %>% - dplyr::select(.data$rowId, .data$first, .data$ocount) + dplyr::select("rowId", "first", "ocount") } population <- population %>% @@ -293,10 +297,12 @@ createStudyPopulation <- function( if (firstExposureOnly) { ParallelLogger::logTrace(paste("Restricting to first exposure")) - population <- population %>% - dplyr::arrange(.data$subjectId,.data$cohortStartDate) %>% - dplyr::group_by(.data$subjectId) %>% - dplyr::filter(dplyr::row_number(.data$subjectId)==1) + if (nrow(population) > dplyr::n_distinct(population$subjectId)) { + population <- population %>% + dplyr::arrange(.data$subjectId,.data$cohortStartDate) %>% + dplyr::group_by(.data$subjectId) %>% + 
dplyr::filter(dplyr::row_number(.data$subjectId)==1) + } attrRow <- population %>% dplyr::group_by() %>% dplyr::summarise(outcomeId = get('oId'), @@ -328,17 +334,17 @@ createStudyPopulation <- function( ParallelLogger::logTrace("Removing subjects with prior outcomes (if any)") # get the outcomes during TAR - outcomeBefore <- population %>% - dplyr::inner_join(plpData$outcomes, by ='rowId') %>% + outcomeBefore <- plpData$outcomes %>% dplyr::filter(outcomeId == get('oId')) %>% - dplyr::select(.data$rowId, .data$daysToEvent, .data$tarStart) %>% + dplyr::inner_join(population, by = 'rowId') %>% + dplyr::select("rowId", "daysToEvent", "tarStart") %>% dplyr::filter(.data$daysToEvent < .data$tarStart & .data$daysToEvent > -get('priorOutcomeLookback') ) if(nrow(as.data.frame(outcomeBefore))>0){ outcomeBefore %>% dplyr::group_by(.data$rowId) %>% dplyr::summarise(first = min(.data$daysToEvent)) %>% - dplyr::select(.data$rowId) + dplyr::select("rowId") } population <- population %>% @@ -418,9 +424,9 @@ createStudyPopulation <- function( dplyr::mutate(timeAtRisk = .data$tarEnd - .data$tarStart + 1 , survivalTime = ifelse(.data$outcomeCount == 0, .data$tarEnd -.data$tarStart + 1, .data$first - .data$tarStart + 1), daysToEvent = .data$first) %>% - dplyr::select(.data$rowId, .data$subjectId, .data$cohortId, .data$cohortStartDate, .data$daysFromObsStart, - .data$daysToCohortEnd, .data$daysToObsEnd, .data$ageYear, .data$gender, - .data$outcomeCount, .data$timeAtRisk, .data$daysToEvent, .data$survivalTime) + dplyr::select("rowId", "subjectId", "targetId", "cohortStartDate", "daysFromObsStart", + "daysToCohortEnd", "daysToObsEnd", "ageYear", "gender", + "outcomeCount", "timeAtRisk", "daysToEvent", "survivalTime") # check outcome still there if(sum(!is.na(population$daysToEvent))==0){ diff --git a/R/Predict.R b/R/Predict.R index 0c17b88a2..14eff8787 100644 --- a/R/Predict.R +++ b/R/Predict.R @@ -44,24 +44,34 @@ predictPlp <- function(plpModel, plpData, population, timepoint){ # do 
feature engineering/selection - plpData$covariateData <- do.call( - applyFeatureengineering, - list( - covariateData = plpData$covariateData, - settings = plpModel$settings$featureEngineering + if(!is.null(plpModel$preprocessing$featureEngineering)){ + plpData <- do.call( + applyFeatureengineering, + list( + plpData = plpData, + settings = plpModel$preprocessing$featureEngineering + ) ) - ) + featureEngineering <- T + } else{ + featureEngineering <- F + } ParallelLogger::logTrace('did FE') - # do preprocessing - plpData$covariateData <- do.call( - applyTidyCovariateData, - list( - covariateData = plpData$covariateData, - preprocessSettings = plpModel$settings$tidyCovariates + if(!is.null(plpModel$preprocessing$tidyCovariates)){ + # do preprocessing + plpData$covariateData <- do.call( + applyTidyCovariateData, + list( + covariateData = plpData$covariateData, + preprocessSettings = plpModel$preprocessing$tidyCovariates + ) ) - ) + tidyCovariates <- T + } else{ + tidyCovariates <- F + } ParallelLogger::logTrace('did tidy') @@ -90,10 +100,14 @@ predictPlp <- function(plpModel, plpData, population, timepoint){ # add metaData metaData$modelType <- attr(plpModel, 'modelType') #"binary", - metaData$cohortId <- attr(population,'metaData')$cohortId + metaData$targetId <- attr(population,'metaData')$targetId metaData$outcomeId <- attr(population,'metaData')$outcomeId metaData$timepoint <- timepoint + # added information about running preprocessing/FE + metaData$tidyCovariates <- tidyCovariates + metaData$featureEngineering <- featureEngineering + attr(prediction, "metaData") <- metaData return(prediction) } @@ -101,7 +115,7 @@ predictPlp <- function(plpModel, plpData, population, timepoint){ applyFeatureengineering <- function( - covariateData, + plpData, settings ){ @@ -112,12 +126,12 @@ applyFeatureengineering <- function( # add code for implementing the feature engineering for(set in settings){ - set$settings$trainData <- covariateData - covariateData <- 
do.call(eval(parse(text = set$funct)), set$settings) + set$settings$trainData <- plpData + plpData <- do.call(eval(parse(text = set$funct)), set$settings) } # dont do anything for now - return(covariateData) + return(plpData) } @@ -154,10 +168,10 @@ applyTidyCovariateData <- function( if(!is.null(maxs)){ if('bins'%in%colnames(maxs)){ - covariateData$maxes <- tibble::as_tibble(maxs) %>% dplyr::rename(covariateId = .data$bins) %>% - dplyr::rename(maxValue = .data$maxs) + covariateData$maxes <- dplyr::as_tibble(maxs) %>% dplyr::rename(covariateId = "bins") %>% + dplyr::rename(maxValue = "maxs") } else{ - covariateData$maxes <- maxs #tibble::as_tibble(maxs) %>% dplyr::rename(covariateId = bins) + covariateData$maxes <- maxs } on.exit(covariateData$maxes <- NULL, add = TRUE) @@ -170,8 +184,8 @@ applyTidyCovariateData <- function( dplyr::inner_join(covariateData$includeCovariates, by='covariateId') %>% # added as join dplyr::inner_join(covariateData$maxes, by = 'covariateId') %>% dplyr::mutate(value = 1.0*.data$covariateValue/.data$maxValue) %>% - dplyr::select(- .data$covariateValue) %>% - dplyr::rename(covariateValue = .data$value) + dplyr::select(-"covariateValue") %>% + dplyr::rename(covariateValue = "value") } else{ newCovariateData$covariates <- covariateData$covariates %>% dplyr::inner_join(covariateData$includeCovariates, by='covariateId') diff --git a/R/PreprocessingData.R b/R/PreprocessingData.R index d0e78b485..834d27a85 100644 --- a/R/PreprocessingData.R +++ b/R/PreprocessingData.R @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -#' Create the settings for preprocessing the trainData using \code{ }. +#' Create the settings for preprocessing the trainData. 
#' #' @details #' Returns an object of class \code{preprocessingSettings} that specifies how to preprocess the training data @@ -66,6 +66,7 @@ preprocessData <- function (covariateData, preprocessSettings){ metaData <- attr(covariateData, "metaData") + preprocessSettingsInput <- preprocessSettings # saving this before adding covariateData checkIsClass(covariateData, c("CovariateData")) checkIsClass(preprocessSettings, c("preprocessSettings")) @@ -77,15 +78,17 @@ preprocessData <- function (covariateData, preprocessSettings$covariateData <- covariateData covariateData <- do.call(FeatureExtraction::tidyCovariateData, preprocessSettings) - #update covariateRed - removed <- unique( + #update covariateRef + removed <- unique(c( attr(covariateData, "metaData")$deletedInfrequentCovariateIds, attr(covariateData, "metaData")$deletedRedundantCovariateIds ) + ) covariateData$covariateRef <- covariateData$covariateRef %>% dplyr::filter(!.data$covariateId %in% removed) metaData$tidyCovariateDataSettings <- attr(covariateData, "metaData") + metaData$preprocessSettings <- preprocessSettingsInput attr(covariateData, "metaData") <- metaData return(covariateData) diff --git a/R/RClassifier.R b/R/RClassifier.R index f08ed9f89..a74e2358b 100644 --- a/R/RClassifier.R +++ b/R/RClassifier.R @@ -1,5 +1,12 @@ # this is a generic wrapper for training models using classifiers in R -fitRclassifier <- function(trainData, param, search = 'grid', analysisId){ +fitRclassifier <- function( + trainData, + modelSettings, + search = 'grid', + analysisId, + ...){ + + param <- modelSettings$param if (!FeatureExtraction::isCovariateData(trainData$covariateData)){ stop("Needs correct covariateData") @@ -52,34 +59,37 @@ fitRclassifier <- function(trainData, param, search = 'grid', analysisId){ result <- list( model = cvResult$model, + preprocessing = list( + featureEngineering = attr(trainData, "metaData")$featureEngineering, + tidyCovariates = attr(trainData$covariateData, 
"metaData")$tidyCovariateDataSettings, + requireDenseMatrix = F + ), + prediction = prediction, - settings = list( - plpDataSettings = attr(trainData, "metaData")$plpDataSettings, + modelDesign = PatientLevelPrediction::createModelDesign( + targetId = attr(trainData, "metaData")$targetId, + outcomeId = attr(trainData, "metaData")$outcomeId, + restrictPlpDataSettings = attr(trainData, "metaData")$restrictPlpDataSettings, covariateSettings = attr(trainData, "metaData")$covariateSettings, - featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, - tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, - #covariateMap = covariateMap, this is in covariateImportance - requireDenseMatrix = F, populationSettings = attr(trainData, "metaData")$populationSettings, - modelSettings = list( - model = attr(param, 'settings')$trainRFunction, - param = param, - finalModelParameters = cvResult$finalParam, - extraSettings = attr(param, 'settings') - ), + featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings, + preprocessSettings = attr(trainData$covariateData, "metaData")$preprocessSettings, + modelSettings = modelSettings, splitSettings = attr(trainData, "metaData")$splitSettings, sampleSettings = attr(trainData, "metaData")$sampleSettings ), trainDetails = list( analysisId = analysisId, - cdmDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, - outcomeId = attr(trainData, "metaData")$outcomeId, - cohortId = attr(trainData, "metaData")$cohortId, + analysisSource = '', #TODO add from model + developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName, + developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, attrition = attr(trainData, "metaData")$attrition, - trainingTime = comp, + trainingTime = paste(as.character(abs(comp)), attr(comp,'units')), trainingDate = Sys.Date(), + modelName = attr(param, 'settings')$trainRFunction, + 
finalModelParameters = cvResult$finalParam, hyperParamSearch = hyperSummary ), diff --git a/R/Recalibration.R b/R/Recalibration.R index bc2493e49..2053f7ed7 100644 --- a/R/Recalibration.R +++ b/R/Recalibration.R @@ -39,17 +39,20 @@ recalibratePlpRefit <- function( newPopulation, newData ){ - if (is.null(newPopulation)) + if (is.null(newPopulation)){ stop("NULL population") - if (class(newData) != "plpData") + } + if (!inherits(x = newData, what = "plpData")){ stop("Incorrect plpData class") - if (class(plpModel) != "plpModel") + } + if (!inherits(x = plpModel, what = "plpModel")){ stop("plpModel is not of class plpModel") + } #get selected covariates includeCovariateIds <- plpModel$covariateImportance %>% dplyr::filter(.data$covariateValue != 0) %>% - dplyr::select(.data$covariateId) %>% + dplyr::select("covariateId") %>% dplyr::pull() # check which covariates are included in new data @@ -61,27 +64,42 @@ recalibratePlpRefit <- function( setLassoRefit <- setLassoLogisticRegression( includeCovariateIds = includeCovariateIds, - noShrinkage = noShrinkage + noShrinkage = noShrinkage, + maxIterations = 10000 # increasing this due to test code often not converging ) - newData$labels <- newPopulation #%>% - #dplyr::select( - # .data$rowId, - # .data$cohortStartDate, - # .data$outcomeCount, - # .data$survivalTime - #) + newData$labels <- newPopulation newData$folds <- data.frame( rowId = newData$labels$rowId, index = sample(2, length(newData$labels$rowId), replace = T) ) - newModel <- fitPlp( - trainData = newData, - modelSettings = setLassoRefit, - analysisId = 'recalibrationRefit' + # add dummy settings to fit model + attr(newData, "metaData")$outcomeId <- attr(newPopulation, 'metaData')$outcomeId + attr(newData, "metaData")$targetId <- attr(newPopulation, 'metaData')$targetId + attr(newData, "metaData")$restrictPlpDataSettings <- attr(newPopulation, 'metaData')$restrictPlpDataSettings + attr(newData, "metaData")$covariateSettings <- newData$metaData$covariateSettings + 
attr(newData, "metaData")$populationSettings <- attr(newPopulation, 'metaData')$populationSettings + attr(newData$covariateData, "metaData")$featureEngineeringSettings <- PatientLevelPrediction::createFeatureEngineeringSettings() + attr(newData$covariateData, "metaData")$preprocessSettings <- PatientLevelPrediction::createPreprocessSettings() + attr(newData, "metaData")$splitSettings <- PatientLevelPrediction::createDefaultSplitSetting() + attr(newData, "metaData")$sampleSettings <- PatientLevelPrediction::createSampleSettings() + + newModel <- tryCatch({ + fitPlp( + trainData = newData, + modelSettings = setLassoRefit, + analysisId = 'recalibrationRefit', + analysisPath = NULL ) + }, + error = function(e){ParallelLogger::logInfo(e); return(NULL)} + ) + if(is.null(newModel)){ + ParallelLogger::logInfo('Recalibration fit failed') + return(NULL) + } newModel$prediction$evaluationType <- 'recalibrationRefit' @@ -103,8 +121,8 @@ recalibratePlpRefit <- function( adjust <- newModel$covariateImportance %>% dplyr::filter(.data$covariateValue != 0) %>% dplyr::select( - .data$covariateId, - .data$covariateValue + "covariateId", + "covariateValue" ) } else{ adjust <- c() @@ -139,12 +157,13 @@ recalibratePlpRefit <- function( recalibratePlp <- function(prediction, analysisId, typeColumn = 'evaluationType', method = c('recalibrationInTheLarge', 'weakRecalibration')){ # check input: - if (class(prediction) != 'data.frame') + if (!inherits(x = prediction, what = 'data.frame')){ stop("Incorrect prediction") + } - if(!method %in% c('recalibrationInTheLarge', 'weakRecalibration')) + if(!method %in% c('recalibrationInTheLarge', 'weakRecalibration')){ stop("Unknown recalibration method type. 
must be of type: recalibrationInTheLarge, weakRecalibration") - + } prediction <- do.call(method, list(prediction = prediction, columnType = typeColumn)) @@ -210,7 +229,8 @@ weakRecalibration <- function(prediction, columnType = 'evaluationType'){ recalibrated <- prediction - baseline <- ifelse(is.null(attr(recalibrated, "baselineHazard")), 0.9, attr(recalibrated, "baselineHazard")) + # this will make the recalibration work if the baselineSurvival is missing + baseline <- ifelse(is.null(attr(recalibrated, "baselineSurvival")), 0.9, attr(recalibrated, "baselineSurvival")) ParallelLogger::logInfo(paste0('recal initial baseline hazard: ',baseline)) offset <- ifelse(is.null(attr(recalibrated, "offset")), 0, attr(recalibrated, "offset")) diff --git a/R/RunMultiplePlp.R b/R/RunMultiplePlp.R index e9efd42c6..9e90f5452 100644 --- a/R/RunMultiplePlp.R +++ b/R/RunMultiplePlp.R @@ -25,15 +25,15 @@ #' @param databaseDetails The database settings created using \code{createDatabaseDetails()} #' @param modelDesignList A list of model designs created using \code{createModelDesign()} #' @param onlyFetchData Only fetches and saves the data object to the output folder without running the analysis. -#' @param splitSettings The train/validation/test splitting used by all analyses created using \code{createDefaultSplitSetting()} #' @param cohortDefinitions A list of cohort definitions for the target and outcome cohorts -#' @param logSettings The setting spexcifying the logging for the analyses created using \code{createLogSettings()} +#' @param logSettings The setting specifying the logging for the analyses created using \code{createLogSettings()} #' @param saveDirectory Name of the folder where all the outputs will written to. 
+#' @param sqliteLocation (optional) The location of the sqlite database with the results #' #' @return #' A data frame with the following columns: \tabular{ll}{ \verb{analysisId} \tab The unique identifier -#' for a set of analysis choices.\cr \verb{cohortId} \tab The ID of the target cohort populations.\cr -#' \verb{outcomeId} \tab The ID of the outcomeId.\cr \verb{dataLocation} \tab The location where the plpData was saved \cr \verb{evaluationFolder} \tab The name of file containing the evaluation saved as a csv +#' for a set of analysis choices.\cr \verb{targetId} \tab The ID of the target cohort populations.\cr +#' \verb{outcomeId} \tab The ID of the outcomeId.\cr \verb{dataLocation} \tab The location where the plpData was saved #' \cr \verb{the settings ids} \tab The ids for all other settings used for model development.\cr } #' #' @export @@ -44,104 +44,142 @@ runMultiplePlp <- function( createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression()) ), onlyFetchData = F, - splitSettings = createDefaultSplitSetting( - type = "stratified", - testFraction = 0.25, - trainFraction = 0.75, - splitSeed = 123, - nfold = 3 - ), cohortDefinitions = NULL, logSettings = createLogSettings( verbosity = "DEBUG", timeStamp = T, logName = "runPlp Log" ), - saveDirectory = getwd() + saveDirectory = getwd(), + sqliteLocation = file.path(saveDirectory, 'sqlite') ){ #input checks checkIsClass(databaseDetails, c('databaseDetails')) checkIsClass(modelDesignList, c('list', 'modelDesign')) checkIsClass(onlyFetchData, 'logical') - checkIsClass(splitSettings, 'splitSettings') checkIsClass(logSettings, 'logSettings') checkIsClass(saveDirectory, 'character') if(!dir.exists(saveDirectory)){ dir.create(saveDirectory, recursive = T) } - # get idList - idList <- getidList(modelDesignList = modelDesignList) - - # get settings data.frame - settingstable <- getSettingsTable( - modelDesignList = modelDesignList, - idList = idList - ) + settingstable <- 
convertToJson(modelDesignList,cohortDefinitions) - if(!is.null(cohortDefinitions)){ - cohortNames <- data.frame( - targetName = getNames(cohortDefinitions, settingstable$targetId), - outcomeName = getNames(cohortDefinitions, settingstable$outcomeId) - ) - settingstable <- cbind(cohortNames, settingstable) + if(nrow(settingstable) != length(modelDesignList)){ + stop('Error in settingstable') } - utils::write.csv(settingstable, file.path(saveDirectory,'settings.csv'), row.names = F) - saveJsonFile(idList, file.path(saveDirectory,'settings.json')) - - # list(targetId, covariateSetting, outcomeIds, saveLocation) - dataSettings <- getDataSettings(settingstable) + # save the settings - TODO change this to save jsons in csv + utils::write.csv( + x = settingstable %>% dplyr::select( + "analysisId", + "targetId", + "targetName", + "outcomeId", + "outcomeName", + "dataLocation" + ), + file.path(saveDirectory,'settings.csv'), + row.names = F + ) + + # group the outcomeIds per combination of data extraction settings + dataSettings <- settingstable %>% + dplyr::group_by( + .data$targetId, + .data$covariateSettings, + .data$restrictPlpDataSettings, + .data$dataLocation + ) %>% + dplyr::summarise( + outcomeIds = paste(unique(.data$outcomeId), collapse = ',') + ) # extract data - for(i in 1:length(dataSettings)){ - dataExists <- length(dir(file.path(saveDirectory, dataSettings[[i]]$dataLocation)))>0 + for(i in 1:nrow(as.data.frame(dataSettings))){ + dataExists <- length(dir(file.path(saveDirectory, dataSettings$dataLocation[i])))>0 if(!dataExists){ - ParallelLogger::logInfo(paste('Extracting data for cohort', dataSettings[[i]]$targetId, 'to', file.path(saveDirectory, dataSettings[[i]]$dataLocation))) + ParallelLogger::logInfo(paste('Extracting data for cohort', dataSettings$targetId[i], 'to', file.path(saveDirectory, dataSettings$dataLocation[i]))) - databaseDetails$cohortId <- dataSettings[[i]]$targetId - databaseDetails$outcomeIds <- dataSettings[[i]]$outcomeIds + 
databaseDetails$targetId <- dataSettings$targetId[i] + databaseDetails$outcomeIds <- strsplit(dataSettings$outcomeIds[i], ',')[[1]] plpDataSettings <- list( databaseDetails = databaseDetails, - covariateSettings = getSettingFromId(idList, type = 'covariateSettings', dataSettings[[i]]$covariateSettings), - restrictPlpDataSettings = getSettingFromId(idList, type = 'restrictPlpDataSettings', dataSettings[[i]]$restrictPlpDataSettings) - ) - + covariateSettings = ParallelLogger::convertJsonToSettings(dataSettings$covariateSettings[i]), + restrictPlpDataSettings = ParallelLogger::convertJsonToSettings(dataSettings$restrictPlpDataSettings[i]) + ) plpData <- tryCatch( {do.call(getPlpData, plpDataSettings)}, error = function(e){ParallelLogger::logInfo(e); return(NULL)} ) if(!is.null(plpData)){ - savePlpData(plpData, file.path(saveDirectory, dataSettings[[i]]$dataLocation)) + savePlpData(plpData, file.path(saveDirectory, dataSettings$dataLocation[i])) } } else{ - ParallelLogger::logInfo(paste('Data for cohort', dataSettings[[i]]$targetId, 'exists at', file.path(saveDirectory, dataSettings[[i]]$dataLocation))) + ParallelLogger::logInfo(paste('Data for target', dataSettings$targetId[i], 'exists at', file.path(saveDirectory, dataSettings$dataLocation[i]))) } } - # runPlp + # runDiagnosis - NEW if(!onlyFetchData){ - for(i in 1:nrow(settingstable)){ + for(i in 1:nrow(as.data.frame(settingstable))){ modelDesign <- modelDesignList[[i]] - settings <- settingstable[i,] + settings <- settingstable[i,] # just the data locations? 
dataExists <- length(dir(file.path(saveDirectory, settings$dataLocation)))>0 if(dataExists){ - plpData <- PatientLevelPrediction::loadPlpData(file.path(saveDirectory, settings$dataLocation)) + analysisExists <- file.exists(file.path(saveDirectory, settings$analysisId,'diagnosePlp.rds')) - analysisExists <- file.exists(file.path(saveDirectory, settings$analysisId,'plpResult', 'runPlp.rds')) if(!analysisExists){ + plpData <- PatientLevelPrediction::loadPlpData(file.path(saveDirectory, settings$dataLocation)) + diagnosePlpSettings <- list( + plpData = plpData, + outcomeId = modelDesign$outcomeId, + analysisId = settings$analysisId, + populationSettings = modelDesign$populationSettings, + splitSettings = modelDesign$splitSettings, + sampleSettings = modelDesign$sampleSettings, + featureEngineeringSettings = modelDesign$featureEngineeringSettings, + preprocessSettings = modelDesign$preprocessSettings, + modelSettings = modelDesign$modelSettings, + logSettings = logSettings, + saveDirectory = saveDirectory + ) + diagnose <- tryCatch( + {do.call(diagnosePlp, diagnosePlpSettings)}, + error = function(e){ParallelLogger::logInfo(e); return(NULL)} + ) + } else{ + ParallelLogger::logInfo(paste('Diagnosis ', settings$analysisId, 'exists at', file.path(saveDirectory, settings$analysisId))) + } + } + } + } + + # runPlp + if(!onlyFetchData){ + for(i in 1:nrow(as.data.frame(settingstable))){ + modelDesign <- modelDesignList[[i]] + settings <- settingstable[i,] # just the data locations? 
+ + dataExists <- length(dir(file.path(saveDirectory, settings$dataLocation)))>0 + + if(dataExists){ + analysisExists <- file.exists(file.path(saveDirectory, settings$analysisId,'plpResult', 'runPlp.rds')) + + if(!analysisExists){ + plpData <- PatientLevelPrediction::loadPlpData(file.path(saveDirectory, settings$dataLocation)) runPlpSettings <- list( - plpData = plpData, + plpData = quote(plpData), outcomeId = modelDesign$outcomeId, analysisId = settings$analysisId, populationSettings = modelDesign$populationSettings, - splitSettings = splitSettings, + splitSettings = modelDesign$splitSettings, sampleSettings = modelDesign$sampleSettings, featureEngineeringSettings = modelDesign$featureEngineeringSettings, preprocessSettings = modelDesign$preprocessSettings, @@ -159,10 +197,24 @@ runMultiplePlp <- function( ParallelLogger::logInfo(paste('Analysis ', settings$analysisId, 'exists at', file.path(saveDirectory, settings$analysisId))) } } - } - - + } # end run per setting + } + + # [TODO] add code to create sqlite database and populate with results... 
+ if(!onlyFetchData){ + insertResultsToSqlite( + resultLocation = saveDirectory, + cohortDefinitions = cohortDefinitions, + databaseList = createDatabaseList( + cdmDatabaseSchemas = databaseDetails$cohortDatabaseSchema, + cdmDatabaseNames = databaseDetails$cdmDatabaseName, + databaseRefIds = databaseDetails$cdmDatabaseId + ), + sqliteLocation = sqliteLocation + ) } + + return(invisible(settingstable)) } @@ -181,6 +233,7 @@ runMultiplePlp <- function( #' @param sampleSettings Either NULL or an object of class \code{sampleSettings} with the over/under sampling settings used for model development #' @param preprocessSettings Either NULL or an object of class \code{preprocessSettings} created using \code{createPreprocessingSettings()} #' @param modelSettings The model settings such as \code{setLassoLogisticRegression()} +#' @param splitSettings The train/validation/test splitting used by all analyses created using \code{createDefaultSplitSetting()} #' @param runCovariateSummary Whether to run the covariateSummary #' #' @return @@ -197,6 +250,13 @@ createModelDesign <- function( sampleSettings = NULL, preprocessSettings = NULL, modelSettings = NULL, + splitSettings = createDefaultSplitSetting( + type = "stratified", + testFraction = 0.25, + trainFraction = 0.75, + splitSeed = 123, + nfold = 3 + ), runCovariateSummary = T ){ @@ -206,21 +266,30 @@ createModelDesign <- function( checkIsClass(populationSettings, c('populationSettings')) checkIsClass(restrictPlpDataSettings, 'restrictPlpDataSettings') checkIsClass(covariateSettings, c('covariateSettings', 'list')) + checkIsClass(splitSettings, 'splitSettings') useFE <- F if(!is.null(featureEngineeringSettings)){ - checkIsClass(featureEngineeringSettings, c('featureEngineeringSettings')) + if(inherits(featureEngineeringSettings, 'featureEngineeringSettings')){ + featureEngineeringSettings <- list(featureEngineeringSettings) + } + lapply(featureEngineeringSettings, function(x) checkIsClass(x, c('featureEngineeringSettings'))) 
useFE <- T } else{ - featureEngineeringSettings <- createFeatureEngineeringSettings(type = "none") + featureEngineeringSettings <- list(createFeatureEngineeringSettings(type = "none")) } useSample <- F if(!is.null(sampleSettings)){ - checkIsClass(sampleSettings, c('sampleSettings')) + + if(inherits(sampleSettings, 'sampleSettings')){ + sampleSettings <- list(sampleSettings) + } + lapply(sampleSettings, function(x) checkIsClass(x, c('sampleSettings'))) + useSample <- T } else{ - sampleSettings <- createSampleSettings(type = "none") + sampleSettings <- list(createSampleSettings(type = "none")) } usePreprocess <- F @@ -246,6 +315,7 @@ createModelDesign <- function( featureEngineeringSettings = featureEngineeringSettings, preprocessSettings = preprocessSettings, modelSettings = modelSettings, + splitSettings = splitSettings, executeSettings = createExecuteSettings( runSplitData = T, runSampleData = useSample, @@ -269,6 +339,7 @@ createModelDesign <- function( #' This function creates a json file with the modelDesignList saved #' #' @param modelDesignList A list of modelDesigns created using \code{createModelDesign()} +#' @param cohortDefinitions A list of the cohortDefinitions (generally extracted from ATLAS) #' @param saveDirectory The directory to save the modelDesignList settings #' #' @examples @@ -288,20 +359,17 @@ savePlpAnalysesJson <- function( createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()), createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression()) ), + cohortDefinitions = NULL, + # add cohortDefinitions saveDirectory = NULL ){ - if(class(modelDesignList) == 'modelDesign'){ + if(inherits(modelDesignList, 'modelDesign')){ modelDesignList <- list(modelDesignList) } lapply(modelDesignList, function(x){checkIsClass(x, 'modelDesign')}) - - # save this as a json - modelDesignList <- lapply(modelDesignList, function(x) prepareToJson(x)) - jsonSettings <- list(analyses = modelDesignList) # 
TODO: rename this ModelDesignList? - if(!is.null(saveDirectory)){ checkIsClass(saveDirectory, 'character') @@ -309,21 +377,27 @@ savePlpAnalysesJson <- function( dir.create(saveDirectory, recursive = T) } - modelDesignList <- jsonlite::toJSON( - x = jsonSettings, - pretty = T, - digits = 23, - auto_unbox=TRUE, - null = "null" - ) - write(modelDesignList, file.path(saveDirectory,"predictionAnalysisList.json")) - - # should we add splitSettings to this and the input? + ParallelLogger::saveSettingsToJson( + object = list( + plpVersion = as.character(utils::packageVersion("PatientLevelPrediction")), + analyses = modelDesignList, + cohortDefinitions = cohortDefinitions + ), + fileName = file.path(saveDirectory,"predictionAnalysisList.json") + ) return(file.path(saveDirectory,"predictionAnalysisList.json")) } - return(jsonSettings) + return( + ParallelLogger::convertSettingsToJson( + object = list( + plpVersion = as.character(utils::packageVersion("PatientLevelPrediction")), + analyses = modelDesignList, + cohortDefinitions = cohortDefinitions + ) + ) + ) } @@ -350,42 +424,9 @@ loadPlpAnalysesJson <- function( ParallelLogger::logError('Invalid directory - does not exist') } - if(!file.exists(file.path(jsonFileLocation))){ - ParallelLogger::logError('predictionAnalysisList.json not found ') - } - - - json <- tryCatch( - {readChar(jsonFileLocation, file.info(jsonFileLocation)$size)}, - error= function(cond) { - ParallelLogger::logInfo('Issue with loading json file...'); - ParallelLogger::logError(cond) - }) - json <- tryCatch( - {jsonlite::fromJSON(json, simplifyVector = T, simplifyDataFrame = F, simplifyMatrix = T)}, - error = function(cond) { - ParallelLogger::logInfo('Issue with parsing json object...'); - ParallelLogger::logError(cond) - }) - json$analyses <- tryCatch( - {lapply(json$analyses, function(x) prepareToRlist(x))}, - error = function(cond) { - ParallelLogger::logInfo('Issue converting json to R list...'); - ParallelLogger::logError(cond) - }) - - # if 
splitSettings in json - if('splitSettings' %in% names(json)){ - # update the splitsetting (move this into load/saveplpAnalysis) - if('attributes' %in% names(json$splitSettings)){ - atts <- json$splitSettings$attributes - json$splitSettings$attributes <- NULL - attributes(json$splitSettings) <- atts - } - } - - return(json) + rList <- ParallelLogger::loadSettingsFromJson(fileName = jsonFileLocation) + return(rList) } @@ -401,9 +442,10 @@ loadPlpAnalysesJson <- function( #' are found and the connection and database settings for the new data #' #' @param analysesLocation The location where the multiple plp analyses are -#' @param validationDatabaseDetails The validation database settings created using \code{createDatabaseDetails()} +#' @param validationDatabaseDetails A single or list of validation database settings created using \code{createDatabaseDetails()} #' @param validationRestrictPlpDataSettings The settings specifying the extra restriction settings when extracting the data created using \code{createRestrictPlpDataSettings()}. 
#' @param recalibrate A vector of recalibration methods (currently supports 'RecalibrationintheLarge' and/or 'weakRecalibration') +#' @param cohortDefinitions A list of cohortDefinitions #' @param saveDirectory The location to save to validation results #' #' @export @@ -412,13 +454,21 @@ validateMultiplePlp <- function( validationDatabaseDetails, validationRestrictPlpDataSettings = createRestrictPlpDataSettings(), recalibrate = NULL, + cohortDefinitions = NULL, saveDirectory = NULL ){ # add input checks checkIsClass(analysesLocation, 'character') - checkIsClass(validationDatabaseDetails, 'databaseDetails') + if(inherits(validationDatabaseDetails, 'databaseDetails')){ + validationDatabaseDetails <- list(validationDatabaseDetails) + } + lapply( + validationDatabaseDetails, + function(x){checkIsClass(x, 'databaseDetails')} + ) + checkIsClass(validationRestrictPlpDataSettings, 'restrictPlpDataSettings') checkIsClass(recalibrate, c('character', 'NULL')) @@ -443,7 +493,7 @@ validateMultiplePlp <- function( ParallelLogger::logInfo(paste0('Evaluating model in ',modelSettings )) - if(dir.exists(file.path(modelSettings[i],'plpResult'))){ + if(dir.exists(file.path(modelSettings,'plpResult'))){ ParallelLogger::logInfo(paste0('plpResult found in ',modelSettings )) plpModel <- loadPlpModel(file.path(modelSettings,'plpResult','model')) @@ -455,7 +505,8 @@ validateMultiplePlp <- function( validationDatabaseDetails = validationDatabaseDetails, validationRestrictPlpDataSettings = validationRestrictPlpDataSettings, settings = createValidationSettings( - recalibrate = recalibrate + recalibrate = recalibrate, + runCovariateSummary = F ), outputFolder = saveLocation )}, @@ -466,167 +517,111 @@ validateMultiplePlp <- function( } } -} - - - - - - - - - - - - - - - -# HELPERS -#=============================== -getidList <- function(modelDesignList){ - - types <- c( - 'targetId', - 'outcomeId', - 'restrictPlpDataSettings', - 'covariateSettings', - 'populationSettings', - 'sampleSettings', 
- 'featureEngineeringSettings', - 'preprocessSettings', - 'modelSettings', - 'executeSettings' - ) - - idList <- list() - length(idList) <- length(types) - names(idList) <- types + # add to sqlite database - needed for shiny app + #======================= - for(type in types){ - idList[[type]] <- getSettingValues(modelDesignList, type = type ) - } - - return(idList) -} - - -getSettingValues <- function(modelDesignList, type = 'cohortId' ){ - - if(class(modelDesignList) == 'list'){ - values <- unique(unlist(lapply(modelDesignList, function(x)jsonlite::serializeJSON(x[[type]]))) - ) + if(saveLocation == file.path(analysesLocation, 'Validation')){ + ParallelLogger::logInfo('Saving validation results into the development sqlite database') + sqliteLocation <- file.path(analysesLocation, 'sqlite') } else{ - values <- jsonlite::serializeJSON(modelDesignList[[type]]) + ParallelLogger::logInfo('Saving validation results into validation sqlite') + sqliteLocation <- file.path(saveDirectory,'sqlite') } - if(! 
type %in% c('targetId', 'outcomeId') ){ - result <- data.frame( - value = values, - id = 1:length(values) - ) - } else{ - result <- data.frame( - value = sapply(values, function(x) jsonlite::unserializeJSON(x)), - id = sapply(values, function(x) jsonlite::unserializeJSON(x)) - ) + for(validationDatabaseDetail in validationDatabaseDetails){ + tryCatch({ + insertResultsToSqlite( + resultLocation = file.path(saveLocation, validationDatabaseDetail$cdmDatabaseName), + cohortDefinitions = cohortDefinitions, + databaseList = createDatabaseList( + cdmDatabaseSchemas = validationDatabaseDetail$cdmDatabaseSchema, + cdmDatabaseNames = validationDatabaseDetail$cdmDatabaseName, + databaseRefIds = validationDatabaseDetail$cdmDatabaseId + ), + sqliteLocation = sqliteLocation + ) + }) } - return(result) } -# get the ids for the model design settings -getIdsForSetting <- function(modelDesign, idList){ +convertToJson <-function( + modelDesignList, + cohortDefinitions = NULL +){ - ids <- c() + convertToJsonString <- function(x){as.character(ParallelLogger::convertSettingsToJson(x))} - for(settingType in names(idList)){ + if(is.null(cohortDefinitions)){ - if(!settingType %in% c('targetId', 'outcomeId')){ - # get the index of the setting matching the design setting - ind <- which(idList[[settingType]]$value == jsonlite::serializeJSON(modelDesign[[settingType]])) - } else{ - ind <- which(idList[[settingType]]$value == modelDesign[[settingType]]) - } - # get the id - id <- idList[[settingType]]$id[ind] + cohortIds <- unlist( + lapply( + X = 1:length(modelDesignList), + FUN = function(i){ + c( + modelDesignList[[i]]$targetId, + modelDesignList[[i]]$outcomeId + ) + } + ) + ) + cohortIds <- unique(cohortIds) - ids <- c(ids, id) + cohortDefinitions <- data.frame( + cohortId = cohortIds, + cohortName = paste0('Cohort: ', cohortIds) + ) + + } else{ + cohortDefinitions <- cohortDefinitions %>% + dplyr::select( + "cohortId", + "cohortName" + ) } - names(ids) <- names(idList) - - return(ids) -} - 
- -# this creates a data.frame with the analysisId and settingsId for each analysis -# need to add the data location to this -getSettingsTable <- function(modelDesignList, idList){ - - result <- lapply(modelDesignList, function(x) getIdsForSetting(x, idList) ) - settingsTable <- do.call(rbind, result) - settingsTable <- as.data.frame(settingsTable) - - settingsTable$analysisId <- paste0('Analysis_', 1:nrow(settingsTable)) - - settingsTable$dataLocation <- paste0('T_',settingsTable$targetId, '_L_', settingsTable$covariateSettings*settingsTable$restrictPlpDataSettings) + result <- data.frame( + analysisId = paste0('Analysis_', 1:length(modelDesignList)), + targetId = unlist(lapply(modelDesignList, function(x) ifelse(is.null(x$targetId), x$cohortId, x$targetId))), + outcomeId = unlist(lapply(modelDesignList, function(x) x$outcomeId)), + covariateSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$covariateSettings))), + restrictPlpDataSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$restrictPlpDataSettings))), + populationSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$populationSettings))), + sampleSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$sampleSettings))), + splitSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$splitSettings))), + featureEngineeringSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$featureEngineeringSettings))), + preprocessSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$preprocessSettings))), + modelSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$modelSettings))), + executeSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$executeSettings))) + ) - return(settingsTable) -} - - -getSettingFromId <- function( - idList, - type, - id -){ - ind <- which(idList[[type]]$id == id) - if(!type %in% c('targetId', 
'outcomeId')){ - return(jsonlite::unserializeJSON(as.character(idList[[type]]$value[[ind]]))) - } else{ - return(idList[[type]]$value[[ind]]) - } -} - - -getDataSettings <- function(settingstable){ - - combos <- settingstable %>% - dplyr::distinct(.data$targetId,.data$covariateSettings,.data$restrictPlpDataSettings,.data$dataLocation) - - result <- list() - length(result) <- nrow(combos) - for(i in 1:nrow(combos)){ - result[[i]] <- list( - targetId = combos$targetId[i], - covariateSettings = combos$covariateSettings[i], - restrictPlpDataSettings = combos$restrictPlpDataSettings[i], - dataLocation = combos$dataLocation[i], - outcomeIds = settingstable %>% - dplyr::filter(.data$dataLocation == combos$dataLocation[i]) %>% - dplyr::select(.data$outcomeId) %>% - dplyr::pull() + result <- result %>% + dplyr::left_join(cohortDefinitions, by = c("outcomeId" = "cohortId")) %>% + dplyr::rename(outcomeName = "cohortName") %>% + dplyr::left_join(cohortDefinitions, by = c('targetId' = 'cohortId')) %>% + dplyr::rename(targetName = "cohortName") # new + + # get the names + uniqueSettings <- result %>% + dplyr::distinct( + .data$targetId, + .data$covariateSettings, + .data$restrictPlpDataSettings + ) %>% + dplyr::group_by(.data$targetId) %>% + dplyr::mutate(dataLocation = paste0('targetId_',.data$targetId, '_L', dplyr::row_number())) + + # add the data names + result <- result %>% + dplyr::left_join( + uniqueSettings, + by = c( + "targetId" = "targetId", + "covariateSettings" = "covariateSettings", + "restrictPlpDataSettings" = "restrictPlpDataSettings" + ) ) - } + return(result) } -getNames <- function( - cohortDefinitions, - ids -){ - - idNames <- lapply(cohortDefinitions, function(x) c(x$id, x$name)) - idNames <- do.call(rbind, idNames) - colnames(idNames) <- c('id', 'name') - idNames <- as.data.frame(idNames) - - nams <- c() - for(id in ids){ - nams <- c(nams, idNames$name[idNames$id == id]) - } - - return(nams) - -} diff --git a/R/RunPlp.R b/R/RunPlp.R index 
40fe09247..2d7119388 100644 --- a/R/RunPlp.R +++ b/R/RunPlp.R @@ -31,7 +31,8 @@ #' develop and internally validate a model for the specified outcomeId. #' #' @param plpData An object of type \code{plpData} - the patient level prediction -#' data extracted from the CDM. +#' data extracted from the CDM. Can also include an initial population as +#' plpData$popualtion. #' @param outcomeId (integer) The ID of the outcome. #' @param analysisId (integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp. #' @param analysisName (character) Name for the analysis @@ -48,17 +49,12 @@ #' and whether to normalise the covariates before training #' @param modelSettings An object of class \code{modelSettings} created using one of the function: #' \itemize{ -#' \item{setLassoLogisticRegression()}{ A lasso logistic regression model} -#' \item{setGradientBoostingMachine()}{ A gradient boosting machine} -#' \item{setAdaBoost()}{ An ada boost model} -#' \item{setRandomForest()}{ A random forest model} -#' \item{setDecisionTree()}{ A decision tree model} -#' \item{setCovNN())}{ A convolutional neural network model} -#' \item{setCIReNN()}{ A recurrent neural network model} -#' \item{setMLP()}{ A neural network model} -#' \item{setDeepNN()}{ A deep neural network model} -#' \item{setKNN()}{ A KNN model} -#' +#' \item setLassoLogisticRegression() A lasso logistic regression model +#' \item setGradientBoostingMachine() A gradient boosting machine +#' \item setAdaBoost() An ada boost model +#' \item setRandomForest() A random forest model +#' \item setDecisionTree() A decision tree model +#' \item setKNN() A KNN model #' } #' @param logSettings An object of \code{logSettings} created using \code{createLogSettings} #' specifying how the logging is done @@ -70,13 +66,12 @@ #' An object containing the following: #' #' \itemize{ -#' \item{inputSettings}{A list containing all the settings used to develop the model} -#' \item{model}{ The developed 
model of class \code{plpModel}} -#' \item{executionSummary}{ A list containing the hardward details, R package details and execution time} -#' \item{performanceEvaluation}{ Various internal performance metrics in sparse format} -#' \item{prediction}{ The plpData cohort table with the predicted risks added as a column (named value)} -#' \item{covariateSummary)}{ A characterization of the features for patients with and without the outcome during the time at risk} -#' \item{analysisRef}{ A list with details about the analysis} +#' \item model The developed model of class \code{plpModel} +#' \item executionSummary A list containing the hardward details, R package details and execution time +#' \item performanceEvaluation Various internal performance metrics in sparse format +#' \item prediction The plpData cohort table with the predicted risks added as a column (named value) +#' \item covariateSummary A characterization of the features for patients with and without the outcome during the time at risk +#' \item analysisRef A list with details about the analysis #' } #' #' @@ -258,7 +253,7 @@ runPlp <- function( tryCatch({ printHeader( plpData, - plpData$metaData$databaseDetails$cohortId, + plpData$metaData$databaseDetails$targetId, outcomeId, analysisId, analysisName, @@ -267,20 +262,23 @@ runPlp <- function( }) # create the population - population <- tryCatch( - { - do.call( - createStudyPopulation, - list( - plpData = plpData, - outcomeId = outcomeId, - populationSettings = populationSettings - ) - ) - }, + if(!is.null(plpData$population)) { + ParallelLogger::logInfo('Using existing population') + population <- plpData$population + } else { + ParallelLogger::logInfo('Creating population') + population <- tryCatch({ + do.call(createStudyPopulation, + list(plpData = plpData, + outcomeId = outcomeId, + populationSettings = populationSettings, + population = plpData$population + ) + )}, error = function(e){ParallelLogger::logError(e); return(NULL)} - ) - + ) + } + 
if(is.null(population)){ stop('population NULL') } @@ -364,7 +362,8 @@ runPlp <- function( settings <- list( trainData = data$Train, modelSettings = modelSettings, - analysisId = analysisId + analysisId = analysisId, + analysisPath = analysisPath ) ParallelLogger::logInfo(sprintf('Training %s model',settings$modelSettings$name)) @@ -438,22 +437,29 @@ runPlp <- function( variableImportance <- plpData$covariateData$covariateRef %>% dplyr::mutate(covariateValue = 0) %>% - dplyr::select(.data$covariateId, .data$covariateValue) %>% + dplyr::select("covariateId", "covariateValue") %>% dplyr::collect() if(!is.null(model)){ if(!is.null(model$covariateImportance)){ - variableImportance <- model$covariateImportance %>% dplyr::select(.data$covariateId, .data$covariateValue) + variableImportance <- model$covariateImportance %>% + dplyr::select("covariateId", "covariateValue") } } + # apply FE if it is used + featureEngineering <- NULL + if(!is.null(model)){ + featureEngineering <- model$preprocessing$featureEngineering + } + covariateSummaryResult <- do.call(covariateSummary, list( covariateData = plpData$covariateData, - cohort = population %>% dplyr::select(.data$rowId), - labels = population %>% dplyr::select(.data$rowId, .data$outcomeCount), + cohort = population %>% dplyr::select("rowId"), + labels = population %>% dplyr::select("rowId", "outcomeCount"), strata = strata, variableImportance = variableImportance, - featureEngineering = NULL + featureEngineering = featureEngineering ) ) diff --git a/R/RunPlpHelpers.R b/R/RunPlpHelpers.R index 4073354c1..607e1872c 100644 --- a/R/RunPlpHelpers.R +++ b/R/RunPlpHelpers.R @@ -1,4 +1,4 @@ -printHeader <- function(plpData, cohortId, outcomeId , analysisId, analysisName, ExecutionDateTime){ +printHeader <- function(plpData, targetId, outcomeId , analysisId, analysisName, ExecutionDateTime){ ParallelLogger::logInfo(paste0('Patient-Level Prediction Package version ', utils::packageVersion("PatientLevelPrediction"))) @@ -8,9 +8,13 @@ 
printHeader <- function(plpData, cohortId, outcomeId , analysisId, analysisName, ParallelLogger::logInfo(sprintf('%-20s%s', 'AnalysisName: ',analysisName)) # add header to analysis log - ParallelLogger::logInfo(sprintf('%-20s%s', 'CohortID: ', cohortId)) + ParallelLogger::logInfo(sprintf('%-20s%s', 'TargetID: ', targetId)) ParallelLogger::logInfo(sprintf('%-20s%s', 'OutcomeID: ', outcomeId)) ParallelLogger::logInfo(sprintf('%-20s%s', 'Cohort size: ', nrow(plpData$cohorts))) + if(!is.null(plpData$population)){ + ParallelLogger::logInfo(sprintf('%-20s%s', 'Initial population size: ', nrow(plpData$population))) + ParallelLogger::logInfo(sprintf('%-20s%s', 'Initial cases: ', sum(plpData$population$outcomeCount>0))) + } ParallelLogger::logInfo(sprintf('%-20s%s', 'Covariates: ', nrow(plpData$covariateData$covariateRef))) ## ParallelLogger::logInfo(sprintf('%-20s%s', 'Population size: ', nrow(population))) ## ParallelLogger::logInfo(sprintf('%-20s%s', 'Cases: ', sum(population$outcomeCount>0))) @@ -34,13 +38,13 @@ checkInputs <- function(inputs) { ) # check class is correct - if(class(inputs[[inputName]]) != inputName && class(inputs[[inputName]]) != 'list'){ + if(!inherits(x = inputs[[inputName]], what = c(inputName,'list'))){ ParallelLogger::logError(paste0('Incorrect ', inputName)) stop('Bad input') } - if(class(inputs[[inputName]]) == 'list'){ - if(unique(unlist(lapply(inputs[[inputName]], class))) != inputName){ + if(inherits(x = inputs[[inputName]], what = 'list')){ + if(sum(unlist(lapply(inputs[[inputName]], function(obj){inherits(x = obj, what = inputName)}))) != length(inputs[[inputName]])){ ParallelLogger::logError(paste0('Incorrect ', inputName)) stop('Bad input list') } diff --git a/R/Sampling.R b/R/Sampling.R index a77f77431..f97b024e0 100644 --- a/R/Sampling.R +++ b/R/Sampling.R @@ -22,9 +22,9 @@ #' Returns an object of class \code{sampleSettings} that specifies the sampling function that will be called and the settings #' #' @param type (character) Choice 
of: \itemize{ -#' \item{'none'}{ No sampling is applied - this is the default } -#' \item{'underSample')}{Undersample the non-outcome class to make the data more ballanced} -#' \item{'overSample'}{Oversample the outcome class by adding in each outcome multiple times} +#' \item 'none' No sampling is applied - this is the default +#' \item 'underSample' Undersample the non-outcome class to make the data more ballanced +#' \item 'overSample' Oversample the outcome class by adding in each outcome multiple times #' } #' @param numberOutcomestoNonOutcomes (numeric) An numeric specifying the require number of non-outcomes per outcome #' @param sampleSeed (numeric) A seed to use when splitting the data for reproducibility (if not set a random number will be generated) @@ -46,7 +46,7 @@ createSampleSettings <- function(type = 'none', sampleSettings <- list( numberOutcomestoNonOutcomes = numberOutcomestoNonOutcomes, - sampleSeed = sampleSeed + sampleSeed = ifelse(type == 'none', 1, sampleSeed) # to make it the same for none ) if(type == 'none'){ @@ -71,7 +71,7 @@ sampleData <- function(trainData, sampleSettings){ ParallelLogger::logInfo('Starting data sampling') # if a single setting, make it a list - if(class(sampleSettings) == 'sampleSettings'){ + if(inherits(sampleSettings,'sampleSettings')){ sampleSettings <- list(sampleSettings) } @@ -85,7 +85,7 @@ sampleData <- function(trainData, sampleSettings){ ParallelLogger::logInfo('Finished data sampling') - metaData$sampleSettings <- sampleSetting + metaData$sampleSettings <- sampleSettings attr(trainData, "metaData") <- metaData return(trainData) @@ -95,6 +95,19 @@ sampleData <- function(trainData, sampleSettings){ sameData <- function(trainData, ...){ ParallelLogger::logInfo('No sampling - returning same data') + + # add attribute for FE + featureEngeering <- list( + funct = 'sameData', + settings = list( + none = T + ) + ) + attr(trainData, 'metaData')$featureEngineering = listAppend( + attr(trainData, 
'metaData')$featureEngineering, + featureEngeering + ) + return(trainData) } @@ -138,6 +151,7 @@ underSampleData <- function(trainData, sampleSettings){ # filter to these patients sampleTrainData <- list() + class(sampleTrainData) <- 'plpData' sampleTrainData$labels <- trainData$labels %>% dplyr::filter(.data$rowId %in% pplOfInterest) sampleTrainData$folds <- trainData$folds %>% dplyr::filter(.data$rowId %in% pplOfInterest) @@ -179,6 +193,7 @@ overSampleData <- function(trainData, sampleSettings){ sum(population$outcomeCount == 0), ' non-outcomes')) sampleTrainData <- list() + class(sampleTrainData) <- 'plpData' sampleTrainData$labels <- trainData$labels %>% dplyr::collect() sampleTrainData$folds <- trainData$folds %>% dplyr::collect() @@ -238,5 +253,5 @@ overSampleData <- function(trainData, sampleSettings){ class(sampleTrainData$covariateData) <- 'CovariateData' - return(trainData) + return(sampleTrainData) } diff --git a/R/SaveLoadPlp.R b/R/SaveLoadPlp.R index 85f76a30c..47353fcf3 100644 --- a/R/SaveLoadPlp.R +++ b/R/SaveLoadPlp.R @@ -35,12 +35,15 @@ #' #' @export savePlpData <- function(plpData, file, envir=NULL, overwrite=F) { - if (missing(plpData)) + if (missing(plpData)){ stop("Must specify plpData") - if (missing(file)) + } + if (missing(file)){ stop("Must specify file") - if (!class(plpData) %in% c("plpData","plpData.libsvm" )) + } + if (!inherits(x = plpData, what = c("plpData"))){ stop("Data not of class plpData") + } if(dir.exists(file.path(file, "covariates"))){ stop('Folder to save covariates already exists...') } @@ -108,12 +111,15 @@ loadPlpData <- function(file, readOnly = TRUE) { #' #' @export savePlpModel <- function(plpModel, dirPath){ - if (missing(plpModel)) + if (missing(plpModel)){ stop("Must specify plpModel") - if (missing(dirPath)) + } + if (missing(dirPath)){ stop("Must specify directory path") - if (class(plpModel) != "plpModel") + } + if (!inherits(x = plpModel, what = "plpModel")){ stop("Not a plpModel") + } if(!dir.exists(dirPath)){ 
ParallelLogger::logInfo('Creating directory to save model') @@ -129,45 +135,82 @@ savePlpModel <- function(plpModel, dirPath){ # save the trainDetails if(!is.null(plpModel$trainDetails)){ - plpModel$trainDetails$trainingTime <- paste(as.character(plpModel$trainDetails$trainingTime), attr(plpModel$trainDetails$trainingTime,'units')) - saveJsonFile( - rObject = plpModel$trainDetails, - file = file.path(dirPath, 'trainDetails.json') + ParallelLogger::saveSettingsToJson( + object = plpModel$trainDetails, + fileName = file.path(dirPath, 'trainDetails.json') ) } # save the validationDetails if(!is.null(plpModel$validationDetails)){ - plpModel$validationDetails$validationDate <- paste(as.character(plpModel$validationDetails$validationDate), attr(plpModel$validationDetails$validationDate,'units')) - saveJsonFile( - rObject = plpModel$validationDetails, - file = file.path(dirPath, 'validationDetails.json') + ParallelLogger::saveSettingsToJson( + object = plpModel$validationDetails, + fileName = file.path(dirPath, 'validationDetails.json') ) } # save the settings - saveJsonFile( - rObject = plpModel$settings, - file = file.path(dirPath, 'settings.json') + ParallelLogger::saveSettingsToJson( + object = plpModel$modelDesign, + fileName = file.path(dirPath, 'modelDesign.json') + ) + + if(!is.null(plpModel$preprocessing)){ + + # cheap fix to get past bug in ParallelLogger::saveSettingsToJson with tibbles + plpModel$preprocessing$tidyCovariates$normFactors <- + as.data.frame(plpModel$preprocessing$tidyCovariates$normFactors) + + ParallelLogger::saveSettingsToJson( + object = plpModel$preprocessing, + fileName = file.path(dirPath, 'preprocessing.json') + ) + } + + + # save the model part function to file + saveModelPart( + model = plpModel$model, + savetype = attr(plpModel, 'saveType'), + dirPath = dirPath + ) + + # save the attributes of plpModel + modelAttributes <- attributes(plpModel) + modelAttributes$names <- NULL + ParallelLogger::saveSettingsToJson( + object = 
modelAttributes, + fileName = file.path(dirPath, 'attributes.json') ) + return(dirPath) +} + + +saveModelPart <- function(model, savetype, dirPath){ # save the model based on saveType - if(attr(plpModel, 'saveType') == "xgboost"){ - xgboost::xgb.save(model = plpModel$model, fname = file.path(dirPath, "model.json")) - } else if(attr(plpModel, 'saveType') == "RtoJson"){ - saveJsonFile( - rObject = plpModel$model, - file = file.path(dirPath, 'model.json') + if(savetype == "xgboost"){ + xgboost::xgb.save( + model = model, + fname = file.path(dirPath, "model.json") + ) + } else if(savetype == "lightgbm"){ + lightgbm::lgb.save(booster = model, + filename = file.path(dirPath, "model.json")) + } else if(savetype == "RtoJson"){ + ParallelLogger::saveSettingsToJson( + object = model, + fileName = file.path(dirPath, 'model.json') ) - } else if(attr(plpModel, 'saveType') == "file"){ + } else if(savetype == "file"){ # move the model into model if(!dir.exists(file.path(dirPath, 'model'))){ dir.create(file.path(dirPath, 'model'), recursive = T) } - for(file in dir(plpModel$model)){ + for(file in dir(model)){ file.copy( - file.path(plpModel$model,file), + file.path(model,file), file.path(dirPath,'model'), overwrite = TRUE, recursive = FALSE, @@ -178,15 +221,6 @@ savePlpModel <- function(plpModel, dirPath){ ParallelLogger::logWarn('Not sure how to save model - invalid saveType') } - # save the attributes of plpModel - modelAttributes <- attributes(plpModel) - modelAttributes$names <- NULL - saveJsonFile( - rObject = modelAttributes, - file = file.path(dirPath, 'attributes.json') - ) - - return(dirPath) } @@ -206,7 +240,7 @@ loadPlpModel <- function(dirPath) { plpModel <- list() modelAttributes <- tryCatch( - loadJsonFile(file.path(dirPath, 'attributes.json')), + ParallelLogger::loadSettingsFromJson(file.path(dirPath, 'attributes.json')), error = function(e){NULL} ) @@ -224,27 +258,46 @@ loadPlpModel <- function(dirPath) { if(file.exists(file.path(dirPath, "trainDetails.json"))){ 
plpModel$trainDetails <- tryCatch( - loadJsonFile(file.path(dirPath, "trainDetails.json")), + ParallelLogger::loadSettingsFromJson(file.path(dirPath, "trainDetails.json")), error = function(e){NULL} ) } if(file.exists(file.path(dirPath, "validationDetails.json"))){ plpModel$validationDetails <- tryCatch( - loadJsonFile(file.path(dirPath, "validationDetails.json")), + ParallelLogger::loadSettingsFromJson(file.path(dirPath, "validationDetails.json")), error = function(e){NULL} ) } - plpModel$settings <- tryCatch( - loadJsonFile(file.path(dirPath, "settings.json")), + plpModel$modelDesign <- tryCatch( + ParallelLogger::loadSettingsFromJson(file.path(dirPath, "modelDesign.json")), error = function(e){NULL} ) + # we don't use "preprocess" anymore, should be "preprocessing", + # but leave this here if loading an older model + if(file.exists(file.path(dirPath, "preprocess.json"))){ + plpModel$preprocessing <- tryCatch( + ParallelLogger::loadSettingsFromJson(file.path(dirPath, "preprocess.json")), + error = function(e){NULL} + ) + } + if(file.exists(file.path(dirPath, "preprocessing.json")) & is.null(plpModel$preprocessing)){ + plpModel$preprocessing <- tryCatch( + ParallelLogger::loadSettingsFromJson(file.path(dirPath, "preprocessing.json")), + error = function(e){NULL} + ) + } + + if(attr(plpModel, 'saveType') == "xgboost"){ ensure_installed("xgboost") plpModel$model <- xgboost::xgb.load(file.path(dirPath, "model.json")) + } else if(attr(plpModel, 'saveType') == "lightgbm"){ + ensure_installed("lightgbm") + plpModel$model <- lightgbm::lgb.load(file.path(dirPath, "model.json")) } else if(attr(plpModel, 'saveType') %in% c("RtoJson")){ - plpModel$model <- loadJsonFile(file.path(dirPath, "model.json")) + plpModel$model <- ParallelLogger::loadSettingsFromJson(file.path(dirPath, "model.json")) } else{ plpModel$model <- file.path(dirPath, 'model') } @@ -252,20 +305,6 @@ loadPlpModel <- function(dirPath) { return(plpModel) } -saveJsonFile <- function(rObject, file){ - - 
jsonObject <- jsonlite::serializeJSON(rObject, digits = 23) - write(jsonObject, file) -} - -loadJsonFile <- function(fileName) { - - jsonObject <- readChar(fileName, file.info(fileName)$size) - rObject <- jsonlite::unserializeJSON(jsonObject) - - return(rObject) -} - #' Saves the prediction dataframe to RDS #' @@ -279,7 +318,10 @@ loadJsonFile <- function(fileName) { #' @export savePrediction <- function(prediction, dirPath, fileName='prediction.rds'){ #TODO check inupts - saveJsonFile(prediction, file=file.path(dirPath,fileName)) + ParallelLogger::saveSettingsToJson( + object = prediction, + fileName = file.path(dirPath,fileName) + ) return(file.path(dirPath,fileName)) } @@ -294,7 +336,7 @@ savePrediction <- function(prediction, dirPath, fileName='prediction.rds'){ #' @export loadPrediction <- function(fileLocation){ #TODO check inupts - prediction <- loadJsonFile(fileName = fileLocation) + prediction <- ParallelLogger::loadSettingsFromJson(fileName = fileLocation) return(prediction) } @@ -308,14 +350,16 @@ loadPrediction <- function(fileLocation){ #' #' @export savePlpResult <- function(result, dirPath){ - if (missing(result)) + if (missing(result)){ stop("Must specify runPlp output") - if (missing(dirPath)) + } + if (missing(dirPath)){ stop("Must specify directory location") - #if (class(plpModel) != "plpModel") - # stop("Not a plpModel") - - if(!dir.exists(dirPath)) dir.create(dirPath, recursive = T) + } + + if(!dir.exists(dirPath)){ + dir.create(dirPath, recursive = T) + } savePlpModel(result$model, dirPath=file.path(dirPath,'model') ) result$model <- NULL @@ -332,16 +376,20 @@ savePlpResult <- function(result, dirPath){ #' #' @export loadPlpResult <- function(dirPath){ - if (!file.exists(dirPath)) + if (!file.exists(dirPath)){ stop(paste("Cannot find folder", dirPath)) - if (!file.info(dirPath)$isdir) + } + if (!file.info(dirPath)$isdir){ stop(paste("Not a folder", dirPath)) + } result <- readRDS(file.path(dirPath, "runPlp.rds")) result$model = 
loadPlpModel(file.path(dirPath, "model")) - - class(result) <- "runPlp" + if (is.null(class(result))) { + class(result) <- 'runPlp' + } + return(result) } @@ -362,7 +410,12 @@ savePlpShareable <- function(result, saveDirectory, minCellCount = 10){ if(!dir.exists(saveDirectory)) dir.create(saveDirectory, recursive = T) #executionSummary - saveJsonFile(result$executionSummary, file.path(saveDirectory, 'executionSummary.json')) + result$executionSummary$PackageVersion$packageVersion <- as.character(result$executionSummary$PackageVersion$packageVersion) + result$executionSummary$PlatformDetails$RAM <- as.character(result$executionSummary$PlatformDetails$RAM) + ParallelLogger::saveSettingsToJson( + object = result$executionSummary, + fileName = file.path(saveDirectory, 'executionSummary.json') + ) #save model as json files savePlpModel(result$model, file.path(saveDirectory, 'model')) @@ -399,7 +452,10 @@ savePlpShareable <- function(result, saveDirectory, minCellCount = 10){ } #analysisRef - saveJsonFile(result$analysisRef, file.path(saveDirectory, 'analysisRef.json')) + ParallelLogger::saveSettingsToJson( + object = result$analysisRef, + fileName = file.path(saveDirectory, 'analysisRef.json') + ) return(invisible(saveDirectory)) } @@ -445,7 +501,7 @@ loadPlpShareable <- function(loadDirectory){ result$model <- loadPlpModel(file.path(loadDirectory,'model')) #executionSummary - result$executionSummary <- tryCatch({loadJsonFile(fileName = file.path(loadDirectory, 'executionSummary.json'))}, error = function(e){return(NULL)}) + result$executionSummary <- tryCatch({ParallelLogger::loadSettingsFromJson(fileName = file.path(loadDirectory, 'executionSummary.json'))}, error = function(e){return(NULL)}) #performanceEvaluation result$performanceEvaluation <- list() @@ -459,7 +515,7 @@ loadPlpShareable <- function(loadDirectory){ result$covariateSummary <- utils::read.csv(file = file.path(loadDirectory,'covariateSummary.csv')) #analysisRef - result$analysisRef <- 
tryCatch({loadJsonFile(fileName = file.path(loadDirectory, 'analysisRef.json'))}, error = function(e){return(NULL)}) + result$analysisRef <- tryCatch({ParallelLogger::loadSettingsFromJson(fileName = file.path(loadDirectory, 'analysisRef.json'))}, error = function(e){return(NULL)}) class(result) <- "runPlp" return(result) @@ -497,3 +553,159 @@ removeCellCount <- function( return(data) } + +# add test for this - cant save json to csv - remove this... +#' Exports all the results from a database into csv files +#' +#' @details +#' Extracts the results from a database into a set of csv files +#' +#' @param conn The connection to the database with the results +#' @param connectionDetails The connectionDetails for the result database +#' @param databaseSchemaSettings The result database schema settings +#' @param csvFolder Location to save the csv files +#' @param minCellCount The min value to show in cells that are sensitive (values less than this value will be replaced with -1) +#' @param sensitiveColumns A named list (name of table columns belong to) with a list of columns to apply the minCellCount to. 
+#' @param fileAppend If set to a string this will be appended to the start of the csv file names +#' +#' @export +extractDatabaseToCsv <- function( + conn = NULL, + connectionDetails, + databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'), + csvFolder, + minCellCount = 5, + sensitiveColumns = getPlpSensitiveColumns(), + fileAppend = NULL + ){ + + ensure_installed('readr') + + # check inputs + if(!is.null(fileAppend)){ + fileAppend <- paste0(gsub('_','',gsub(' ','', fileAppend)), '_') + } + + if(is.null(conn)){ + # connect + conn <- DatabaseConnector::connect(connectionDetails) + on.exit(DatabaseConnector::disconnect(conn)) + } + + # create the folder to save the csv files + if(!dir.exists(csvFolder)){ + dir.create(csvFolder, recursive = T) + } + + # get the table names using the function in uploadToDatabase.R + tables <- getPlpResultTables() + + # extract result per table - give option to extract from different cohort/database tables? + modelLocations <- list() + for(table in tables){ + sql <- "select * from @resultSchema.@appendtotable@tablename" + sql <- SqlRender::render( + sql, + resultSchema = databaseSchemaSettings$resultSchema, + appendtotable = databaseSchemaSettings$stringAppendToResultSchemaTables, + tablename = table + ) + sql <- SqlRender::translate( + sql = sql, + targetDialect = databaseSchemaSettings$targetDialect, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema) + result <- DatabaseConnector::querySql(conn, sql) + + # get the model locations + if(table == 'MODELS'){ + modelLocations <- result$PLP_MODEL_FILE + } + + # lower case for consistency in sharing csv results + colnames(result) <- tolower(colnames(result)) + + # TODO: add min cell count filter here + if(tolower(table) %in% names(sensitiveColumns)){ + result <- applyMinCellCount( + tableName = table, + sensitiveColumns = sensitiveColumns, + result = result, + minCellCount = minCellCount + ) + } + + # save the results as a csv + 
readr::write_excel_csv( + x = result, + file = file.path(csvFolder, paste0(fileAppend,tolower(table),'.csv')) + ) + } + + + # load plpModels from database file and save into csv file + if(length(modelLocations)>0){ + if(!dir.exists(file.path(csvFolder, 'models'))){ + dir.create(file.path(csvFolder, 'models'), recursive = T) + } + for(modelLocation in modelLocations){ + modelLocAppend <- strsplit(x = modelLocation, split = '/')[[1]][length(strsplit(x = modelLocation, split = '/')[[1]])] + plpModel <- tryCatch( + { + PatientLevelPrediction::loadPlpModel(file.path(modelLocation)) + }, error = function(e){ParallelLogger::logInfo(e); return(NULL)} + ) + if(!is.null(plpModel)){ + PatientLevelPrediction::savePlpModel(plpModel, file.path(csvFolder, 'models', modelLocAppend)) + } + } + } + + return(invisible(NULL)) + +} + + +getPlpSensitiveColumns <- function(){ + + result <- list( + + prediction_distribution = list( + c('person_count') + ), + covariate_summary = list( + c('covariate_count'), + c('with_no_outcome_covariate_count', 'with_outcome_covariate_count') + ), + calibration_summary = list( + c('person_count_at_risk', 'person_count_with_outcome') + ), + + demographic_summary = list( + c('person_count_at_risk'), + c('person_count_with_outcome') + ) + + ) + + return(result) +} + + +applyMinCellCount <- function( + tableName, + sensitiveColumns, + result, + minCellCount +){ + + columnsToCensor <- sensitiveColumns[[tableName]] + + for(columns in columnsToCensor){ + rowInd <- apply(result[,columns, drop = F] < minCellCount, 1, sum) > 0 + if(sum(rowInd) > 0){ + result[rowInd , columns] <- -1 + } + } + + return(result) +} diff --git a/R/Simulation.R b/R/Simulation.R index 8157238e2..b1d115c14 100644 --- a/R/Simulation.R +++ b/R/Simulation.R @@ -120,7 +120,7 @@ simulatePlpData <- function(plpDataSimulationProfile, n = 10000) { class(covariateData) <- "CovariateData" writeLines("Generating cohorts") - cohorts <- data.frame(rowId = 1:n, subjectId = 2e+10 + (1:n), cohortId = 1) 
+ cohorts <- data.frame(rowId = 1:n, subjectId = 2e+10 + (1:n), targetId = 1) breaks <- cumsum(plpDataSimulationProfile$timePrevalence) r <- stats::runif(n) cohorts$time <- as.numeric(as.character(cut(r, breaks = c(0, breaks), labels = names(breaks)))) @@ -135,7 +135,10 @@ simulatePlpData <- function(plpDataSimulationProfile, n = 10000) { writeLines("Generating outcomes") allOutcomes <- data.frame() for (i in 1:length(plpDataSimulationProfile$metaData$outcomeIds)) { - prediction <- predictCyclopsType(plpDataSimulationProfile$outcomeModels[[i]], + coefficients <- data.frame(betas=as.numeric(plpDataSimulationProfile$outcomeModels[[i]]), + covariateIds=names(plpDataSimulationProfile$outcomeModels[[i]]) + ) + prediction <- predictCyclopsType(coefficients, cohorts, covariateData, modelType = "poisson") @@ -152,28 +155,25 @@ simulatePlpData <- function(plpDataSimulationProfile, n = 10000) { covariateData$coefficients <- NULL # add indexes for covariate summary - RSQLite::dbExecute(covariateData, "CREATE INDEX covsum_rowId ON covariates(rowId)") - RSQLite::dbExecute(covariateData, "CREATE INDEX covsum_covariateId ON covariates(covariateId)") - - + Andromeda::createIndex(tbl = covariateData$covariates, columnNames = 'rowId', indexName = 'covsum_rowId') + Andromeda::createIndex(tbl = covariateData$covariates, columnNames = 'covariateId', indexName = 'covsum_covariateId') + # Remove rownames else they will be copied to the ffdf objects: metaData = list() metaData$databaseDetails <- list( - cdmDatabaseSchema = 'Profile', + cdmDatabaseSchema = 'CDM_SCHEMA', + cdmDatabaseName = "CDM_NAME", outcomeDatabaseSchema = NULL, cohortDatabaseSchema = NULL, connectionDetails = NULL, outcomeTable = NULL, cohortTable = NULL, cdmVersion = 5, - cohortId = 1, + targetId = 1, outcomeIds = c(2,3) ) - metaData$restrictPlpDataSettings <- list( - studyStartDate = NULL, - studyEndDate = NULL - ) + metaData$restrictPlpDataSettings <- PatientLevelPrediction::createRestrictPlpDataSettings() 
metaData$covariateSettings <- FeatureExtraction::createCovariateSettings(useDemographicsAgeGroup = T) @@ -181,13 +181,13 @@ simulatePlpData <- function(plpDataSimulationProfile, n = 10000) { targetCount=nrow(cohorts), uniquePeople=nrow(cohorts), outcomes=nrow(outcomes)) attr(cohorts, "metaData") <- list( - cohortId = 1, + targetId = 1, attrition = attrition ) attr(allOutcomes, "metaData") <- data.frame(outcomeIds = c(2,3)) - attr(covariateData, "metaData") <- list(populationSize = n) + attr(covariateData, "metaData") <- list(populationSize = n, cohortId = 1) result <- list(cohorts = cohorts, outcomes = allOutcomes, diff --git a/R/SklearnClassifier.R b/R/SklearnClassifier.R index 6db987a5a..850e7b130 100644 --- a/R/SklearnClassifier.R +++ b/R/SklearnClassifier.R @@ -1,4 +1,4 @@ -# @file PythonClassifier.R +# @file SklearnClassifier.R # # Copyright 2021 Observational Health Data Sciences and Informatics # @@ -16,15 +16,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -fitSklearn <- function( - trainData, - param, - search = "grid", - analysisId, - ...) { +fitSklearn <- function(trainData, + modelSettings, + search = "grid", + analysisId, + ...) 
{ + param <- modelSettings$param # check covariate data - if(!FeatureExtraction::isCovariateData(trainData$covariateData)){stop("Needs correct covariateData")} + if (!FeatureExtraction::isCovariateData(trainData$covariateData)) { + stop("Needs correct covariateData") + } # get the settings from the param pySettings <- attr(param, 'settings') @@ -34,8 +36,9 @@ fitSklearn <- function( start <- Sys.time() - if(!is.null(trainData$folds)){ - trainData$labels <- merge(trainData$labels, trainData$fold, by = 'rowId') + if (!is.null(trainData$folds)) { + trainData$labels <- + merge(trainData$labels, trainData$fold, by = 'rowId') } # convert the data to a sparse R Matrix and then use reticulate to convert to python sparse @@ -48,15 +51,15 @@ fitSklearn <- function( # save the model to outLoc outLoc <- createTempModelLoc() - + # functions does CV and fits final models # returns: prediction (Train/CV), # finalParam (optimal hyper-parameters) - # variableImportance (final model) + # variableImportance (final model) # paramGridSearch list with performance and params for complete grid search # at the moment it uses AUC as performance but this could be modified to let user # specify the performance metric to optimise - cvResult <- do.call( + cvResult <- do.call( what = gridCvPython, args = list( matrixData = matrixData, @@ -64,58 +67,66 @@ fitSklearn <- function( seed = pySettings$seed, requiresDenseMatrix = pySettings$requiresDenseMatrix, modelName = pySettings$name, - pythonImport = pySettings$pythonImport, - pythonImportSecond = pySettings$pythonImportSecond, - pythonClassifier = pySettings$pythonClassifier, + pythonModule = pySettings$pythonModule, + pythonClass = pySettings$pythonClass, modelLocation = outLoc, paramSearch = param, - saveToJson = pySettings$saveToJson - ) + saveToJson = attr(param, 'saveToJson') ) + ) + + hyperSummary <- + do.call(rbind, + lapply(cvResult$paramGridSearch, function(x) + x$hyperSummary)) - hyperSummary <- do.call(rbind, 
lapply(cvResult$paramGridSearch, function(x) x$hyperSummary)) - prediction <- cvResult$prediction - + variableImportance <- cvResult$variableImportance variableImportance[is.na(variableImportance)] <- 0 incs <- rep(1, nrow(covariateRef)) covariateRef$included <- incs - covariateRef$covariateValue <- unlist(variableImportance) # check this is correct order + covariateRef$covariateValue <- + unlist(variableImportance) # check this is correct order comp <- start - Sys.time() result <- list( model = file.path(outLoc), + preprocessing = list( + featureEngineering = attr(trainData, "metaData")$featureEngineering, + tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, + requireDenseMatrix = attr(param, 'settings')$requiresDenseMatrix + ), + prediction = prediction, - settings = list( - plpDataSettings = attr(trainData, "metaData")$plpDataSettings, + modelDesign = PatientLevelPrediction::createModelDesign( + targetId = attr(trainData, "metaData")$targetId, + outcomeId = attr(trainData, "metaData")$outcomeId, + restrictPlpDataSettings = attr(trainData, "metaData")$restrictPlpDataSettings, covariateSettings = attr(trainData, "metaData")$covariateSettings, populationSettings = attr(trainData, "metaData")$populationSettings, - featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, - tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, - requireDenseMatrix = attr(param, 'settings')$requiresDenseMatrix, - modelSettings = list( - model = pySettings$name, - param = param, - finalModelParameters = cvResult$finalParam, - extraSettings = attr(param, 'settings') - ), + featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings, + preprocessSettings = attr(trainData$covariateData, "metaData")$preprocessSettings, + modelSettings = modelSettings, splitSettings = attr(trainData, "metaData")$splitSettings, sampleSettings = attr(trainData, 
"metaData")$sampleSettings ), trainDetails = list( analysisId = analysisId, - cdmDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, - outcomeId = attr(trainData, "metaData")$outcomeId, - cohortId = attr(trainData, "metaData")$cohortId, - attrition = attr(trainData, "metaData")$attrition, - trainingTime = comp, + analysisSource = '', + #TODO add from model + developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName, + developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, + attrition = attr(trainData, "metaData")$attrition, + trainingTime = paste(as.character(abs(comp)), attr(comp, 'units')), trainingDate = Sys.Date(), + modelName = pySettings$name, + finalModelParameters = cvResult$finalParam, hyperParamSearch = hyperSummary ), @@ -125,81 +136,80 @@ fitSklearn <- function( class(result) <- "plpModel" attr(result, "predictionFunction") <- "predictPythonSklearn" attr(result, "modelType") <- "binary" - attr(result, "saveType") <- attr(param, 'saveType') + attr(result, "saveType") <- + attr(param, 'saveType') # in save/load plp + attr(result, "saveToJson") <- + attr(param, 'saveToJson') # when saving in reticulate return(result) } -predictPythonSklearn <- function( - plpModel, - data, - cohort - ){ - - if(class(data) == 'plpData'){ +predictPythonSklearn <- function(plpModel, + data, + cohort) { + if (inherits(data, 'plpData')) { # convert matrixObjects <- toSparseM( - plpData = data, + plpData = data, cohort = cohort, - map = plpModel$covariateImportance %>% - dplyr::select(.data$columnId, .data$covariateId) + map = plpModel$covariateImportance %>% + dplyr::select("columnId", "covariateId") ) newData <- matrixObjects$dataMatrix cohort <- matrixObjects$labels - }else{ + } else{ newData <- data } - - # import - os <- reticulate::import('os') # load model - if(plpModel$settings$modelSettings$extraSettings$saveToJson){ - skljson <- reticulate::import('sklearn_json') - modelLocation <- 
reticulate::r_to_py(paste0(plpModel$model,"\\model.json")) - model <- skljson$from_json(modelLocation) + if (attr(plpModel, 'saveToJson')) { + modelLocation <- + reticulate::r_to_py(file.path(plpModel$model, "model.json")) + model <- sklearnFromJson(path = modelLocation) } else{ - joblib <- reticulate::import('joblib') - modelLocation <- reticulate::r_to_py(paste0(plpModel$model,"\\model.pkl")) - model <- joblib$load(os$path$join(modelLocation)) + os <- reticulate::import('os') + joblib <- reticulate::import('joblib', convert = FALSE) + modelLocation <- + reticulate::r_to_py(file.path(plpModel$model, "model.pkl")) + model <- joblib$load(os$path$join(modelLocation)) } + included <- + plpModel$covariateImportance$columnId[plpModel$covariateImportance$included > + 0] # does this include map? + pythonData <- reticulate::r_to_py(newData[, included, drop = F]) - included <- plpModel$covariateImportance$columnId[plpModel$covariateImportance$included>0] # does this include map? - pythonData <- reticulate::r_to_py(newData[,included, drop = F]) - # make dense if needed - if(plpModel$settings$requireDenseMatrix){ + if (plpModel$preprocessing$requireDenseMatrix) { pythonData <- pythonData$toarray() } cohort <- predictValues( - model = model, - data = pythonData, - cohort = cohort, + model = model, + data = pythonData, + cohort = cohort, type = attr(plpModel, 'modelType') ) - + return(cohort) } -predictValues <- function(model, data, cohort, type = 'binary'){ - +predictValues <- function(model, data, cohort, type = 'binary') { predictionValue <- model$predict_proba(data) - cohort$value <- predictionValue[,2] + cohort$value <- reticulate::py_to_r(predictionValue)[, 2] - cohort <- cohort %>% - dplyr::select(-.data$rowId) %>% - dplyr::rename(rowId = .data$originalRowId) + cohort <- cohort %>% + dplyr::select(-"rowId") %>% + dplyr::rename(rowId = "originalRowId") attr(cohort, "metaData")$modelType <- type return(cohort) } -checkPySettings <- function(settings){ +checkPySettings <- 
function(settings) { checkIsClass(settings$seed, c('numeric', 'integer')) ParallelLogger::logDebug(paste0('classifier seed: ', settings$seed)) @@ -209,40 +219,26 @@ checkPySettings <- function(settings){ checkIsClass(settings$name, c('character')) ParallelLogger::logDebug(paste0('name: ', settings$name)) - checkIsClass(settings$saveToJson, c('logical')) - ParallelLogger::logDebug(paste0('saveToJson: ', settings$saveToJson)) - - checkIsClass(settings$pythonImport, c('character')) - ParallelLogger::logDebug(paste0('pythonImport: ', settings$pythonImport)) - - if(!is.null(settings$pythonImportSecond)){ - checkIsClass(settings$pythonImportSecond, c('character')) - ParallelLogger::logDebug(paste0('pythonImportSecond: ', settings$pythonImportSecond)) - } else{ - ParallelLogger::logDebug(paste0('pythonImportSecond: NULL')) - } + checkIsClass(settings$pythonModule, c('character')) + ParallelLogger::logDebug(paste0('pythonModule: ', settings$pythonModule)) - checkIsClass(settings$pythonClassifier, c('character')) - ParallelLogger::logDebug(paste0('pythonClassifier: ', settings$pythonClassifier)) + checkIsClass(settings$pythonClass, c('character')) + ParallelLogger::logDebug(paste0('pythonClass: ', settings$pythonClass)) } -gridCvPython <- function( - matrixData, - labels, - seed, - requiresDenseMatrix, - modelName, - pythonImport, - pythonImportSecond, - pythonClassifier, - modelLocation, - paramSearch, - saveToJson - ) - { +gridCvPython <- function(matrixData, + labels, + seed, + requiresDenseMatrix, + modelName, + pythonModule, + pythonClass, + modelLocation, + paramSearch, + saveToJson) +{ + ParallelLogger::logInfo(paste0("Running CV for ", modelName, " model")) - ParallelLogger::logInfo(paste0("Rnning CV for ",modelName," model")) - np <- reticulate::import('numpy') os <- reticulate::import('os') sys <- reticulate::import('sys') @@ -250,58 +246,70 @@ gridCvPython <- function( scipy <- reticulate::import('scipy') joblib <- reticulate::import('joblib') - firstImport <- 
reticulate::import(pythonImport) - - if(!is.null(pythonImportSecond)){ - classifier <- firstImport[[pythonImportSecond]][[pythonClassifier]] - } else{ - classifier <- firstImport[[pythonClassifier]] - } + module <- reticulate::import(pythonModule, convert = FALSE) + classifier <- module[pythonClass] ########################################################################### gridSearchPredictons <- list() length(gridSearchPredictons) <- length(paramSearch) - for(gridId in 1:length(paramSearch)){ - + for (gridId in 1:length(paramSearch)) { # initiate prediction prediction <- c() fold <- labels$index ParallelLogger::logInfo(paste0('Max fold: ', max(fold))) - for( i in 1:max(fold)){ - - ParallelLogger::logInfo(paste0('Fold ',i)) + for (i in 1:max(fold)) { + ParallelLogger::logInfo(paste0('Fold ', i)) trainY <- reticulate::r_to_py(labels$outcomeCount[fold != i]) - trainX <- reticulate::r_to_py(matrixData[fold != i,]) - testX <- reticulate::r_to_py(matrixData[fold == i,]) + trainX <- reticulate::r_to_py(matrixData[fold != i, ]) + testX <- reticulate::r_to_py(matrixData[fold == i, ]) - if(requiresDenseMatrix){ + if (requiresDenseMatrix) { ParallelLogger::logInfo('Converting sparse martix to dense matrix (CV)') trainX <- trainX$toarray() testX <- testX$toarray() } - model <- fitPythonModel(classifier, paramSearch[[gridId]], seed, trainX, trainY, np, pythonClassifier) + model <- + fitPythonModel(classifier, + paramSearch[[gridId]], + seed, + trainX, + trainY, + np, + pythonClass) ParallelLogger::logInfo("Calculating predictions on left out fold set...") - prediction <- rbind(prediction, predictValues(model = model, data = testX, cohort = labels[fold == i,], type = 'binary')) + prediction <- + rbind( + prediction, + predictValues( + model = model, + data = testX, + cohort = labels[fold == i, ], + type = 'binary' + ) + ) } - gridSearchPredictons[[gridId]] <- list( - prediction = prediction, - param = paramSearch[[gridId]] - ) + gridSearchPredictons[[gridId]] <- 
list(prediction = prediction, + param = paramSearch[[gridId]]) } # get best para (this could be modified to enable any metric instead of AUC, just need metric input in function) - paramGridSearch <- lapply(gridSearchPredictons, function(x){do.call(computeGridPerformance, x)}) # cvAUCmean, cvAUC, param + paramGridSearch <- + lapply(gridSearchPredictons, function(x) { + do.call(computeGridPerformance, x) + }) # cvAUCmean, cvAUC, param - optimalParamInd <- which.max(unlist(lapply(paramGridSearch, function(x) x$cvPerformance))) + optimalParamInd <- + which.max(unlist(lapply(paramGridSearch, function(x) + x$cvPerformance))) finalParam <- paramGridSearch[[optimalParamInd]]$param @@ -313,37 +321,53 @@ gridCvPython <- function( trainY <- reticulate::r_to_py(labels$outcomeCount) trainX <- reticulate::r_to_py(matrixData) - if(requiresDenseMatrix){ + if (requiresDenseMatrix) { ParallelLogger::logInfo('Converting sparse martix to dense matrix (final model)') trainX <- trainX$toarray() } - model <- fitPythonModel(classifier, finalParam , seed, trainX, trainY, np, pythonClassifier) + model <- + fitPythonModel(classifier, + finalParam , + seed, + trainX, + trainY, + np, + pythonClass) ParallelLogger::logInfo("Calculating predictions on all train data...") - prediction <- predictValues(model = model, data = trainX, cohort = labels, type = 'binary') + prediction <- + predictValues( + model = model, + data = trainX, + cohort = labels, + type = 'binary' + ) prediction$evaluationType <- 'Train' - prediction <- rbind( - prediction, - cvPrediction - ) + prediction <- rbind(prediction, + cvPrediction) # saving model - if(!dir.exists(file.path(modelLocation))){ + if (!dir.exists(file.path(modelLocation))) { dir.create(file.path(modelLocation), recursive = T) } - # joblib$dump(model, os$path$join(modelLocation,"model.pkl"), compress = T) - if(saveToJson){ - skljson <- reticulate::import('sklearn_json') - skljson$to_json(model = model, model_name = file.path(modelLocation,"model.json")) + 
if (saveToJson) { + sklearnToJson(model = model, + path = file.path(modelLocation, "model.json")) } else{ - joblib$dump(model, file.path(modelLocation,"model.pkl"), compress = T) + joblib$dump(model, file.path(modelLocation, "model.pkl"), compress = T) } # feature importance - variableImportance <- tryCatch({model$feature_importances_}, error = function(e){ParallelLogger::logInfo(e);return(rep(1,ncol(matrixData)))}) - + variableImportance <- + tryCatch({ + reticulate::py_to_r(model$feature_importances_) + }, error = function(e) { + ParallelLogger::logInfo(e) + return(rep(1, ncol(matrixData))) + }) + return( list( prediction = prediction, @@ -356,79 +380,114 @@ gridCvPython <- function( } -fitPythonModel <- function(classifier, param, seed, trainX, trainY, np, pythonClassifier){ - ParallelLogger::logInfo(paste0('data X dim: ', trainX$shape[0], 'x', trainX$shape[1])) - ParallelLogger::logInfo(paste0('data Y length: ', np$shape(trainY)[[1]], ' with ',np$sum(trainY), ' outcomes')) - - timeStart <- Sys.time() - - # print parameters - # convert NULL to NA values - paramString <- param - for(ind in 1:length(paramString)){ - if(is.null(paramString[[ind]])){ - paramString[[ind]] <- 'null' +fitPythonModel <- + function(classifier, + param, + seed, + trainX, + trainY, + np, + pythonClass) { + ParallelLogger::logInfo(paste0('data X dim: ', trainX$shape[0], 'x', trainX$shape[1])) + ParallelLogger::logInfo(paste0( + 'data Y length: ', + np$shape(trainY)[[1]], + ' with ', + np$sum(trainY), + ' outcomes' + )) + + timeStart <- Sys.time() + + # print parameters + # convert NULL to NA values + paramString <- param + for (ind in 1:length(paramString)) { + if (is.null(paramString[[ind]])) { + paramString[[ind]] <- 'null' + } } + ParallelLogger::logInfo(paste( + names(param), + unlist(paramString), + sep = ':', + collapse = ' ' + )) + + if (!is.null(param)) { + model <- + do.call(paste0(pythonClass, 'Inputs'), + list(classifier = classifier, param = param)) + } else{ + model <- 
classifier() + } + model <- model$fit(trainX, trainY) + timeEnd <- Sys.time() + + ParallelLogger::logInfo(paste0( + "Training model took (mins): ", + difftime(timeEnd, timeStart, units = 'mins') + )) + + return(model) } - ParallelLogger::logInfo(paste(names(param), unlist(paramString), sep = ':', collapse = ' ')) - - if(!is.null(param)){ - model <- do.call(paste0(pythonClassifier,'Inputs'), list(classifier = classifier, param = param)) - } else{ - model <- classifier() - } - model <- model$fit(trainX, trainY) - timeEnd <- Sys.time() - - ParallelLogger::logInfo(paste0("Training model took (mins): ",difftime(timeEnd, timeStart, units='mins') )) - - return(model) -} - -computeGridPerformance <- function(prediction, param, performanceFunct = 'computeAuc'){ - - performance <- do.call( - what = eval(parse(text = performanceFunct)), - args = list(prediction = prediction) - ) - - performanceFold <- c() - for(i in 1:max(prediction$index)){ - performanceFold <- c( - performanceFold, - do.call( - what = eval(parse(text = performanceFunct)), - args = list(prediction = prediction[prediction$index == i,]) +#' Computes grid performance with a specified performance function +#' +#' @param prediction a dataframe with predictions and outcomeCount per rowId +#' @param param a list of hyperparameters +#' @param performanceFunct a string specifying which performance function to use +#' . 
Default ``'compute_AUC'`` +#' @return A list with overview of the performance +#' @export +computeGridPerformance <- + function(prediction, param, performanceFunct = 'computeAuc') { + performance <- do.call(what = eval(parse(text = performanceFunct)), + args = list(prediction = prediction)) + + performanceFold <- c() + for (i in 1:max(prediction$index)) { + performanceFold <- c(performanceFold, + do.call( + what = eval(parse(text = performanceFunct)), + args = list(prediction = prediction[prediction$index == i, ]) + )) + } + + paramString <- param + for (ind in 1:length(paramString)) { + if (is.null(paramString[[ind]])) { + paramString[[ind]] <- 'null' + } + } + + #hyperSummary <- c(performanceFunct, performance, performanceFold, unlist(paramString)) + #names(hyperSummary) <- c( + # 'Metric', + # 'cvPerformance', + # paste0('cvPerformanceFold',1:length(performanceFold)), + # names(param) + #) + paramValues <- unlist(paramString) + names(paramValues) <- names(param) + + hyperSummary <- as.data.frame(c( + data.frame( + metric = performanceFunct, + fold = c("CV", as.character(1:length(performanceFold))), + value = c(performance, performanceFold) + ), + paramValues + )) + return( + list( + metric = performanceFunct, + cvPerformance = performance, + cvPerformancePerFold = performanceFold, + param = param, + hyperSummary = hyperSummary ) ) } - - paramString <- param - for(ind in 1:length(paramString)){ - if(is.null(paramString[[ind]])){ - paramString[[ind]] <- 'null' - } - } - - hyperSummary <- c(performanceFunct, performance, performanceFold, unlist(paramString)) - names(hyperSummary) <- c( - 'Metric', - 'cvPerformance', - paste0('cvPerformanceFold',1:length(performanceFold)), - names(param) - ) - - return( - list( - metric = performanceFunct, - cvPerformance = performance, - cvPerformancePerFold = performanceFold, - param = param, - hyperSummary = hyperSummary - ) - ) - -} diff --git a/R/SklearnClassifierHelpers.R b/R/SklearnClassifierHelpers.R index 
134bfb2e3..0535d833e 100644 --- a/R/SklearnClassifierHelpers.R +++ b/R/SklearnClassifierHelpers.R @@ -1,21 +1,32 @@ -listCartesian <- function(allList){ - - sizes <- lapply(allList, function(x) 1:length(x)) - combinations <- expand.grid(sizes) - - result <- list() - length(result) <- nrow(combinations) - - for(i in 1:nrow(combinations)){ - tempList <- list() - for(j in 1:ncol(combinations)){ - tempList <- c(tempList, list(allList[[j]][[combinations[i,j]]])) - } - names(tempList) <- names(allList) - result[[i]] <- tempList - } - - return(result) -} +# @file SklearnClassifierHelpers.R +# +# Copyright 2022 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#' Cartesian product +#' +#' Computes the Cartesian product of all the combinations of elements in a list +#' +#' @param allList a list of lists +#' @return A list with all possible combinations from the input list of lists +#' @export +listCartesian <- function(allList) { + combinations <- expand.grid(allList) + results <- lapply(seq_len(nrow(combinations)), + function(i) lapply(combinations, function(x) x[i][[1]])) + return(results) +} diff --git a/R/SklearnClassifierSettings.R b/R/SklearnClassifierSettings.R index 6a0934da6..1b6446b70 100644 --- a/R/SklearnClassifierSettings.R +++ b/R/SklearnClassifierSettings.R @@ -1,4 +1,4 @@ -# @file PythonClassifier.R +# @file SklearnClassifierSettings.R # # Copyright 2021 Observational Health Data Sciences and Informatics # @@ -17,54 +17,51 @@ # limitations under the License. #' Create setting for AdaBoost with python DecisionTreeClassifier base estimator -# @param baseEstimator (list) The base estimator from which the boosted ensemble is built. Support for sample weighting is required, as well as proper classes_ and n_classes_ attributes. If NULL, then the base estimator is DecisionTreeClassifier initialized with max_depth=1. #' @param nEstimators (list) The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early. #' @param learningRate (list) Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution of each classifier. There is a trade-off between the learningRate and nEstimators parameters #' There is a trade-off between learningRate and nEstimators. -#' @param algorithm (list) If ‘SAMME.R’ then use the SAMME.R real boosting algorithm. base_estimator must support calculation of class probabilities. If ‘SAMME’ then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations. 
+#' @param algorithm (list) If ‘SAMME.R’ then use the SAMME.R real boosting algorithm. base_estimator must support calculation of class probabilities. If ‘SAMME’ then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations. #' @param seed A seed for the model #' #' @examples #' \dontrun{ -#' model.adaBoost <- setAdaBoost(nEstimators = list(10,50,200), learningRate = list(1, 0.5, 0.1), +#' model.adaBoost <- setAdaBoost(nEstimators = list(10,50,200), learningRate = list(1, 0.5, 0.1), #' algorithm = list('SAMME.R'), seed = sample(1000000,1) #' ) #' } #' @export -setAdaBoost <- function( - #baseEstimator = list(NULL), - nEstimators = list(10,50, 200), - learningRate = list(1, 0.5, 0.1), - algorithm = list('SAMME.R'), - seed = sample(1000000,1) -){ - +setAdaBoost <- function(nEstimators = list(10, 50, 200), + learningRate = list(1, 0.5, 0.1), + algorithm = list('SAMME.R'), + seed = sample(1000000, 1)) { checkIsClass(seed[[1]], c("numeric", "integer")) checkIsClass(nEstimators, 'list') checkIsClass(learningRate, 'list') checkIsClass(algorithm, 'list') - #lapply(1:length(baseEstimator), function(i) checkIsClass(maxDepth[[i]] , c("NULL"))) - - lapply(1:length(nEstimators), function(i) checkIsClass(nEstimators[[i]] , c("integer", "numeric"))) - lapply(1:length(nEstimators), function(i) checkHigher(nEstimators[[i]] , 0)) + lapply(1:length(nEstimators), function(i) + checkIsClass(nEstimators[[i]] , c("integer", "numeric"))) + lapply(1:length(nEstimators), function(i) + checkHigher(nEstimators[[i]] , 0)) - for(i in 1:length(nEstimators)){ - if(class(nEstimators[[i]]) %in% c("numeric", "integer")){ + for (i in 1:length(nEstimators)) { + if (inherits(x = nEstimators[[i]], what = c("numeric", "integer"))) { nEstimators[[i]] <- as.integer(nEstimators[[i]]) } } - lapply(1:length(learningRate), function(i) checkIsClass(learningRate[[i]] , c("numeric"))) - 
lapply(1:length(learningRate), function(i) checkHigher(learningRate[[i]] , 0)) + lapply(1:length(learningRate), function(i) + checkIsClass(learningRate[[i]] , c("numeric"))) + lapply(1:length(learningRate), function(i) + checkHigher(learningRate[[i]] , 0)) - lapply(1:length(algorithm), function(i) checkIsClass(algorithm[[i]] , c("character"))) + lapply(1:length(algorithm), function(i) + checkIsClass(algorithm[[i]] , c("character"))) # test python is available and the required dependancies are there: ##checkPython() paramGrid <- list( - baseEstimator = list(NULL), nEstimators = nEstimators, learningRate = learningRate, algorithm = algorithm, @@ -74,51 +71,48 @@ setAdaBoost <- function( param <- listCartesian(paramGrid) attr(param, 'settings') <- list( + modelType = 'adaBoost', seed = seed[[1]], - paramNames = names(paramGrid), #use this for logging params + paramNames = names(paramGrid), + #use this for logging params requiresDenseMatrix = F, - saveToJson = F, name = "AdaBoost", - pythonImport = 'sklearn', - pythonImportSecond = 'ensemble', - pythonClassifier = 'AdaBoostClassifier' + pythonModule = "sklearn.ensemble", + pythonClass = "AdaBoostClassifier" ) + attr(param, 'saveToJson') <- T attr(param, 'saveType') <- 'file' - result <- list( - fitFunction = "fitSklearn", - param = param - ) + result <- list(fitFunction = "fitSklearn", + param = param) class(result) <- "modelSettings" return(result) } -AdaBoostClassifierInputs <- function(classifier, param){ - +AdaBoostClassifierInputs <- function(classifier, param) { model <- classifier( - base_estimator = param[[which.max(names(param)=='baseEstimator')]], - n_estimators = param[[which.max(names(param)=='nEstimators')]], - learning_rate = param[[which.max(names(param)=='learningRate')]], - algorithm = param[[which.max(names(param)=='algorithm')]], - random_state = param[[which.max(names(param)=='seed')]] + n_estimators = param[[which.max(names(param) == 'nEstimators')]], + learning_rate = param[[which.max(names(param) 
== 'learningRate')]], + algorithm = param[[which.max(names(param) == 'algorithm')]], + random_state = param[[which.max(names(param) == 'seed')]] ) return(model) } -#' Create setting for the scikit-learn 1.0.1 DecisionTree with python +#' Create setting for the scikit-learn 1.0.1 DecisionTree with python #' @param criterion The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. #' @param splitter The strategy used to choose the split at each node. Supported strategies are “best” to choose the best split and “random” to choose the best random split. #' @param maxDepth (list) The maximum depth of the tree. If NULL, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. #' @param minSamplesSplit The minimum number of samples required to split an internal node #' @param minSamplesLeaf The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least minSamplesLeaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. #' @param minWeightFractionLeaf The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sampleWeight is not provided. -#' @param maxFeatures (list) The number of features to consider when looking for the best split (int/'auto'/NULL) +#' @param maxFeatures (list) The number of features to consider when looking for the best split (int/'sqrt'/NULL) #' @param maxLeafNodes (list) Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. (int/NULL) -#' @param minImpurityDecrease Threshold for early stopping in tree growth. 
A node will split if its impurity is above the threshold, otherwise it is a leaf. +#' @param minImpurityDecrease Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. #' @param classWeight (list) Weights associated with classes 'balance' or NULL #' @param seed The random state seed #' @@ -127,21 +121,20 @@ AdaBoostClassifierInputs <- function(classifier, param){ #' model.decisionTree <- setDecisionTree(maxDepth=10,minSamplesLeaf=10, seed=NULL ) #' } #' @export -setDecisionTree <- function( - criterion = list('gini'), - splitter = list('best'), - maxDepth = list(as.integer(4), as.integer(10), NULL), - minSamplesSplit = list(2, 10), - minSamplesLeaf = list(10, 50), - minWeightFractionLeaf = list(0), - maxFeatures = list(100,'auto', NULL), - maxLeafNodes = list(NULL), - minImpurityDecrease = list(10^-7), - classWeight = list(NULL, 'balanced'), - seed = sample(1000000,1) - ){ - if(!class(seed[[1]])%in%c('numeric', 'integer')) +setDecisionTree <- function(criterion = list('gini'), + splitter = list('best'), + maxDepth = list(as.integer(4), as.integer(10), NULL), + minSamplesSplit = list(2, 10), + minSamplesLeaf = list(10, 50), + minWeightFractionLeaf = list(0), + maxFeatures = list(100, 'sqrt', NULL), + maxLeafNodes = list(NULL), + minImpurityDecrease = list(10 ^ -7), + classWeight = list(NULL), + seed = sample(1000000, 1)) { + if (!inherits(x = seed[[1]], what = c('numeric', 'integer'))) { stop('Invalid seed') + } checkIsClass(criterion, 'list') checkIsClass(splitter, 'list') @@ -154,78 +147,115 @@ setDecisionTree <- function( checkIsClass(minImpurityDecrease, 'list') checkIsClass(classWeight, 'list') - lapply(1:length(criterion), function(i) checkIsClass(criterion[[i]] , 'character')) - lapply(1:length(splitter), function(i) checkIsClass(splitter[[i]] , 'character')) + lapply(1:length(criterion), function(i) + checkIsClass(criterion[[i]] , 'character')) + lapply(1:length(splitter), function(i) + 
checkIsClass(splitter[[i]] , 'character')) - lapply(1:length(criterion), function(i) {if(!criterion[[i]] %in% c('gini', 'entropy')){stop('Incorrect criterion')}}) - + lapply(1:length(criterion), + function(i) { + if (!criterion[[i]] %in% c('gini', 'entropy')) { + stop('Incorrect criterion') + } + }) - lapply(1:length(maxDepth), function(i) checkIsClass(maxDepth[[i]] , c("numeric","integer","NULL"))) - lapply(1:length(maxDepth), function(i) checkHigher(ifelse(is.null(maxDepth[[i]]),1,maxDepth[[i]]) , 0)) - for(i in 1:length(maxDepth)){ - if(class(maxDepth[[i]]) %in% c("numeric", "integer")){ + + lapply(1:length(maxDepth), function(i) + checkIsClass(maxDepth[[i]] , c("numeric", "integer", "NULL"))) + lapply(1:length(maxDepth), function(i) + checkHigher(ifelse(is.null(maxDepth[[i]]), 1, maxDepth[[i]]) , 0)) + for (i in 1:length(maxDepth)) { + if (inherits(x = maxDepth[[i]], what = c("numeric", "integer"))) { maxDepth[[i]] <- as.integer(maxDepth[[i]]) } } - lapply(1:length(minSamplesSplit), function(i) checkIsClass(minSamplesSplit[[i]] , c("numeric", "integer","NULL"))) - lapply(1:length(minSamplesSplit), function(i) checkHigher(ifelse(is.null(minSamplesSplit[[i]]),1, minSamplesSplit[[i]]) , 0)) + lapply(1:length(minSamplesSplit), + function(i) + checkIsClass(minSamplesSplit[[i]] , c("numeric", "integer", "NULL"))) + lapply(1:length(minSamplesSplit), + function(i) + checkHigher(ifelse( + is.null(minSamplesSplit[[i]]), 1, minSamplesSplit[[i]] + ) , 0)) # convert to integer if >= 1 - for(i in 1:length(minSamplesSplit)){ - if(minSamplesSplit[[i]] >= 1){ + for (i in 1:length(minSamplesSplit)) { + if (minSamplesSplit[[i]] >= 1) { minSamplesSplit[[i]] <- as.integer(minSamplesSplit[[i]]) } } - - lapply(1:length(minSamplesLeaf), function(i) checkIsClass(minSamplesLeaf[[i]] , c("numeric", "integer"))) - lapply(1:length(minSamplesLeaf), function(i) checkHigher(minSamplesLeaf[[i]] , 0)) + + lapply(1:length(minSamplesLeaf), + function(i) + checkIsClass(minSamplesLeaf[[i]] , 
c("numeric", "integer"))) + lapply(1:length(minSamplesLeaf), + function(i) + checkHigher(minSamplesLeaf[[i]] , 0)) # convert to integer if >= 1 - for(i in 1:length(minSamplesLeaf)){ - if(minSamplesLeaf[[i]] >= 1){ + for (i in 1:length(minSamplesLeaf)) { + if (minSamplesLeaf[[i]] >= 1) { minSamplesLeaf[[i]] <- as.integer(minSamplesLeaf[[i]]) } } - - lapply(1:length(minWeightFractionLeaf), function(i) checkIsClass(minWeightFractionLeaf[[i]] , c("numeric"))) - lapply(1:length(minWeightFractionLeaf), function(i) checkHigherEqual(minWeightFractionLeaf[[i]] , 0)) - - lapply(1:length(maxFeatures), function(i) checkIsClass(maxFeatures[[i]] , c("numeric", "integer","character","NULL"))) - - for(i in 1:length(maxFeatures)){ - if(class(maxFeatures[[i]]) %in% c("numeric", "integer")){ + lapply(1:length(minWeightFractionLeaf), + function(i) + checkIsClass(minWeightFractionLeaf[[i]] , c("numeric"))) + lapply(1:length(minWeightFractionLeaf), + function(i) + checkHigherEqual(minWeightFractionLeaf[[i]] , 0)) + + lapply(1:length(maxFeatures), + function(i) + checkIsClass(maxFeatures[[i]] , c( + "numeric", "integer", "character", "NULL" + ))) + + for (i in 1:length(maxFeatures)) { + if (inherits(x = maxFeatures[[i]], what = c("numeric", "integer"))) { maxFeatures[[i]] <- as.integer(maxFeatures[[i]]) } } - lapply(1:length(maxLeafNodes), function(i) checkIsClass(maxLeafNodes[[i]], c("integer","NULL"))) - lapply(1:length(maxLeafNodes), function(i) checkHigher(ifelse(is.null(maxLeafNodes[[i]]),1,maxLeafNodes[[i]]) , 0)) - - for(i in 1:length(maxLeafNodes)){ - if(class(maxLeafNodes[[i]]) %in% c("numeric", "integer")){ + lapply(1:length(maxLeafNodes), + function(i) + checkIsClass(maxLeafNodes[[i]], c("integer", "NULL"))) + lapply(1:length(maxLeafNodes), + function(i) + checkHigher(ifelse( + is.null(maxLeafNodes[[i]]), 1, maxLeafNodes[[i]] + ) , 0)) + + for (i in 1:length(maxLeafNodes)) { + if (inherits(x = maxLeafNodes[[i]], what = c("numeric", "integer"))) { maxLeafNodes[[i]] <- 
as.integer(maxLeafNodes[[i]]) } } - lapply(1:length(minImpurityDecrease), function(i) checkIsClass(minImpurityDecrease[[i]] , c("numeric"))) - lapply(1:length(minImpurityDecrease), function(i) checkHigherEqual(minImpurityDecrease[[i]], 0)) - - lapply(1:length(classWeight), function(i) checkIsClass(classWeight[[i]] , c('character', 'NULL'))) - + lapply(1:length(minImpurityDecrease), + function(i) + checkIsClass(minImpurityDecrease[[i]] , c("numeric"))) + lapply(1:length(minImpurityDecrease), + function(i) + checkHigherEqual(minImpurityDecrease[[i]], 0)) + + lapply(1:length(classWeight), function(i) + checkIsClass(classWeight[[i]] , c('character', 'NULL'))) + # test python is available and the required dependancies are there: ##checkPython() # scikit-learn 1.0.1 inputs: - # criterion='gini', splitter='best', max_depth=None, min_samples_split=2, - # min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, + # criterion='gini', splitter='best', max_depth=None, min_samples_split=2, + # min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, # max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0 # must be correct order for python classifier as I can't find a way to do.call a named list - # using reticulate + # using reticulate paramGrid <- list( criterion = criterion, splitter = splitter, @@ -240,66 +270,64 @@ setDecisionTree <- function( classWeight = classWeight ) param <- listCartesian(paramGrid) - + attr(param, 'settings') <- list( + modelType = 'decisionTree', seed = seed[[1]], - paramNames = names(paramGrid), #use this for logging params + paramNames = names(paramGrid), + #use this for logging params requiresDenseMatrix = F, - saveToJson = T, name = "Decision Tree", - pythonImport = 'sklearn', - pythonImportSecond = 'tree', - pythonClassifier = 'DecisionTreeClassifier' + pythonModule = "sklearn.tree", + pythonClass = "DecisionTreeClassifier" ) + attr(param, 'saveToJson') <- T 
attr(param, 'saveType') <- 'file' - result <- list( - fitFunction = "fitSklearn", - param = param - ) + result <- list(fitFunction = "fitSklearn", + param = param) class(result) <- "modelSettings" return(result) } -DecisionTreeClassifierInputs <- function(classifier, param){ - -model <- classifier( - criterion = param[[which.max(names(param)=='criterion')]], - splitter = param[[which.max(names(param)=='splitter')]], - max_depth = param[[which.max(names(param)=='maxDepth')]], - min_samples_split = param[[which.max(names(param)=='minSamplesSplit')]], - min_samples_leaf = param[[which.max(names(param)=='minSamplesLeaf')]], - min_weight_fraction_leaf = param[[which.max(names(param)=='minWeightFractionLeaf')]], - max_features = param[[which.max(names(param)=='maxFeatures')]], - random_state = param[[which.max(names(param)=='seed')]], - max_leaf_nodes = param[[which.max(names(param)=='maxLeafNodes')]], - min_impurity_decrease = param[[which.max(names(param)=='minImpurityDecrease')]], - class_weight = param[[which.max(names(param)=='classWeight')]] -) - -return(model) +DecisionTreeClassifierInputs <- function(classifier, param) { + model <- classifier( + criterion = param[[which.max(names(param) == 'criterion')]], + splitter = param[[which.max(names(param) == 'splitter')]], + max_depth = param[[which.max(names(param) == 'maxDepth')]], + min_samples_split = param[[which.max(names(param) == 'minSamplesSplit')]], + min_samples_leaf = param[[which.max(names(param) == 'minSamplesLeaf')]], + min_weight_fraction_leaf = param[[which.max(names(param) == 'minWeightFractionLeaf')]], + max_features = param[[which.max(names(param) == 'maxFeatures')]], + random_state = param[[which.max(names(param) == 'seed')]], + max_leaf_nodes = param[[which.max(names(param) == 'maxLeafNodes')]], + min_impurity_decrease = param[[which.max(names(param) == 'minImpurityDecrease')]], + class_weight = param[[which.max(names(param) == 'classWeight')]] + ) + + return(model) } -#' Create setting for neural 
network model with python -#' +#' Create setting for neural network model with python +#' #' @param hiddenLayerSizes (list of vectors) The ith element represents the number of neurons in the ith hidden layer. #' @param activation (list) Activation function for the hidden layer. #' \itemize{ -#' \item{"identity": no-op activation, useful to implement linear bottleneck, returns f(x) = x} -#' \item{"logistic": the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)).} -#' \item{"tanh": the hyperbolic tan function, returns f(x) = tanh(x).} -#' \item{"relu": the rectified linear unit function, returns f(x) = max(0, x)} +#' \item "identity": no-op activation, useful to implement linear bottleneck, returns f(x) = x +#' \item "logistic": the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)). +#' \item "tanh": the hyperbolic tan function, returns f(x) = tanh(x). +#' \item "relu": the rectified linear unit function, returns f(x) = max(0, x) #' } #' @param solver (list) The solver for weight optimization. (‘lbfgs’, ‘sgd’, ‘adam’) #' @param alpha (list) L2 penalty (regularization term) parameter. #' @param batchSize (list) Size of minibatches for stochastic optimizers. If the solver is ‘lbfgs’, the classifier will not use minibatch. When set to “auto”, batchSize=min(200, n_samples). -#' @param learningRate (list) Only used when solver='sgd' Learning rate schedule for weight updates.{‘constant’, ‘invscaling’, ‘adaptive’}, default=’constant’ +#' @param learningRate (list) Only used when solver='sgd' Learning rate schedule for weight updates. ‘constant’, ‘invscaling’, ‘adaptive’, default=’constant’ #' @param learningRateInit (list) Only used when solver=’sgd’ or ‘adam’. The initial learning rate used. It controls the step-size in updating the weights. -#' @param powerT (list) Only used when solver=’sgd’. The exponent for inverse scaling learning rate. It is used in updating effective learning rate when the learning_rate is set to ‘invscaling’. 
+#' @param powerT (list) Only used when solver=’sgd’. The exponent for inverse scaling learning rate. It is used in updating effective learning rate when the learning_rate is set to ‘invscaling’. #' @param maxIter (list) Maximum number of iterations. The solver iterates until convergence (determined by ‘tol’) or this number of iterations. For stochastic solvers (‘sgd’, ‘adam’), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps. #' @param shuffle (list) boolean: Whether to shuffle samples in each iteration. Only used when solver=’sgd’ or ‘adam’. #' @param tol (list) Tolerance for the optimization. When the loss or score is not improving by at least tol for nIterNoChange consecutive iterations, unless learning_rate is set to ‘adaptive’, convergence is considered to be reached and training stops. @@ -312,40 +340,38 @@ return(model) #' @param beta2 (list) Exponential decay rate for estimates of second moment vector in adam, should be in 0 to 1. #' @param epsilon (list) Value for numerical stability in adam. #' @param nIterNoChange (list) Maximum number of epochs to not meet tol improvement. Only effective when solver=’sgd’ or ‘adam’. 
-#' @param seed A seed for the model +#' @param seed A seed for the model #' #' @examples #' \dontrun{ #' model.mlp <- setMLP() #' } #' @export -setMLP <- function( - hiddenLayerSizes = list(c(100), c(20,4)), #must be integers - activation = list('relu'), - solver = list('adam'), - alpha = list(0.3,0.01,0.0001,0.000001), - batchSize = list('auto'), - learningRate = list('constant'), - learningRateInit = list(0.001), - powerT = list(0.5), - maxIter = list(200, 100), - shuffle = list(TRUE), - tol = list(0.0001), - warmStart = list(TRUE), - momentum = list(0.9), - nesterovsMomentum = list(TRUE), - earlyStopping = list(FALSE), - validationFraction = list(0.1), - beta1 = list(0.9), - beta2 = list(0.999), - epsilon = list(1,0.1,0.00000001), - nIterNoChange = list(10), - seed = sample(100000,1) - ){ - - checkIsClass(seed, c('numeric','integer')) +setMLP <- function(hiddenLayerSizes = list(c(100), c(20)), + #must be integers + activation = list('relu'), + solver = list('adam'), + alpha = list(0.3, 0.01, 0.0001, 0.000001), + batchSize = list('auto'), + learningRate = list('constant'), + learningRateInit = list(0.001), + powerT = list(0.5), + maxIter = list(200, 100), + shuffle = list(TRUE), + tol = list(0.0001), + warmStart = list(TRUE), + momentum = list(0.9), + nesterovsMomentum = list(TRUE), + earlyStopping = list(FALSE), + validationFraction = list(0.1), + beta1 = list(0.9), + beta2 = list(0.999), + epsilon = list(0.00000001), + nIterNoChange = list(10), + seed = sample(100000, 1)) { + checkIsClass(seed, c('numeric', 'integer')) checkIsClass(hiddenLayerSizes, c('list')) - checkIsClass(activation, c('list')) + checkIsClass(activation, c('list')) checkIsClass(solver, c('list')) checkIsClass(alpha, c('list')) checkIsClass(batchSize, c('list')) @@ -353,7 +379,7 @@ setMLP <- function( checkIsClass(learningRateInit, c('list')) checkIsClass(powerT, c('list')) checkIsClass(maxIter, c('list')) - checkIsClass(shuffle, c('list')) + checkIsClass(shuffle, c('list')) 
checkIsClass(tol, c('list')) checkIsClass(warmStart, c('list')) checkIsClass(momentum, c('list')) @@ -366,40 +392,40 @@ setMLP <- function( checkIsClass(nIterNoChange, c('list')) - for(i in 1:length(hiddenLayerSizes)){ - hiddenLayerSizes[[i]] <- as.integer(hiddenLayerSizes[[i]]) + for (i in 1:length(hiddenLayerSizes)) { + hiddenLayerSizes[[i]] <- as.integer(hiddenLayerSizes[[i]]) } - for(i in 1:length(batchSize)){ - if(class(batchSize[[i]]) %in% c("numeric", "integer")){ + for (i in 1:length(batchSize)) { + if (inherits(x = batchSize[[i]], what = c("numeric", "integer"))) { batchSize[[i]] <- as.integer(batchSize[[i]]) } } - for(i in 1:length(maxIter)){ - if(class(maxIter[[i]]) %in% c("numeric", "integer")){ + for (i in 1:length(maxIter)) { + if (inherits(x = maxIter[[i]], what = c("numeric", "integer"))) { maxIter[[i]] <- as.integer(maxIter[[i]]) } } - for(i in 1:length(nIterNoChange)){ - if(class(nIterNoChange[[i]]) %in% c("numeric", "integer")){ + for (i in 1:length(nIterNoChange)) { + if (inherits(x = nIterNoChange[[i]], what = c("numeric", "integer"))) { nIterNoChange[[i]] <- as.integer(nIterNoChange[[i]]) } } # add lapply for values... 
paramGrid <- list( - hiddenLayerSizes = hiddenLayerSizes, + hiddenLayerSizes = hiddenLayerSizes, activation = activation, solver = solver, - alpha = alpha, + alpha = alpha, batchSize = batchSize, learningRate = learningRate, learningRateInit = learningRateInit, powerT = powerT, - maxIter = maxIter, + maxIter = maxIter, shuffle = shuffle, seed = list(as.integer(seed[[1]])), tol = tol, @@ -409,61 +435,59 @@ setMLP <- function( nesterovsMomentum = nesterovsMomentum, earlyStopping = earlyStopping, validationFraction = validationFraction, - beta1 = beta1, - beta2 = beta2 , - epsilon = epsilon, + beta1 = beta1, + beta2 = beta2 , + epsilon = epsilon, nIterNoChange = nIterNoChange ) - + param <- listCartesian(paramGrid) attr(param, 'settings') <- list( + modelType = 'mlp', seed = seed[[1]], - paramNames = names(paramGrid), #use this for logging params + paramNames = names(paramGrid), + #use this for logging params requiresDenseMatrix = F, - saveToJson = F, # current bug in sklearn-json name = "Neural Network", - pythonImport = 'sklearn', - pythonImportSecond = 'neural_network', - pythonClassifier = 'MLPClassifier' + pythonModule = "sklearn.neural_network", + pythonClass = "MLPClassifier" ) + attr(param, 'saveToJson') <- T attr(param, 'saveType') <- 'file' - result <- list( - fitFunction = "fitSklearn", - param = param - ) + result <- list(fitFunction = "fitSklearn", + param = param) class(result) <- "modelSettings" return(result) } -MLPClassifierInputs <- function(classifier, param){ - +MLPClassifierInputs <- function(classifier, param) { model <- classifier( - hidden_layer_sizes = param[[which.max(names(param)=='hiddenLayerSizes')]], - activation = param[[which.max(names(param)=='activation')]], - solver = param[[which.max(names(param)=='solver')]], - alpha = param[[which.max(names(param)=='alpha')]], - batch_size = param[[which.max(names(param)=='batchSize')]], - learning_rate = param[[which.max(names(param)=='learningRate')]], - learning_rate_init = 
param[[which.max(names(param)=='learningRateInit')]], - power_t = param[[which.max(names(param)=='powerT')]], - max_iter = param[[which.max(names(param)=='maxIter')]], - shuffle = param[[which.max(names(param)=='shuffle')]], - random_state = param[[which.max(names(param)=='seed')]], - tol = param[[which.max(names(param)=='tol')]], + hidden_layer_sizes = param[[which.max(names(param) == 'hiddenLayerSizes')]], + activation = param[[which.max(names(param) == 'activation')]], + solver = param[[which.max(names(param) == 'solver')]], + alpha = param[[which.max(names(param) == 'alpha')]], + batch_size = param[[which.max(names(param) == 'batchSize')]], + learning_rate = param[[which.max(names(param) == 'learningRate')]], + learning_rate_init = param[[which.max(names(param) == 'learningRateInit')]], + power_t = param[[which.max(names(param) == 'powerT')]], + max_iter = param[[which.max(names(param) == 'maxIter')]], + shuffle = param[[which.max(names(param) == 'shuffle')]], + random_state = param[[which.max(names(param) == 'seed')]], + tol = param[[which.max(names(param) == 'tol')]], verbose = F, - warm_start = param[[which.max(names(param)=='warmStart')]], - momentum = param[[which.max(names(param)=='momentum')]], - nesterovs_momentum = param[[which.max(names(param)=='nesterovsMomentum')]], - early_stopping = param[[which.max(names(param)=='earlyStopping')]], - validation_fraction = param[[which.max(names(param)=='validationFraction')]], - beta_1 = param[[which.max(names(param)=='beta1')]], - beta_2 = param[[which.max(names(param)=='beta2')]], - epsilon = param[[which.max(names(param)=='epsilon')]], - n_iter_no_change = param[[which.max(names(param)=='nIterNoChange')]] + warm_start = param[[which.max(names(param) == 'warmStart')]], + momentum = param[[which.max(names(param) == 'momentum')]], + nesterovs_momentum = param[[which.max(names(param) == 'nesterovsMomentum')]], + early_stopping = param[[which.max(names(param) == 'earlyStopping')]], + validation_fraction = 
param[[which.max(names(param) == 'validationFraction')]], + beta_1 = param[[which.max(names(param) == 'beta1')]], + beta_2 = param[[which.max(names(param) == 'beta2')]], + epsilon = param[[which.max(names(param) == 'epsilon')]], + n_iter_no_change = param[[which.max(names(param) == 'nIterNoChange')]] ) return(model) @@ -471,44 +495,41 @@ MLPClassifierInputs <- function(classifier, param){ -#' Create setting for naive bayes model with python +#' Create setting for naive bayes model with python #' #' @examples #' \dontrun{ #' model.nb <- setNaiveBayes() #' } #' @export -setNaiveBayes <- function(){ - +setNaiveBayes <- function() { # test python is available and the required dependancies are there: ##checkPython() param <- list(none = 'true') attr(param, 'settings') <- list( + modelType = 'naiveBayes', seed = as.integer(0), - paramNames = c(), #use this for logging params + paramNames = c(), + #use this for logging params requiresDenseMatrix = T, - saveToJson = T, name = "Naive Bayes", - pythonImport = 'sklearn', - pythonImportSecond = 'naive_bayes', - pythonClassifier = 'GaussianNB' + pythonModule = "sklearn.naive_bayes", + pythonClass = "GaussianNB" ) + attr(param, 'saveToJson') <- T attr(param, 'saveType') <- 'file' - result <- list( - fitFunction = "fitSklearn", - param = param - ) + result <- list(fitFunction = "fitSklearn", + param = param) class(result) <- "modelSettings" return(result) } -GaussianNBInputs <- function(classifier, param){ - +GaussianNBInputs <- function(classifier, param) { model <- classifier() return(model) @@ -517,57 +538,53 @@ GaussianNBInputs <- function(classifier, param){ #' Create setting for random forest model with python (very fast) #' -#' @param ntrees (list) The number of trees to build +#' @param ntrees (list) The number of trees to build #' @param criterion (list) The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. 
Note: this parameter is tree-specific. #' @param maxDepth (list) The maximum depth of the tree. If NULL, then nodes are expanded until all leaves are pure or until all leaves contain less than minSamplesSplit samples. #' @param minSamplesSplit (list) The minimum number of samples required to split an internal node #' @param minSamplesLeaf (list) The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least minSamplesLeaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. #' @param minWeightFractionLeaf (list) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sampleWeight is not provided. -#' @param mtries (list) The number of features to consider when looking for the best split: +#' @param mtries (list) The number of features to consider when looking for the best split: #' \itemize{ -#' \item{int}{then consider max_features features at each split.} -#' \item{float}{then max_features is a fraction and round(max_features * n_features) features are considered at each split} -#' \item{'auto'}{then max_features=sqrt(n_features)} -#' \item{'sqrt'}{then max_features=sqrt(n_features) (same as “auto”)} -#' \item{'log2'}{then max_features=log2(n_features).} -#' \item{NULL}{then max_features=n_features} +#' \item int then consider max_features features at each split. +#' \item float then max_features is a fraction and round(max_features * n_features) features are considered at each split +#' \item 'sqrt' then max_features=sqrt(n_features) +#' \item 'log2' then max_features=log2(n_features) +#' \item NULL then max_features=n_features #' } #' @param maxLeafNodes (list) Grow trees with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. 
#' @param minImpurityDecrease (list) A node will be split if this split induces a decrease of the impurity greater than or equal to this value. #' @param bootstrap (list) Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree. #' @param maxSamples (list) If bootstrap is True, the number of samples to draw from X to train each base estimator. #' @param oobScore (list) Whether to use out-of-bag samples to estimate the generalization score. Only available if bootstrap=True. -#' @param classWeight (list) Weights associated with classes. If not given, all classes are supposed to have weight one. {NULL, “balanced”, “balanced_subsample”} +#' @param classWeight (list) Weights associated with classes. If not given, all classes are supposed to have weight one. NULL, “balanced”, “balanced_subsample” #' @param nJobs The number of jobs to run in parallel. #' @param seed A seed when training the final model #' #' @examples #' \dontrun{ -#' model.rf <- setRandomForest(mtries=list('auto',5,20), ntrees=c(10,100), +#' model.rf <- setRandomForest(mtries=list('auto',5,20), ntrees=c(10,100), #' maxDepth=c(5,20)) -#' } +#' } #' @export -setRandomForest <- function( - ntrees = list(100,500), - criterion = list('gini'), - maxDepth = list(4,10,17), - minSamplesSplit = list(2,5), - minSamplesLeaf = list(1,10), - minWeightFractionLeaf = list(0), - mtries = list('auto', 'log2'), - maxLeafNodes = list(NULL), - minImpurityDecrease = list(0), - bootstrap = list(TRUE), - maxSamples = list(NULL, 0.9), - oobScore = list(FALSE), - nJobs = list(NULL), - classWeight = list('balanced_subsample', NULL), - seed = sample(100000,1) - ){ - - checkIsClass(seed, c('numeric','integer')) +setRandomForest <- function(ntrees = list(100, 500), + criterion = list('gini'), + maxDepth = list(4, 10, 17), + minSamplesSplit = list(2, 5), + minSamplesLeaf = list(1, 10), + minWeightFractionLeaf = list(0), + mtries = list('sqrt', 'log2'), + maxLeafNodes = list(NULL), 
+ minImpurityDecrease = list(0), + bootstrap = list(TRUE), + maxSamples = list(NULL, 0.9), + oobScore = list(FALSE), + nJobs = list(NULL), + classWeight = list(NULL), + seed = sample(100000, 1)) { + checkIsClass(seed, c('numeric', 'integer')) checkIsClass(ntrees, c('list')) - checkIsClass(criterion, c('list')) + checkIsClass(criterion, c('list')) checkIsClass(maxDepth, c('list')) checkIsClass(minSamplesSplit, c('list')) checkIsClass(minSamplesLeaf, c('list')) @@ -575,51 +592,51 @@ setRandomForest <- function( checkIsClass(mtries, c('list')) checkIsClass(maxLeafNodes, c('list')) checkIsClass(minImpurityDecrease, c('list')) - checkIsClass(bootstrap, c('list')) + checkIsClass(bootstrap, c('list')) checkIsClass(maxSamples, c('list')) checkIsClass(oobScore, c('list')) checkIsClass(nJobs, c('list')) checkIsClass(classWeight, c('list')) # convert to integer when needed - for(i in 1:length(ntrees)){ - if(class(ntrees[[i]]) %in% c("numeric", "integer")){ + for (i in 1:length(ntrees)) { + if (inherits(x = ntrees[[i]], what = c("numeric", "integer"))) { ntrees[[i]] <- as.integer(ntrees[[i]]) } } - for(i in 1:length(maxDepth)){ - if(class(maxDepth[[i]]) %in% c("numeric", "integer")){ + for (i in 1:length(maxDepth)) { + if (inherits(x = maxDepth[[i]], what = c("numeric", "integer"))) { maxDepth[[i]] <- as.integer(maxDepth[[i]]) } } - for(i in 1:length(minSamplesSplit)){ - if(minSamplesSplit[[i]]>=1){ + for (i in 1:length(minSamplesSplit)) { + if (minSamplesSplit[[i]] >= 1) { minSamplesSplit[[i]] <- as.integer(minSamplesSplit[[i]]) } } - for(i in 1:length(minSamplesLeaf)){ - if(minSamplesLeaf[[i]]>=1){ + for (i in 1:length(minSamplesLeaf)) { + if (minSamplesLeaf[[i]] >= 1) { minSamplesLeaf[[i]] <- as.integer(minSamplesLeaf[[i]]) } } - for(i in 1:length(maxLeafNodes)){ - if(class(maxLeafNodes[[i]]) %in% c("numeric", "integer")){ + for (i in 1:length(maxLeafNodes)) { + if (inherits(x = maxLeafNodes[[i]], what = c("numeric", "integer"))) { maxLeafNodes[[i]] <- 
as.integer(maxLeafNodes[[i]]) } } - for(i in 1:length(nJobs)){ - if(class(nJobs[[i]]) %in% c("numeric", "integer")){ + for (i in 1:length(nJobs)) { + if (inherits(x = nJobs[[i]], what = c("numeric", "integer"))) { nJobs[[i]] <- as.integer(nJobs[[i]]) } } - for(i in 1:length(maxSamples)){ - if(class(maxSamples[[i]]) %in% c("numeric", "integer")){ - if(maxSamples[[i]] >= 1){ + for (i in 1:length(maxSamples)) { + if (inherits(x = maxSamples[[i]], what = c("numeric", "integer"))) { + if (maxSamples[[i]] >= 1) { maxSamples[[i]] <- as.integer(maxSamples[[i]]) } } @@ -646,48 +663,46 @@ setRandomForest <- function( param <- listCartesian(paramGrid) attr(param, 'settings') <- list( + modelType = 'randomForest', seed = seed[[1]], - paramNames = names(paramGrid), #use this for logging params + paramNames = names(paramGrid), + #use this for logging params requiresDenseMatrix = F, - saveToJson = T, name = "Random forest", - pythonImport = 'sklearn', - pythonImportSecond = 'ensemble', - pythonClassifier = 'RandomForestClassifier' - ) + pythonModule = "sklearn.ensemble", + pythonClass = "RandomForestClassifier" + ) + attr(param, 'saveToJson') <- T attr(param, 'saveType') <- 'file' - result <- list( - fitFunction = "fitSklearn", - param = param - ) + result <- list(fitFunction = "fitSklearn", + param = param) class(result) <- "modelSettings" return(result) } -RandomForestClassifierInputs <- function(classifier, param){ - +RandomForestClassifierInputs <- function(classifier, param) { model <- classifier( - n_estimators = param[[which.max(names(param)=='ntrees')]], - criterion = param[[which.max(names(param)=='criterion')]], - max_depth = param[[which.max(names(param)=='maxDepth')]], - min_samples_split = param[[which.max(names(param)=='minSamplesSplit')]], - min_samples_leaf = param[[which.max(names(param)=='minSamplesLeaf')]], - min_weight_fraction_leaf = param[[which.max(names(param)=='minWeightFractionLeaf')]], - max_features = param[[which.max(names(param)=='mtries')]], - 
max_leaf_nodes = param[[which.max(names(param)=='maxLeafNodes')]], - min_impurity_decrease = param[[which.max(names(param)=='minImpurityDecrease')]], - bootstrap = param[[which.max(names(param)=='bootstrap')]], - max_samples = param[[which.max(names(param)=='maxSamples')]], - oob_score = param[[which.max(names(param)=='oobScore')]], - n_jobs = param[[which.max(names(param)=='nJobs')]], - random_state = param[[which.max(names(param)=='seed')]], - verbose = 0, + n_estimators = param[[which.max(names(param) == 'ntrees')]], + criterion = param[[which.max(names(param) == 'criterion')]], + max_depth = param[[which.max(names(param) == 'maxDepth')]], + min_samples_split = param[[which.max(names(param) == 'minSamplesSplit')]], + min_samples_leaf = param[[which.max(names(param) == 'minSamplesLeaf')]], + min_weight_fraction_leaf = param[[which.max(names(param) == 'minWeightFractionLeaf')]], + max_features = param[[which.max(names(param) == 'mtries')]], + max_leaf_nodes = param[[which.max(names(param) == 'maxLeafNodes')]], + min_impurity_decrease = param[[which.max(names(param) == 'minImpurityDecrease')]], + bootstrap = param[[which.max(names(param) == 'bootstrap')]], + max_samples = param[[which.max(names(param) == 'maxSamples')]], + oob_score = param[[which.max(names(param) == 'oobScore')]], + n_jobs = param[[which.max(names(param) == 'nJobs')]], + random_state = param[[which.max(names(param) == 'seed')]], + verbose = 0L, warm_start = F, - class_weight = param[[which.max(names(param)=='classWeight')]] + class_weight = param[[which.max(names(param) == 'classWeight')]] ) return(model) @@ -696,10 +711,10 @@ RandomForestClassifierInputs <- function(classifier, param){ #' Create setting for the python sklearn SVM (SVC function) -#' @param C (list) Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. +#' @param C (list) Regularization parameter. 
The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. #' @param kernel (list) Specifies the kernel type to be used in the algorithm. one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’. If none is given ‘rbf’ will be used. #' @param degree (list) degree of kernel function is significant only in poly, rbf, sigmoid -#' @param gamma (list) kernel coefficient for rbf and poly, by default 1/n_features will be taken. {‘scale’, ‘auto’} or float, default=’scale’ +#' @param gamma (list) kernel coefficient for rbf and poly, by default 1/n_features will be taken. ‘scale’, ‘auto’ or float, default=’scale’ #' @param coef0 (list) independent term in kernel function. It is only significant in poly/sigmoid. #' @param shrinking (list) whether to use the shrinking heuristic. #' @param tol (list) Tolerance for stopping criterion. @@ -712,24 +727,20 @@ RandomForestClassifierInputs <- function(classifier, param){ #' model.svm <- setSVM(kernel='rbf', seed = NULL) #' } #' @export -setSVM <- function( - C = list(1,0.9,2,0.1), - kernel = list('rbf'), - degree = list(1,3,5), - gamma = list('scale', 1e-04, 3e-05, 0.001, 0.01, 0.25), - coef0 = list(0.0), - shrinking = list(TRUE), - tol = list(0.001), - classWeight = list('balanced', NULL), - cacheSize = 500, - seed = sample(100000,1) - ){ - - - checkIsClass(seed, c('numeric','integer')) - checkIsClass(cacheSize, c('numeric','integer')) +setSVM <- function(C = list(1, 0.9, 2, 0.1), + kernel = list('rbf'), + degree = list(1, 3, 5), + gamma = list('scale', 1e-04, 3e-05, 0.001, 0.01, 0.25), + coef0 = list(0.0), + shrinking = list(TRUE), + tol = list(0.001), + classWeight = list(NULL), + cacheSize = 500, + seed = sample(100000, 1)) { + checkIsClass(seed, c('numeric', 'integer')) + checkIsClass(cacheSize, c('numeric', 'integer')) checkIsClass(C, c('list')) - checkIsClass(kernel, c('list')) + checkIsClass(kernel, c('list')) checkIsClass(degree, c('list')) 
checkIsClass(gamma, c('list')) checkIsClass(coef0, c('list')) @@ -737,13 +748,12 @@ setSVM <- function( checkIsClass(tol, c('list')) checkIsClass(classWeight, c('list')) - for(i in 1:length(degree)){ - if(class(degree[[i]]) %in% c("numeric", "integer")){ + for (i in 1:length(degree)) { + if (inherits(x = degree[[i]], what = c("numeric", "integer"))) { degree[[i]] <- as.integer(degree[[i]]) } } - paramGrid = list( C = C, kernel = kernel, @@ -758,48 +768,46 @@ setSVM <- function( ) param <- listCartesian(paramGrid) - + attr(param, 'settings') <- list( + modelType = 'svm', seed = seed[[1]], - paramNames = names(paramGrid), #use this for logging params + paramNames = names(paramGrid), + #use this for logging params requiresDenseMatrix = F, - saveToJson = T, name = "Support Vector Machine", - pythonImport = 'sklearn', - pythonImportSecond = 'svm', - pythonClassifier = 'SVC' - ) + pythonModule = "sklearn.svm", + pythonClass = "SVC" + ) + attr(param, 'saveToJson') <- T attr(param, 'saveType') <- 'file' - - result <- list( - fitFunction = "fitSklearn", - param = param - ) + + result <- list(fitFunction = "fitSklearn", + param = param) class(result) <- "modelSettings" return(result) } -SVCInputs <- function(classifier, param){ - +SVCInputs <- function(classifier, param) { model <- classifier( - C = param[[which.max(names(param)=='C')]], - kernel = param[[which.max(names(param)=='kernel')]], - degree = param[[which.max(names(param)=='degree')]], - gamma = param[[which.max(names(param)=='gamma')]], - coef0 = param[[which.max(names(param)=='coef0')]], - shrinking = param[[which.max(names(param)=='shrinking')]], + C = param[[which.max(names(param) == 'C')]], + kernel = param[[which.max(names(param) == 'kernel')]], + degree = param[[which.max(names(param) == 'degree')]], + gamma = param[[which.max(names(param) == 'gamma')]], + coef0 = param[[which.max(names(param) == 'coef0')]], + shrinking = param[[which.max(names(param) == 'shrinking')]], probability = T, - tol = 
param[[which.max(names(param)=='tol')]], - cache_size = param[[which.max(names(param)=='cacheSize')]], - class_weight = param[[which.max(names(param)=='classWeight')]], + tol = param[[which.max(names(param) == 'tol')]], + cache_size = param[[which.max(names(param) == 'cacheSize')]], + class_weight = param[[which.max(names(param) == 'classWeight')]], verbose = F, max_iter = as.integer(-1), decision_function_shape = 'ovr', break_ties = F, - random_state = param[[which.max(names(param)=='seed')]] + random_state = param[[which.max(names(param) == 'seed')]] ) return(model) diff --git a/R/SklearnToJson.R b/R/SklearnToJson.R new file mode 100644 index 000000000..2a2771338 --- /dev/null +++ b/R/SklearnToJson.R @@ -0,0 +1,443 @@ +# @file SklearnToJson.R +# +# Copyright 2023 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitatons under the License. 
+# +#' Saves sklearn python model object to json in path +#' @param model a fitted sklearn python model object +#' @param path path to the saved model file +#' @export +sklearnToJson <- function(model, path) { + py <- reticulate::import_builtins(convert=FALSE) + json <- reticulate::import("json", convert=FALSE) + if (inherits(model, "sklearn.tree._classes.DecisionTreeClassifier")) { + serializedModel <- serializeDecisionTree(model) + } else if (inherits(model, "sklearn.ensemble._forest.RandomForestClassifier")) { + serializedModel <- serializeRandomForest(model) + } else if (inherits(model, "sklearn.ensemble._weight_boosting.AdaBoostClassifier")) { + serializedModel <- serializeAdaboost(model) + } else if (inherits(model, "sklearn.naive_bayes.GaussianNB")) { + serializedModel <- serializeNaiveBayes(model) + } else if (inherits(model, "sklearn.neural_network._multilayer_perceptron.MLPClassifier")) { + serializedModel <- serializeMLP(model) + } else if (inherits(model, "sklearn.svm._classes.SVC" )) { + serializedModel <- serializeSVM(model) + } else { + stop("Unsupported model") + } + + with(py$open(path, "w"), as=file, { + json$dump(serializedModel, fp=file) + }) + return(invisible()) +} + +#' Loads sklearn python model from json +#' @param path path to the model json file +#' @export +sklearnFromJson <- function(path) { + py <- reticulate::import_builtins(convert=FALSE) + json <- reticulate::import("json", convert=FALSE) + with(py$open(path, "r"), as=file, { + model <- json$load(fp=file) + }) + if (reticulate::py_bool(model["meta"] == "decision-tree")) { + model <- deSerializeDecisionTree(model) + } else if (reticulate::py_bool(model["meta"] == "rf")) { + model <- deSerializeRandomForest(model) + } else if (reticulate::py_bool(model["meta"] == "adaboost")) { + model <- deSerializeAdaboost(model) + } else if (reticulate::py_bool(model["meta"] == "naive-bayes")) { + model <- deSerializeNaiveBayes(model) + } else if (reticulate::py_bool(model["meta"] == "mlp")) { + 
model <- deSerializeMlp(model) + } else if (reticulate::py_bool(model["meta"] == "svm")) { + model <- deSerializeSVM(model) + } else { + stop("Unsupported model") + } + return(model) +} + +serializeTree <- function(tree) { + serializedTree <- tree$`__getstate__`() + dtypes <- serializedTree["nodes"]$dtype + + serializedTree["nodes"] <- serializedTree["nodes"]$tolist() + serializedTree["values"] <- serializedTree["values"]$tolist() + + return(list(serializedTree, dtypes)) +} + +deSerializeTree <- function(tree_dict, nFeatures, nClasses, nOutputs) { + # TODO the below only works for tree_dict loaded from json, if not it + for (i in 0:(length(tree_dict["nodes"])-1)) { + reticulate::py_set_item(tree_dict["nodes"], i, + reticulate::tuple(reticulate::py_to_r(tree_dict["nodes"][i]))) + } + + names <- list("left_child", "right_child", "feature", "threshold", "impurity", "n_node_samples", "weighted_n_node_samples") + if (length(tree_dict["nodes"][0])==8) { + # model used sklearn>=1.3 which added a parameter + names[[8]] <- "missing_go_to_left" + } + + sklearn <- reticulate::import("sklearn") + np <- reticulate::import("numpy", convert = FALSE) + + tree_dict["nodes"] <- np$array(tree_dict["nodes"], + dtype=np$dtype(reticulate::dict( + names = names, + formats = tree_dict["nodes_dtype"] + ))) + tree_dict["values"] <- np$array(tree_dict["values"]) + + Tree <- sklearn$tree$`_tree`$Tree(nFeatures, + np$array(reticulate::tuple(nClasses), + dtype=np$intp), + nOutputs) + + Tree$`__setstate__`(tree_dict) + + return(Tree) +} + +serializeDecisionTree <- function(model) { + tree <- serializeTree(model$tree_) + dtypes <- tree[[2]] + tree <- tree[[1]] + py <- reticulate::import_builtins(convert=FALSE) + serialized_model <- reticulate::dict( + "meta" = "decision-tree", + "feature_importances_" = model$feature_importances_$tolist(), + "max_features_" = model$max_features_, + "n_classes_" = py$int(model$n_classes_), + "n_features_in_" = model$n_features_in_, + "n_outputs_" = 
model$n_outputs_, + "tree_" = tree, + "classes_" = model$classes_$tolist(), + "params" = model$get_params() + ) + + tree_dtypes <- list() + for (i in 0:(length(dtypes)-1)) { + tree_dtypes <- c(tree_dtypes, dtypes[[i]]$str) + } + + serialized_model["tree_"]["nodes_dtype"] <- tree_dtypes + return(serialized_model) +} + +deSerializeDecisionTree <- function(model_dict) { + np <- reticulate::import("numpy", convert=FALSE) + sklearn <- reticulate::import("sklearn", convert=FALSE) + deserialized_model <- do.call(sklearn$tree$DecisionTreeClassifier, + reticulate::py_to_r(model_dict["params"])) + + deserialized_model$classes_ <- np$array(model_dict["classes_"]) + deserialized_model$max_features_ <- model_dict["max_features_"] + deserialized_model$n_classes_ <- model_dict["n_classes_"] + deserialized_model$n_features_in <- model_dict["n_features_in_"] + deserialized_model$n_outputs_ <- model_dict["n_outputs_"] + + tree <- deSerializeTree(model_dict["tree_"], + model_dict["n_features_in_"], + model_dict["n_classes_"], + model_dict["n_outputs_"]) + deserialized_model$tree_ <- tree + + return(deserialized_model) +} + +serializeRandomForest <- function(model) { + estimators <- list() + for (i in 1:length(model$estimators_)) { + estimators <- c(estimators, serializeDecisionTree(model$estimators_[i - 1])) + } + + serialized_model <- reticulate::dict( + "meta" = "rf", + "max_depth" = model$max_depth, + "min_samples_split" = model$min_samples_split, + "min_samples_leaf" = model$min_samples_leaf, + "min_weight_fraction_leaf" = model$min_weight_fraction_leaf, + "max_features" = model$max_features, + "max_leaf_nodes" = model$max_leaf_nodes, + "min_impurity_decrease" = model$min_impurity_decrease, + "min_impurity_split" = model$min_samples_split, + "n_features_in_" = model$n_features_in_, + "n_outputs_" = model$n_outputs_, + "classes_" = model$classes_$tolist(), + "estimators_" = reticulate::r_to_py(estimators), + "params" = model$get_params(), + "n_classes_" = model$n_classes_) + + if 
(reticulate::py_bool(model$`__dict__`["oob_score_"] != reticulate::py_none())) { + serialized_model["oob_score_"] <- model$oob_score_ + serialized_model["oob_decision_function_"] <- model$oob_decision_function_$tolist() + } + + return(serialized_model) +} + +deSerializeRandomForest <- function(model_dict) { + np <- reticulate::import("numpy", convert=FALSE) + sklearn <- reticulate::import("sklearn", convert=FALSE) + model <- do.call(sklearn$ensemble$RandomForestClassifier, + reticulate::py_to_r(model_dict["params"])) + + estimators <- list() + for (i in 1:length(model_dict$estimators_)) { + estimators <- c(estimators, deSerializeDecisionTree(model_dict["estimators_"][i - 1])) + } + + model$estimators_ <- np$array(estimators) + + model$classes_ <- np$array(model_dict["classes_"]) + model$n_features_in_ <- model_dict["n_features_in_"] + model$n_outputs_ <- model_dict["n_outputs_"] + model$max_depth <- model_dict["max_depth"] + model$min_samples_split <- model_dict["min_samples_split"] + model$min_samples_leaf <- model_dict["min_samples_leaf"] + model$min_weight_fraction_leaf <- model_dict["min_weight_fraction_leaf"] + model$max_features <- model_dict["max_features"] + model$max_leaf_nodes <- model_dict["max_leaf_nodes"] + model$min_impurity_decrease <- model_dict["min_impurity_decrease"] + model$min_impurity_split <- model_dict["min_impurity_split"] + model$n_classes_ <- model_dict["n_classes_"] + + if (reticulate::py_bool(model_dict$oob_score_ != reticulate::py_none())){ + model$oob_score_ <- model_dict["oob_score_"] + model$oob_decision_function_ <- model_dict["oob_decision_function_"] + } + return(model) +} + +serializeAdaboost <- function(model) { + estimators <- list() + for (i in 1:length(model$estimators_)) { + estimators <- c(estimators, serializeDecisionTree(model$estimators_[i - 1])) + } + serialized_model <- reticulate::dict( + "meta" = "adaboost", + "estimators_" = reticulate::r_to_py(estimators), + "n_features_in_" = model$n_features_in_, + "n_classes_" 
= model$n_classes_, + "params" = model$get_params(), + "classes_" = model$classes_$tolist(), + "estimator_weights_" = model$estimator_weights_$tolist()) + + return(serialized_model) +} + +deSerializeAdaboost <- function(model_dict) { + np <- reticulate::import("numpy", convert=FALSE) + sklearn <- reticulate::import("sklearn", convert=FALSE) + model <- do.call(sklearn$ensemble$AdaBoostClassifier, + reticulate::py_to_r(model_dict["params"])) + estimators <- list() + for (i in 1:length(model_dict$estimators_)) { + estimators <- c(estimators, deSerializeDecisionTree(model_dict["estimators_"][i - 1])) + } + + model$estimators_ <- np$array(estimators) + model$classes_ <- np$array(model_dict["classes_"]) + model$n_features_in_ <- model_dict["n_features_in_"] + model$n_classes_ <- model_dict["n_classes_"] + model$estimator_weights_ <- np$array(model_dict["estimator_weights_"]) + + return(model) +} + +serializeNaiveBayes <- function(model) { + serialized_model = reticulate::dict( + "meta" = "naive-bayes", + "classes_" = model$classes_$tolist(), + "class_count_" = model$class_count_$tolist(), + "class_prior_" = model$class_prior_$tolist(), + "theta_" = model$theta_$tolist(), + "epsilon_" = model$epsilon_, + "params" = model$get_params(), + "var_" = model$var_$tolist() + ) + return(serialized_model) +} + +deSerializeNaiveBayes <- function(model_dict) { + sklearn <- reticulate::import("sklearn", convert=FALSE) + np <- reticulate::import("numpy", convert=FALSE) + model <- do.call(sklearn$naive_bayes$GaussianNB, + reticulate::py_to_r(model_dict["params"])) + + model$classes_ <- np$array(model_dict["classes_"]) + model$class_count_ <- np$array(model_dict["class_count_"]) + model$class_prior_ <- np$array(model_dict["class_prior_"]) + model$theta_ <- np$array(model_dict["theta_"]) + model$epsilon_ <- model_dict["epsilon_"] + model$var_ <- np$array(model_dict["var_"]) + + return(model) +} + +serializeMLP <- function(model) { + # TODO Check if length(intercepts_) is ever different 
from length(coefs_) + for (i in 0:(length(model$coefs_) - 1)) { + reticulate::py_set_item(model$coefs_, i, + model$coefs_[i]$tolist()) + reticulate::py_set_item(model$intercepts_, i, + model$intercepts_[i]$tolist()) + } + serialized_model <- reticulate::dict( + "meta" = "mlp", + "coefs_" = model$coefs_, + "loss_" = model$loss_, + "intercepts_" = model$intercepts_, + "n_iter_" = model$n_iter_, + "n_layers_" = model$n_layers_, + "n_outputs_" = model$n_outputs_, + "out_activation_" = model$out_activation_, + "params" = model$get_params(), + "classes_" = model$classes_$tolist() + ) + return(serialized_model) +} + +deSerializeMlp <- function(model_dict) { + sklearn <- reticulate::import("sklearn", convert=FALSE) + np <- reticulate::import("numpy", convert=FALSE) + + model <- do.call(sklearn$neural_network$MLPClassifier, + reticulate::py_to_r(model_dict["params"])) + for (i in 0:(length(model_dict["coefs_"]) - 1)) { + reticulate::py_set_item(model_dict["coefs_"], i, + np$array(model_dict["coefs_"][i])) + reticulate::py_set_item(model_dict["intercepts_"], i, + np$array(model_dict["intercepts_"][i])) + + } + model$coefs_ = model_dict["coefs_"] + model$loss_ = model_dict["loss_"] + model$intercepts_ = model_dict["intercepts_"] + model$n_iter_ = model_dict["n_iter_"] + model$n_layers_ = model_dict["n_layers_"] + model$n_outputs_ = model_dict["n_outputs_"] + model$out_activation_ = model_dict["out_activation_"] + model$classes_ = np$array(model_dict["classes_"]) + + return(model) +} + +serializeSVM <- function(model) { + serialized_model = reticulate::dict( + "meta" = "svm", + "class_weight_" = model$class_weight_$tolist(), + "classes_" = model$classes_$tolist(), + "support_" = model$support_$tolist(), + "n_support_" = model$n_support_$tolist(), + "intercept_" = model$intercept_$tolist(), + "probA_" = model$probA_$tolist(), + "probB_" = model$probB_$tolist(), + "_intercept_" = model$`_intercept_`$tolist(), + "shape_fit_" = model$shape_fit_, + "_gamma" = model$`_gamma`, + 
"params" = model$get_params() + ) + if (inherits(model$support_vectors_, "numpy.ndarray")) { + serialized_model["support_vectors_"] <- model$support_vectors_$tolist() + } else { + serialized_model["support_vectors_"] <- serializeCsrMatrix(model$support_vectors_) + } + + if (inherits(model$dual_coef_, "numpy.ndarray")) { + serialized_model["dual_coef_"] <- model$dual_coef_$tolist() + } else { + serialized_model["dual_coef_"] <- serializeCsrMatrix(model$dual_coef_) + } + + if (inherits(model$`_dual_coef_`, "numpy.ndarray")) { + serialized_model["_dual_coef_"] <- model$`_dual_coef_`$tolist() + } else { + serialized_model["_dual_coef_"] <- serializeCsrMatrix(model$`_dual_coef_`) + } + return(serialized_model) +} + +deSerializeSVM <- function(model_dict) { + sklearn <- reticulate::import("sklearn", convert=FALSE) + np <- reticulate::import("numpy", convert=FALSE) + model <- do.call(sklearn$svm$SVC, + reticulate::py_to_r(model_dict["params"])) + model$shape_fit_ <- model_dict$shape_fit_ + model$`_gamma`<- model_dict["_gamma"] + model$class_weight_ <- np$array(model_dict$class_weight_)$astype(np$float64) + model$classes_ <- np$array(model_dict["classes_"]) + model$support_ <- np$array(model_dict["support_"])$astype(np$int32) + model$`_n_support` <- np$array(model_dict["n_support_"])$astype(np$int32) + model$intercept_ <- np$array(model_dict["intercept_"])$astype(np$float64) + model$`_probA` <- np$array(model_dict["probA_"])$astype(np$float64) + model$`_probB` <- np$array(model_dict["probB_"])$astype(np$float64) + model$`_intercept_` <- np$array(model_dict["_intercept_"])$astype(np$float64) + + if (reticulate::py_bool((model_dict$support_vectors_["meta"] != reticulate::py_none())) & + (reticulate::py_bool(model_dict$support_vectors_["meta"] == "csr"))) { + model$support_vectors_ <- deSerializeCsrMatrix(model_dict$support_vectors_) + model$`_sparse` <- TRUE + } else { + model$support_vectors_ <- np$array(model_dict$support_vectors_)$astype(np$float64) + model$`_sparse` <- 
FALSE + } + if (reticulate::py_bool((model_dict$dual_coef_["meta"] != reticulate::py_none())) & + (reticulate::py_bool(model_dict$dual_coef_["meta"] == "csr"))) { + model$dual_coef_ <- deSerializeCsrMatrix(model_dict$dual_coef_) + } else { + model$dual_coef_ <- np$array(model_dict$dual_coef_)$astype(np$float64) + } + + if (reticulate::py_bool((model_dict$`_dual_coef_`["meta"] != reticulate::py_none())) & + (reticulate::py_bool(model_dict$`_dual_coef_`["meta"] == "csr"))) { + model$`_dual_coef_` <- deSerializeCsrMatrix(model_dict$`_dual_coef_`) + } else { + model$`_dual_coef_` <- np$array(model_dict$`_dual_coef_`)$astype(np$float64) + } + return(model) +} + +serializeCsrMatrix <- function(csr_matrix) { + serialized_csr_matrix = reticulate::dict( + "meta" = "csr", + "indices" = csr_matrix$indices$tolist(), + "indptr" = csr_matrix$indptr$tolist(), + "_shape"= csr_matrix$`_shape`) + serialized_csr_matrix["data"] <- csr_matrix$data$tolist() + return(serialized_csr_matrix) +} + +deSerializeCsrMatrix <- function(csr_dict, + data_type=np$float64, + indices_type=np$int32, + indptr_type=np$int32) { + sp <- reticulate::import("scipy", convert=FALSE) + np <- reticulate::import("numpy", convert=FALSE) + csr_matrix <- sp$sparse$csr_matrix( + reticulate::tuple(list(np$array(csr_dict["data"])$astype(data_type), + np$array(csr_dict["indices"])$astype(indices_type), + np$array(csr_dict["indptr"])$astype(indptr_type))), + shape=csr_dict["_shape"] + ) + return(csr_matrix) +} + + \ No newline at end of file diff --git a/R/ThresholdSummary.R b/R/ThresholdSummary.R index e5203f0d2..b36d86014 100644 --- a/R/ThresholdSummary.R +++ b/R/ThresholdSummary.R @@ -81,6 +81,14 @@ getThresholdSummary_binary <- function(prediction, evalColumn, ...){ # sort prediction predictionOfInterest <- predictionOfInterest[order(-predictionOfInterest$value),] + # because of numerical precision issues (I think), in very rare cases the preferenceScore + # is not monotonically decreasing after this sort (it should
follow the predictions) + # as a fix I remove the troublesome row from influencing the thresholdSummary + if (!all(predictionOfInterest$preferenceScore == cummin(predictionOfInterest$preferenceScore))) { + troubleRow <- (which((predictionOfInterest$preferenceScore == cummin(predictionOfInterest$preferenceScore))==FALSE)) + predictionOfInterest <- predictionOfInterest[-troubleRow,] + } + # create indexes if(length(predictionOfInterest$preferenceScore)>100){ indexesOfInt <- c( @@ -166,6 +174,7 @@ getThresholdSummary_binary <- function(prediction, evalColumn, ...){ ) ) + } result <- as.data.frame(result) @@ -196,17 +205,19 @@ getThresholdSummary_survival <- function(prediction, evalColumn, timepoint, ...) ) nbSummary <- tryCatch( - { + { xstart <- max(min(preddat$p),0.001); + xstop <- min(max(preddat$p),0.99); stdca( data = preddat, outcome = "y", ttoutcome = "t", timepoint = timepoint, predictors = "p", - xstart = max(min(preddat$p),0.001), #0.001, - xstop = min(max(preddat$p),0.99), - xby = 0.001, - smooth=F + xstart = xstart, + xstop = xstop, + xby = (xstop - xstart)/100, + smooth = FALSE, + graph = FALSE ) }, error = function(e){ParallelLogger::logError(e); return(NULL)} @@ -249,7 +260,7 @@ stdca <- function (data, outcome, ttoutcome, timepoint, predictors, xstart = 0.0 1), outcome]) > 0) & cmprsk == FALSE) { stop("outcome must be coded as 0 and 1") } - if (class(data) != "data.frame") { + if (!inherits(x = data, what = "data.frame")) { stop("Input data must be class data.frame") } if (xstart < 0 | xstart > 1) { @@ -453,6 +464,37 @@ stdca <- function (data, outcome, ttoutcome, timepoint, predictors, xstart = 0.0 } +checkToByTwoTableInputs <- function(TP,FP,FN,TN){ + # check classes + if(!inherits(x = TP, what = c('integer','numeric'))){ + stop('Incorrect TP class') + } + if(!inherits(x = FP, what = c('integer','numeric'))){ + stop('Incorrect FP class') + } + if(!inherits(x = TN, what = c('integer','numeric'))){ + stop('Incorrect TN class') + } + if(!inherits(x = 
FN, what = c('integer','numeric'))){ + stop('Incorrect FN class') + } + + # check positive values + if(sum(TP<0)>0){ + stop('TP < 0') + } + if(sum(FP<0)>0){ + stop('FP < 0') + } + if(sum(TN<0)>0){ + stop('TN < 0') + } + if(sum(FN<0)>0){ + stop('FN < 0') + } + + return(invisible(TRUE)) +} # making all this single for easy unit testing #' Calculate the f1Score #' @@ -468,14 +510,13 @@ stdca <- function (data, outcome, ttoutcome, timepoint, predictors, xstart = 0.0 #' f1Score value #' f1Score <- function(TP,TN,FN,FP){ - if(sum(TP<0)>0) stop('TP < 0') - if(sum(FP<0)>0) stop('FP < 0') - if(sum(TN<0)>0) stop('TN < 0') - if(sum(FN<0)>0) stop('FN < 0') - if(class(TP)!='numeric') stop('Incorrect TP class') - if(class(FP)!='numeric') stop('Incorrect FP class') - if(class(TN)!='numeric') stop('Incorrect TN class') - if(class(FN)!='numeric') stop('Incorrect FN class') + checkToByTwoTableInputs( + TP = TP, + FP = FP, + FN = FN, + TN = TN + ) + return(2*(TP/(TP+FP))*(TP/(TP+FN))/((TP/(TP+FP))+(TP/(TP+FN)))) } #' Calculate the accuracy @@ -492,15 +533,15 @@ f1Score <- function(TP,TN,FN,FP){ #' accuracy value #' accuracy <- function(TP,TN,FN,FP){ - if(sum(TP<0)>0) stop('TP < 0') - if(sum(FP<0)>0) stop('FP < 0') - if(sum(TN<0)>0) stop('TN < 0') - if(sum(FN<0)>0) stop('FN < 0') - if(class(TP)!='numeric') stop('Incorrect TP class') - if(class(FP)!='numeric') stop('Incorrect FP class') - if(class(TN)!='numeric') stop('Incorrect TN class') - if(class(FN)!='numeric') stop('Incorrect FN class') - (TP+TN)/(TP+TN+FP+FN)} + checkToByTwoTableInputs( + TP = TP, + FP = FP, + FN = FN, + TN = TN + ) + + return((TP+TN)/(TP+TN+FP+FN)) +} #' Calculate the sensitivity #' @@ -516,15 +557,15 @@ accuracy <- function(TP,TN,FN,FP){ #' sensitivity value #' sensitivity <- function(TP,TN,FN,FP){ - if(sum(TP<0)>0) stop('TP < 0') - if(sum(FP<0)>0) stop('FP < 0') - if(sum(TN<0)>0) stop('TN < 0') - if(sum(FN<0)>0) stop('FN < 0') - if(class(TP)!='numeric') stop('Incorrect TP class') - if(class(FP)!='numeric') 
stop('Incorrect FP class') - if(class(TN)!='numeric') stop('Incorrect TN class') - if(class(FN)!='numeric') stop('Incorrect FN class') - TP/(TP+FN)} + checkToByTwoTableInputs( + TP = TP, + FP = FP, + FN = FN, + TN = TN + ) + + return(TP/(TP+FN)) +} #' Calculate the falseNegativeRate #' @@ -540,15 +581,15 @@ sensitivity <- function(TP,TN,FN,FP){ #' falseNegativeRate value #' falseNegativeRate <- function(TP,TN,FN,FP){ - if(sum(TP<0)>0) stop('TP < 0') - if(sum(FP<0)>0) stop('FP < 0') - if(sum(TN<0)>0) stop('TN < 0') - if(sum(FN<0)>0) stop('FN < 0') - if(class(TP)!='numeric') stop('Incorrect TP class') - if(class(FP)!='numeric') stop('Incorrect FP class') - if(class(TN)!='numeric') stop('Incorrect TN class') - if(class(FN)!='numeric') stop('Incorrect FN class') - FN/(TP+FN)} + checkToByTwoTableInputs( + TP = TP, + FP = FP, + FN = FN, + TN = TN + ) + + return(FN/(TP+FN)) +} #' Calculate the falsePositiveRate #' @@ -564,15 +605,15 @@ falseNegativeRate <- function(TP,TN,FN,FP){ #' falsePositiveRate value #' falsePositiveRate <- function(TP,TN,FN,FP){ - if(sum(TP<0)>0) stop('TP < 0') - if(sum(FP<0)>0) stop('FP < 0') - if(sum(TN<0)>0) stop('TN < 0') - if(sum(FN<0)>0) stop('FN < 0') - if(class(TP)!='numeric') stop('Incorrect TP class') - if(class(FP)!='numeric') stop('Incorrect FP class') - if(class(TN)!='numeric') stop('Incorrect TN class') - if(class(FN)!='numeric') stop('Incorrect FN class') - FP/(FP+TN)} + checkToByTwoTableInputs( + TP = TP, + FP = FP, + FN = FN, + TN = TN + ) + + return(FP/(FP+TN)) +} #' Calculate the specificity #' @@ -588,15 +629,16 @@ falsePositiveRate <- function(TP,TN,FN,FP){ #' specificity value #' specificity <- function(TP,TN,FN,FP){ - if(sum(TP<0)>0) stop('TP < 0') - if(sum(FP<0)>0) stop('FP < 0') - if(sum(TN<0)>0) stop('TN < 0') - if(sum(FN<0)>0) stop('FN < 0') - if(class(TP)!='numeric') stop('Incorrect TP class') - if(class(FP)!='numeric') stop('Incorrect FP class') - if(class(TN)!='numeric') stop('Incorrect TN class') - 
if(class(FN)!='numeric') stop('Incorrect FN class') - TN/(FP+TN)} + + checkToByTwoTableInputs( + TP = TP, + FP = FP, + FN = FN, + TN = TN + ) + + return(TN/(FP+TN)) +} #' Calculate the positivePredictiveValue #' @@ -612,15 +654,16 @@ specificity <- function(TP,TN,FN,FP){ #' positivePredictiveValue value #' positivePredictiveValue <- function(TP,TN,FN,FP){ - if(sum(TP<0)>0) stop('TP < 0') - if(sum(FP<0)>0) stop('FP < 0') - if(sum(TN<0)>0) stop('TN < 0') - if(sum(FN<0)>0) stop('FN < 0') - if(class(TP)!='numeric') stop('Incorrect TP class') - if(class(FP)!='numeric') stop('Incorrect FP class') - if(class(TN)!='numeric') stop('Incorrect TN class') - if(class(FN)!='numeric') stop('Incorrect FN class') - TP/(TP+FP)} + + checkToByTwoTableInputs( + TP = TP, + FP = FP, + FN = FN, + TN = TN + ) + + return(TP/(TP+FP)) +} #' Calculate the falseDiscoveryRate #' @@ -636,15 +679,16 @@ positivePredictiveValue <- function(TP,TN,FN,FP){ #' falseDiscoveryRate value #' falseDiscoveryRate <- function(TP,TN,FN,FP){ - if(sum(TP<0)>0) stop('TP < 0') - if(sum(FP<0)>0) stop('FP < 0') - if(sum(TN<0)>0) stop('TN < 0') - if(sum(FN<0)>0) stop('FN < 0') - if(class(TP)!='numeric') stop('Incorrect TP class') - if(class(FP)!='numeric') stop('Incorrect FP class') - if(class(TN)!='numeric') stop('Incorrect TN class') - if(class(FN)!='numeric') stop('Incorrect FN class') - FP/(TP+FP)} + + checkToByTwoTableInputs( + TP = TP, + FP = FP, + FN = FN, + TN = TN + ) + + return(FP/(TP+FP)) + } #' Calculate the negativePredictiveValue #' @@ -660,15 +704,16 @@ falseDiscoveryRate <- function(TP,TN,FN,FP){ #' negativePredictiveValue value #' negativePredictiveValue <- function(TP,TN,FN,FP){ - if(sum(TP<0)>0) stop('TP < 0') - if(sum(FP<0)>0) stop('FP < 0') - if(sum(TN<0)>0) stop('TN < 0') - if(sum(FN<0)>0) stop('FN < 0') - if(class(TP)!='numeric') stop('Incorrect TP class') - if(class(FP)!='numeric') stop('Incorrect FP class') - if(class(TN)!='numeric') stop('Incorrect TN class') - if(class(FN)!='numeric') 
stop('Incorrect FN class') - TN/(FN+TN)} + + checkToByTwoTableInputs( + TP = TP, + FP = FP, + FN = FN, + TN = TN + ) + + return(TN/(FN+TN)) +} #' Calculate the falseOmissionRate #' @@ -684,15 +729,16 @@ negativePredictiveValue <- function(TP,TN,FN,FP){ #' falseOmissionRate value #' falseOmissionRate <- function(TP,TN,FN,FP){ - if(sum(TP<0)>0) stop('TP < 0') - if(sum(FP<0)>0) stop('FP < 0') - if(sum(TN<0)>0) stop('TN < 0') - if(sum(FN<0)>0) stop('FN < 0') - if(class(TP)!='numeric') stop('Incorrect TP class') - if(class(FP)!='numeric') stop('Incorrect FP class') - if(class(TN)!='numeric') stop('Incorrect TN class') - if(class(FN)!='numeric') stop('Incorrect FN class') - FN/(FN+TN)} + + checkToByTwoTableInputs( + TP = TP, + FP = FP, + FN = FN, + TN = TN + ) + + return(FN/(FN+TN)) +} #' Calculate the positiveLikelihoodRatio #' @@ -708,15 +754,16 @@ falseOmissionRate <- function(TP,TN,FN,FP){ #' positiveLikelihoodRatio value #' positiveLikelihoodRatio <- function(TP,TN,FN,FP){ - if(sum(TP<0)>0) stop('TP < 0') - if(sum(FP<0)>0) stop('FP < 0') - if(sum(TN<0)>0) stop('TN < 0') - if(sum(FN<0)>0) stop('FN < 0') - if(class(TP)!='numeric') stop('Incorrect TP class') - if(class(FP)!='numeric') stop('Incorrect FP class') - if(class(TN)!='numeric') stop('Incorrect TN class') - if(class(FN)!='numeric') stop('Incorrect FN class') - (TP/(TP+FN))/(FP/(FP+TN))} + + checkToByTwoTableInputs( + TP = TP, + FP = FP, + FN = FN, + TN = TN + ) + + return((TP/(TP+FN))/(FP/(FP+TN))) +} #' Calculate the negativeLikelihoodRatio #' @@ -732,15 +779,16 @@ positiveLikelihoodRatio <- function(TP,TN,FN,FP){ #' negativeLikelihoodRatio value #' negativeLikelihoodRatio <- function(TP,TN,FN,FP){ - if(sum(TP<0)>0) stop('TP < 0') - if(sum(FP<0)>0) stop('FP < 0') - if(sum(TN<0)>0) stop('TN < 0') - if(sum(FN<0)>0) stop('FN < 0') - if(class(TP)!='numeric') stop('Incorrect TP class') - if(class(FP)!='numeric') stop('Incorrect FP class') - if(class(TN)!='numeric') stop('Incorrect TN class') - 
if(class(FN)!='numeric') stop('Incorrect FN class') - (FN/(TP+FN))/(TN/(FP+TN))} + + checkToByTwoTableInputs( + TP = TP, + FP = FP, + FN = FN, + TN = TN + ) + + return((FN/(TP+FN))/(TN/(FP+TN))) +} #' Calculate the diagnostic odds ratio @@ -757,12 +805,13 @@ negativeLikelihoodRatio <- function(TP,TN,FN,FP){ #' diagnosticOddsRatio value #' diagnosticOddsRatio <- function(TP,TN,FN,FP){ - if(sum(TP<0)>0) stop('TP < 0') - if(sum(FP<0)>0) stop('FP < 0') - if(sum(TN<0)>0) stop('TN < 0') - if(sum(FN<0)>0) stop('FN < 0') - if(class(TP)!='numeric') stop('Incorrect TP class') - if(class(FP)!='numeric') stop('Incorrect FP class') - if(class(TN)!='numeric') stop('Incorrect TN class') - if(class(FN)!='numeric') stop('Incorrect FN class') - ((TP/(TP+FN))/(FP/(FP+TN)))/((FN/(TP+FN))/(TN/(FP+TN)))} + + checkToByTwoTableInputs( + TP = TP, + FP = FP, + FN = FN, + TN = TN + ) + + return(((TP/(TP+FN))/(FP/(FP+TN)))/((FN/(TP+FN))/(TN/(FP+TN)))) +} diff --git a/R/ViewShinyPlp.R b/R/ViewShinyPlp.R index df613ceca..65d60b8b0 100644 --- a/R/ViewShinyPlp.R +++ b/R/ViewShinyPlp.R @@ -7,11 +7,28 @@ #' #' @export viewMultiplePlp <- function(analysesLocation){ - viewPlps(result = analysesLocation, - validation=NULL, - useDatabase = F, - usePlpObject = F, - useFileSystem = T) + + if(!file.exists(file.path(analysesLocation, 'sqlite', 'databaseFile.sqlite'))){ + stop('No database found') + } + + connectionDetailSettings <- list( + dbms = 'sqlite', + server = file.path(analysesLocation, 'sqlite', 'databaseFile.sqlite') + ) + + databaseSettings <- list( + connectionDetailSettings = connectionDetailSettings, + schema = 'main', + tablePrefix = '', + dbms = 'sqlite', + server = file.path(analysesLocation, 'sqlite', 'databaseFile.sqlite'), + user = NULL, + password = NULL, + port = NULL + ) + + viewPlps(databaseSettings) } #' viewPlp - Interactively view the performance and model settings @@ -22,17 +39,37 @@ viewMultiplePlp <- function(analysesLocation){ #' Either the result of runPlp and view the plots 
#' @param runPlp The output of runPlp() (an object of class 'runPlp') #' @param validatePlp The output of externalValidatePlp (on object of class 'validatePlp') +#' @param diagnosePlp The output of diagnosePlp() #' @return #' Opens a shiny app for interactively viewing the results #' #' @export +viewPlp <- function(runPlp, validatePlp = NULL, diagnosePlp = NULL) { + + server <- insertRunPlpToSqlite( + runPlp = runPlp, + externalValidatePlp = validatePlp, + diagnosePlp = diagnosePlp + ) + + connectionDetailSettings <- list( + dbms = 'sqlite', + server = server + ) + + databaseSettings <- list( + connectionDetailSettings = connectionDetailSettings, + schema = 'main', + tablePrefix = '', + dbms = 'sqlite', + server = server, + user = NULL, + password = NULL, + port = NULL + ) + + viewPlps(databaseSettings) -viewPlp <- function(runPlp, validatePlp = NULL) { - viewPlps(result = runPlp, - validation=validatePlp, - useDatabase = F, - usePlpObject = T, - useFileSystem = F) } @@ -50,26 +87,37 @@ viewPlp <- function(runPlp, validatePlp = NULL) { #' @param myTableAppend A string appended to the results tables (optional) #' #' @export -viewDatabaseResultPlp <- function(mySchema, myServer, myUser, myPassword, myDbms, myPort = NULL, myTableAppend){ - - ensure_installed('pool') - ensure_installed('DBI') - - Sys.setenv("shinydbSchema" = mySchema) - Sys.setenv("shinydbServer" = myServer) - Sys.setenv("shinydbUser" = myUser) - Sys.setenv("shinydbPw" = myPassword) - Sys.setenv("shinydbDbms" = myDbms) - if(!is.null(myPort)){ - Sys.setenv("shinydbPort" = myPort) - } - Sys.setenv("shinydbTableAppend" = myTableAppend) +viewDatabaseResultPlp <- function( + mySchema, + myServer, + myUser, + myPassword, + myDbms, + myPort = NULL, + myTableAppend + ){ + + connectionDetailSettings <- list( + dbms = myDbms, + server = myServer, + user = myUser, + password = myPassword, + port = myPort + ) + + databaseSettings <- list( + connectionDetailSettings = connectionDetailSettings, + schema = mySchema, 
+ tablePrefix = myTableAppend, + dbms = myDbms, + server = myServer, + user = myUser, + password = myPassword, + port = myPort + ) + + viewPlps(databaseSettings) - viewPlps(result = NULL, - validation=NULL, - useDatabase = T, - usePlpObject = F, - useFileSystem = F) } @@ -77,49 +125,56 @@ viewDatabaseResultPlp <- function(mySchema, myServer, myUser, myPassword, myDbms # code for multiple and single together # one shiny app -viewPlps <- function(result, - validation=NULL, - useDatabase = NULL, - usePlpObject = NULL, - useFileSystem = NULL){ - ensure_installed("shiny") - ensure_installed("shinydashboard") - ensure_installed("shinycssloaders") - ensure_installed("DT") - ensure_installed("htmlwidgets") - ensure_installed("shinyWidgets") - ensure_installed("plotly") - - appDir <- system.file("shiny", "PLPViewer", package = "PatientLevelPrediction") - shinySettings <- list(result = result, - validation = validation, - useDatabase = useDatabase, - usePlpObject = usePlpObject, - useFileSystem = useFileSystem) - .GlobalEnv$shinySettings <- shinySettings - on.exit(rm(shinySettings, envir = .GlobalEnv)) - shiny::runApp(appDir) -} - - - -#' Launch the Diagnostics Explorer Shiny app -#' -#' @param dataFolder A folder where the exported zip files with the results are stored. -#' Zip files containing results from multiple databases can be placed in the same -#' folder. -#' @param launch.browser Should the app be launched in your default browser, or in a Shiny window. -#' Note: copying to clipboard will not work in a Shiny window. 
-#' -#' @details -#' Launches a Shiny app that allows the user to explore the diagnostics -#' -#' @export -launchDiagnosticsExplorer <- function(dataFolder, launch.browser = FALSE) { - ensure_installed("DT") - appDir <- system.file("shiny", "DiagnosticsExplorer", package = "PatientLevelPrediction") - shinySettings <- list(dataFolder = dataFolder) - .GlobalEnv$shinySettings <- shinySettings - on.exit(rm(shinySettings, envir = .GlobalEnv)) - shiny::runApp(appDir) -} +viewPlps <- function(databaseSettings){ + ensure_installed("ShinyAppBuilder") + ensure_installed("ResultModelManager") + + connectionDetails <- do.call( + DatabaseConnector::createConnectionDetails, + databaseSettings$connectionDetailSettings + ) + connection <- ResultModelManager::ConnectionHandler$new(connectionDetails) + databaseSettings$connectionDetailSettings <- NULL + + shinyAppVersion <- strsplit(x = as.character(utils::packageVersion('ShinyAppBuilder')), split = '\\.')[[1]] + + if((shinyAppVersion[1] <= 1 & shinyAppVersion[2] < 2)){ + # Old code to be backwards compatable + config <- ParallelLogger::loadSettingsFromJson( + fileName = system.file( + 'shinyConfig.json', + package = "PatientLevelPrediction" + ) + ) + # set database settings into system variables + Sys.setenv("resultDatabaseDetails_prediction" = as.character(ParallelLogger::convertSettingsToJson(databaseSettings))) + ShinyAppBuilder::viewShiny( + config = config, + connection = connection + ) + } else{ + ohdsiModulesVersion <- strsplit(x = as.character(utils::packageVersion('OhdsiShinyModules')), split = '\\.')[[1]] + if(paste0(ohdsiModulesVersion[1], ".", ohdsiModulesVersion[2])>= 1.2){ + config <- ParallelLogger::loadSettingsFromJson( + fileName = system.file( + 'shinyConfigUpdate.json', + package = "PatientLevelPrediction" + ) + ) + databaseSettings$plpTablePrefix = databaseSettings$tablePrefix + databaseSettings$cgTablePrefix = databaseSettings$tablePrefix + databaseSettings$databaseTable = 'database_meta_table' + 
databaseSettings$databaseTablePrefix = databaseSettings$tablePrefix + ShinyAppBuilder::viewShiny( + config = config, + connection = connection, + resultDatabaseSettings = databaseSettings + ) + } else{ + ParallelLogger::logWarn('Need to update package OhdsiShinyModules') + } + + } + + +} \ No newline at end of file diff --git a/R/uploadPlpDbResults.R b/R/uploadPlpDbResults.R deleted file mode 100644 index 175e10848..000000000 --- a/R/uploadPlpDbResults.R +++ /dev/null @@ -1,2841 +0,0 @@ -# @file UploadPlpDbResults.R -# -# Copyright 2021 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Create the results tables to store PatientLevelPrediction models and results into a database -#' @description -#' This function executes a large set of SQL statements to create tables that can store models and results -#' -#' @details -#' This function can be used to create (or delete) PatientLevelPrediction result tables -#' -#' @param conn A connection to a database created by using the -#' function \code{connect} in the -#' \code{DatabaseConnector} package. -#' @param resultSchema The name of the database schema that the result tables will be created. 
-#' @param targetDialect The database management system being used -#' @param deleteExistingTables If true any existing tables matching the PatientLevelPrediction result tables names will be deleted -#' @param createTables If true the PatientLevelPrediction result tables will be created -#' @param stringAppendToTables A string that appends to the PatientLevelPrediction result tables -#' @param tempEmulationSchema The temp schema used when the database management system is oracle -#' @param testFile (used for testing) The location of an sql file with the table creation code -#' -#' @return -#' Returns NULL but creates the required tables into the specified database schema. -#' -#' @export -createPlpResultTables <- function(conn, - resultSchema, - targetDialect = 'postgresql', - deleteExistingTables = T, - createTables = T, - stringAppendToTables = '', - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), - testFile = NULL){ - - - if(deleteExistingTables){ - ParallelLogger::logInfo('Deleting existing tables') - - tables <- c( - "CALIBRATION_SUMMARY", - "COVARIATE_SUMMARY", - "DEMOGRAPHIC_SUMMARY", - "EVALUATION_STATISTICS", - "PREDICTION_DISTRIBUTION", - "THRESHOLD_SUMMARY", - - "ATTRITION", #new - - "DIAGNOSTICS", #new - "RECALIBRATIONS", #new - - "RESULTS", - - "STUDY_MODELS", - - "MODELS", - - "MODEL_DESIGNS", - - "MODEL_SETTINGS", - "COVARIATE_SETTINGS", - "POPULATION_SETTINGS", - "FEATURE_ENGINEERING_SETTINGS", - "SPLIT_SETTINGS", - "PLP_DATA_SETTINGS", #new - "SAMPLE_SETTINGS", - "TIDY_COVARIATES_SETTINGS", #new - "TARS", - - "STUDIES", - "COHORTS", - "DATABASE_DETAILS", - "RESEARCHERS" - ) - - if(stringAppendToTables != ''){ - tables <- paste0(toupper(gsub('_','',gsub(' ','', stringAppendToTables))), '_', tables) - } - - alltables <- DatabaseConnector::getTableNames(connection = conn, - databaseSchema = resultSchema) - - for(tb in tables){ - if(tb %in%alltables){ - sql <- 'TRUNCATE TABLE @my_schema.@table' - sql <- SqlRender::render(sql, - 
my_schema = resultSchema, - table=tb) - sql <- SqlRender::translate(sql, targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - DatabaseConnector::executeSql(conn, sql) - - sql <- 'DROP TABLE @my_schema.@table' - sql <- SqlRender::render(sql, - my_schema = resultSchema, - table=tb) - sql <- SqlRender::translate(sql, targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - DatabaseConnector::executeSql(conn, sql) - } - - } - - } - - if(createTables){ - ParallelLogger::logInfo('Creating PLP results tables') - - if(stringAppendToTables != ''){ - stringAppendToTables <- paste0(toupper(gsub('_','',gsub(' ','', stringAppendToTables))), '_') - } - - if(is.null(testFile)){ - renderedSql <- SqlRender::loadRenderTranslateSql(sqlFilename = "PlpResultTables.sql", - packageName = "PatientLevelPrediction", - dbms = targetDialect, - tempEmulationSchema = tempEmulationSchema, - my_schema = resultSchema, - string_to_append = stringAppendToTables - ) - } else { - sql <- readChar(testFile, file.info(testFile)$size) - renderedSql <- SqlRender::render(sql = sql[1], - my_schema = resultSchema, - string_to_append = stringAppendToTables) - renderedSql <- SqlRender::translate(sql = renderedSql, - targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - - } - - DatabaseConnector::executeSql(conn, renderedSql) - } - -} - -# could add cohortDatabaseSchema and cohortTable as inputs below, plus database table - -#' Populate the PatientLevelPrediction results tables -#' @description -#' This function formats and uploads results that have been generated via an ATLAS prediction package into a database -#' -#' @details -#' This function can be used upload PatientLevelPrediction results into a database -#' -#' @param conn A connection to a database created by using the -#' function \code{connect} in the -#' \code{DatabaseConnector} package. -#' @param resultSchema (string) The name of the database schema with the result tables. 
-#' @param stringAppendToTables (string) A string that appends to the PatientLevelPrediction result tables -#' @param targetDialect (string) The database management system being used -#' @param tempEmulationSchema (string) The temp schema used when the database management system is oracle -#' @param packageName (string) The name of the ATLAS R package used to generate the results (this is used to extract cohort jsons) -#' @param studyJsonList (list) A list of lists per cohort with the cohort_name, cohort_id and cohort_json -#' @param studyName (string) A reference study name -#' @param studyDescription (string) A description of the study -#' @param researcherName (string) Name of the researcher who developed the study -#' @param researcherEmail (string) Email of the researcher who developed the study -#' @param researcherOrg (string) Organisation of the researcher who developed the study -#' @param databaseName (string) name of the database used to develop the model/s -#' @param databaseAcronym (string) acronym of the database used to develop the model/s -#' @param databaseVersion (int) Version of the database used to develop the model/s -#' @param databaseDescription (string) Description of the database used to develop the model/s -#' @param databaseType (string) Type of the database used to develop the model/s (e.g., claims) -#' @param valDatabases (list) A named list with details of the external validation databases. Needs to contain: name, description, version, type. 
-#' @param resultLocation (string) location of directory where the main package results were saved -#' @param resultPattern (string) A string to match to select models of interest -#' @param validationLocation (string) location of directory where the validation package results were saved -#' @param addInternalValidation (boolean) Whether the internval validation results should be uploaded -#' @param addExternalValidation (boolean) Whether the externval validation results should be uploaded -#' @param gsubVal (string) Remove patterns from the result name -#' @param removePattern (string) Restrict to result names with this pattern -#' -#' @return -#' Returns NULL but uploads all the results in resultLocation to the PatientLevelPrediction result tables in resultSchema -#' -#' @export -populatePlpResultTables <- function(conn, - resultSchema, - stringAppendToTables = '', - targetDialect = 'postgresql', - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), - packageName, - studyJsonList, - studyName = '', - studyDescription = '', - researcherName = '', - researcherEmail = '', - researcherOrg = '', - databaseName = NULL, - databaseAcronym = NULL, - databaseVersion = 1, - databaseDescription = NULL, - databaseType = NULL, - valDatabases = list(ccae = list(name = 'CCAE', - description = '', - version = 1, - type = 'US Claims')), - resultLocation = NULL, - resultPattern = '', - validationLocation = file.path(resultLocation,'Validation'), - addInternalValidation = T, - addExternalValidation = T, - gsubVal = NULL, - removePattern = NULL -){ - - ensure_installed("jsonlite") - - # input checks - ##TODO - if(base::missing(packageName)){ - if(base::missing(studyJsonList)){ - stop('Need either packageName or studyJsonList') - }else{ - if(is.null(studyJsonList)){ - stop('studyJsonList needs to be non-null') - } - cohortType <- 'list' - jsonInput <- studyJsonList - } - } else{ - if(is.null(packageName)){ - stop('packageName needs to be non-null') - } - cohortType <- 
'package' - jsonInput <- packageName - } - - if(stringAppendToTables != ''){ - stringAppendToTables <- paste0(toupper(gsub('_','',gsub(' ','', stringAppendToTables))), '_') - } - - studyId <- addStudy(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - studyName = studyName, - studyDescription = studyDescription, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - - ParallelLogger::logInfo(paste0('studyId: ', studyId)) - - researcherId <- addResearcher(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - researcherName = researcherName, - researcherEmail = researcherEmail, - researcherOrg = researcherOrg, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - - ParallelLogger::logInfo(paste0('researcherId: ', researcherId)) - - dbId <- addDatabase(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - databaseName = databaseName, - databaseAcronym = databaseAcronym, - databaseVersion = databaseVersion, - databaseDescription = databaseDescription, - databaseType = databaseType, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - ParallelLogger::logInfo(paste0('dbId: ', dbId)) - - mdls <- dir(resultLocation, pattern = resultPattern) - removeMdls <- union(grep('.csv', mdls),grep('.txt', mdls)) - if(length(removeMdls)>0){ - mdls <- mdls[-removeMdls] - } - - # remove pattern - if(!is.null(removePattern)){ - mdls <- mdls[-grep(removePattern, mdls)] - } - - for(modelRes in mdls){ - ParallelLogger::logInfo(paste0('Adding results for model @ ', modelRes)) - - # TODO edit csv here - mdl <- tryCatch( - {PatientLevelPrediction::loadPlpResult(file.path(resultLocation, modelRes, 'plpResult'))}, - error = function(e){ParallelLogger::logInfo(e);return(NULL)} - ) - - if(!is.null(mdl)){ - - # add TAR - tarId <- addTar(conn = conn, - resultSchema = resultSchema, - targetDialect = 
targetDialect, - startDay = mdl$model$settings$populationSettings$riskWindowStart, - startAnchor = mdl$model$settings$populationSettings$startAnchor, - endDay = mdl$model$settings$populationSettings$riskWindowEnd, - endAnchor = mdl$model$settings$populationSettings$endAnchor, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema - ) - ParallelLogger::logInfo(paste0('tarId: ', tarId)) - - tId <- addCohort(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - jsonInput = jsonInput, type = cohortType, - cohortId = mdl$model$trainDetails$cohortId, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - ParallelLogger::logInfo(paste0('tId: ', tId)) - - oId <- addCohort(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - jsonInput = jsonInput, type = cohortType, - cohortId = mdl$model$trainDetails$outcomeId, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - ParallelLogger::logInfo(paste0('oId: ', oId)) - - popSetId <- addPopulationSetting(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - json = mdl$model$settings$populationSettings, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - ParallelLogger::logInfo(paste0('popSetId: ', popSetId)) - - covSetId <- addCovariateSetting(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - json = mdl$model$settings$covariateSettings, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - ParallelLogger::logInfo(paste0('covSetId: ', covSetId)) - - modSetId <- addModelSetting(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - modelType = mdl$model$settings$modelSettings$model, - json = mdl$model$settings$modelSettings, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = 
tempEmulationSchema) - ParallelLogger::logInfo(paste0('modSetId: ', modSetId)) - - # NEW: add plp_data_settings - plpDataSetId <- addPlpDataSetting( - conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - json = mdl$model$settings$plpDataSettings, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema - ) - ParallelLogger::logInfo(paste0('plpDataSetId: ', plpDataSetId)) - - # NEW: add FE_settings - FESetId <- addFESetting( - conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - json = mdl$model$settings$featureEngineering, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema - ) - ParallelLogger::logInfo(paste0('FESetId: ', FESetId)) - - # NEW: add sample_settings - sampleSetId <- addSampleSetting( - conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - json = mdl$model$settings$sampleSettings, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema - ) - ParallelLogger::logInfo(paste0('sampleSetId: ', sampleSetId) - ) - - # NEW: add tidy_covariate_settings - tidySetId <- addTidySetting( - conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - json = mdl$model$settings$tidyCovariates, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema - ) - ParallelLogger::logInfo(paste0('tidySetId: ', tidySetId)) - - - # this is now split setting - update this function - splitId <- addSplitSettings( - conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - json = mdl$model$settings$splitSettings, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema - ) - ParallelLogger::logInfo(paste0('splitId: ', splitId)) - - # create this function - modelDesignId <- addModelDesign( # need to create - conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - targetId = 
tId, - outcomeId = oId, - tarId = tarId, - plpDataSettingId = plpDataSetId, - populationSettingId = popSetId, - modelSettingId = modSetId, - covariateSettingId = covSetId, - sampleSettingId = sampleSetId, - splitSettingId = splitId, # changed from trainingId - featureEngineeringSettingId = FESetId, - tidyCovariatesSettingId = tidySetId, - researcherId = researcherId, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema - ) - ParallelLogger::logInfo(paste0('modelDesignId: ', modelDesignId)) - - # create this function - modelId <- addModel( - conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - analysisId = mdl$model$trainDetails$analysisId, # trainDetails - modelDesignId = modelDesignId, - researcherId = researcherId, - databaseId = dbId, - hyperParamSearch = mdl$model$trainDetails$hyperParamSearch, #mdl$trainDetails$hyperParamSearch - plpModelFile = " ", - executionDateTime = format(mdl$executionSummary$ExecutionDateTime, format="%Y-%m-%d"), #mdl$trainDetails$trainingDate - trainingTime = mdl$model$trainDetails$trainingTime, #mdl$trainDetails$trainingTime - intercept = ifelse(is.list(mdl$model), mdl$model$model$coefficients[1], 0), - requireDenseMatrix = mdl$model$settings$requireDenseMatrix, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema - ) - ParallelLogger::logInfo(paste0('modelId: ', modelId)) - - # add modelId and studyId - addStudiesModel(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - studyId = studyId, - modelId = modelId, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - - # add internalValication - if(addInternalValidation){ - - ParallelLogger::logInfo('Adding internal validation results') - - ##if exists - if(!is.null(mdl)){ - # add attrition here... 
- resultId <- addResult(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - modelId = modelId, - researcherId = researcherId, - databaseId = dbId, - targetId = tId, - outcomeId = oId, - tarId = tarId, - restrictPlpDataSettingId = plpDataSetId, - populationSettingId = popSetId, - executionDateTime = format(mdl$executionSummary$ExecutionDateTime, format="%Y-%m-%d"), - plpVersion = mdl$executionSummary$PackageVersion$packageVersion, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - ParallelLogger::logInfo(paste0('resultId: ', resultId)) - - # add attriition - if(!is.null(mdl$model$trainDetails$attrition)){ - addAttrition( - conn = conn, resultSchema = resultSchema, targetDialect = targetDialect, - resultId = resultId, - attrition = mdl$model$trainDetails$attrition, - overWriteIfExists = T, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema - ) - } - - # add eval - if(!is.null(mdl$performanceEvaluation)){ - addEvaluation(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect, - resultId = resultId, - performanceEvaluation = mdl$performanceEvaluation, - overWriteIfExists = T, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - } - if(!is.null(mdl$covariateSummary)){ - addCovariateSummary(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect, - resultId = resultId, - covariateSummary = mdl$covariateSummary, - restrictToIncluded = T, - overWriteIfExists = T, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - } - - } - } - - # add validation results for this model - if(addExternalValidation){ - - ParallelLogger::logInfo('Adding external validation results') - - - if(is.null(validationLocation)){ - validationLocation <- file.path(resultLocation, 'validation') - } - valDbs <- dir(validationLocation) - - # restrict to the databases with info - 
valDbs <- valDbs[valDbs%in%names(valDatabases)] - - if(length(valDbs)>0){ - - valDbs <- valDbs[!valDbs %in% c('plplog.txt')] - - for(valDb in valDbs){ - - #get valDbId - valDbId <- addDatabase(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - databaseName = valDatabases[[valDb]]$name, - databaseAcronym = valDb, - databaseVersion = valDatabases[[valDb]]$version, - databaseDescription = valDatabases[[valDb]]$description, - databaseType =valDatabases[[valDb]]$type, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - - validationResults <- as.character(dir(file.path(validationLocation, valDb))) - validationResults <- validationResults[validationResults != 'CohortCounts.csv'] - - valMods <- data.frame(validationResults = validationResults) - - if(!is.null(gsubVal)){ - valModsEdit <- valMods$validationResults - for(i in 1:nrow(gsubVal)){ - valModsEdit <- gsub(x = valModsEdit, pattern = gsubVal[i,1], replacement = gsubVal[i,2]) - } - valMods$validationResultsEdit <- valModsEdit - }else{ - valMods$validationResultsEdit <- valMods$validationResults - } - - # remove pattern - if(!is.null(removePattern)){ - if(length(grep(removePattern, valMods$validationResultsEdit))>0){ - valMods <- valMods[-grep(removePattern, valMods$validationResultsEdit),] - } - } - - # restrict to analysis - ParallelLogger::logInfo(paste0('restricting to ', modelRes)) - valMods <- valMods[grep(modelRes, valMods$validationResultsEdit),] - - if(nrow(valMods)>0){ - - # load each result - for(valInd in 1:nrow(valMods)){ - - #resultName <- dir(file.path(validationLocation, valDb, valMods$validationResults[valInd])) - #resultName <- resultName[grep('.rds',resultName)] - ParallelLogger::logInfo(paste0('Loading validation at:', file.path(validationLocation, valDb, valMods$validationResults[valInd], 'validationResult' ))) - vmdl <- tryCatch( - {PatientLevelPrediction::loadPlpResult(file.path(validationLocation, valDb, 
valMods$validationResults[valInd], 'validationResult' ))}, - error = function(e){ParallelLogger::logInfo(e); return(NULL)} - ) - - - if(!is.null(vmdl)){ - tId <- addCohort(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - jsonInput = jsonInput, type = cohortType, - #cohortId = vmdl$model$validationDetails$cohortId, - cohortId = ifelse( - !is.null(vmdl$model$validationDetails$cohortId), - vmdl$model$validationDetails$cohortId, - vmdl$prediction$cohortId[1] - ), - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - oId <- addCohort(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - jsonInput = jsonInput, type = cohortType, - cohortId = vmdl$model$validationDetails$outcomeId, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - - # get tarId (added) - tarId <- addTar( - conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - startDay = vmdl$model$validationDetails$populationSettings$riskWindowStart, - startAnchor = vmdl$model$validationDetails$populationSettings$startAnchor, - endDay = vmdl$model$validationDetails$populationSettings$riskWindowEnd, - endAnchor = vmdl$model$validationDetails$populationSettings$endAnchor, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema - ) - - # popSetId (added) - popSetId <- addPopulationSetting( - conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - json = vmdl$model$validationDetails$populationSettings, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema - ) - - - plpDataSetId <- addPlpDataSetting( - conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema, - json = ifelse( - !is.null(vmdl$model$validationDetails$plpDataSettings), - 
vmdl$model$validationDetails$plpDataSettings, - vmdl$model$settings$plpDataSettings - ) - ) - - # add result - resultId <- addResult(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - modelId = modelId, - researcherId = researcherId, - databaseId = valDbId, - targetId = tId, - outcomeId = oId, - tarId = tarId, - restrictPlpDataSettingId = plpDataSetId, - populationSettingId = popSetId, - executionDateTime = format(vmdl$executionSummary$ExecutionDateTime, format="%Y-%m-%d"), - plpVersion = vmdl$executionSummary$PackageVersion$packageVersion, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - - - - # add attrition - if(!is.null(vmdl$model$validationDetails$attrition)){ - addAttrition( - conn = conn, resultSchema = resultSchema, targetDialect = targetDialect, - resultId = resultId, - attrition = vmdl$model$validationDetails$attrition, - overWriteIfExists = T, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema - ) - } - - # add performance - #============= - if(!is.null(vmdl$performanceEvaluation)){ - addEvaluation(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect, - resultId = resultId, - performanceEvaluation = vmdl$performanceEvaluation, - overWriteIfExists = T, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - } - - if(!is.null(vmdl$covariateSummary)){ - addCovariateSummary(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect, - resultId = resultId, - covariateSummary = vmdl$covariateSummary, - restrictToIncluded = T, - overWriteIfExists = T, - stringAppendToTables = stringAppendToTables, - tempEmulationSchema = tempEmulationSchema) - } - - } - - #+++++++++++++ - - - } # end val per database - } # end if val exists - - } # val database - } - - } #externalVal - - - } #model not null - - } # per model - -} #end funct - - -#====================== -# HELPER FUNCTIONS 
-#====================== -enc <- function(x){ - return(paste0("'", x, "'")) -} - -cleanNum <- function(x){ - types <- unlist(lapply(1:ncol(x), function(i) class(x[,i]))) - - ids <- which(types%in% c("numeric", "integer" )) - - for(id in ids){ - okVals <- is.finite(x[,id]) - - if(sum(okVals)!=length(okVals)){ - x[!okVals,id] <- NA - } - - } - return(x) -} - -checkTable <- function(conn, - resultSchema, - stringAppendToTables = '', - targetDialect, - tableName, - columnNames, - values, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - vals <- paste0(paste0(columnNames," = ", values), collapse = " and ") - - sql <- "SELECT * from @my_schema.@string_to_append@table where @input_vals;" - sql <- SqlRender::render(sql, - my_schema = resultSchema, - table = tableName, - input_vals = vals, - string_to_append = stringAppendToTables) - sql <- SqlRender::translate(sql, targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - result <- DatabaseConnector::querySql(conn, sql, snakeCaseToCamelCase = T) - - return(result) -} - - -checkJson <- function(conn, - resultSchema, - stringAppendToTables = '', - targetDialect, - tableName, - jsonColumnName, - id, - json, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - sql <- "SELECT * from @my_schema.@string_to_append@table;" - sql <- SqlRender::render(sql, - my_schema = resultSchema, - table = tableName, - string_to_append = stringAppendToTables) - sql <- SqlRender::translate(sql, targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - result <- DatabaseConnector::querySql(conn, sql, snakeCaseToCamelCase = T) - - resultId <- NULL - if(nrow(result)>0){ - colId <- result[,jsonColumnName] == json - if(sum(colId)>0){ - resultId <- result[colId,id][1] - } - } - - return(resultId) -} - -# gets the column names in camelCase of a table -getColumnNames <- function(conn, resultSchema, targetDialect, tableName, stringAppendToTables = '', - tempEmulationSchema 
= getOption("sqlRenderTempEmulationSchema") -){ - sql <- "select top 1 * from @my_schema.@string_to_append@table;" - sql <- SqlRender::render(sql, - my_schema = resultSchema, - table = tableName, - string_to_append = stringAppendToTables) - sql <- SqlRender::translate(sql, targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - result <- DatabaseConnector::querySql(connection = conn, sql = sql, snakeCaseToCamelCase = T) - - return(colnames(result)) -} - -# True/False check whether results exist in table -checkResultExists <- function(conn, resultSchema, targetDialect, - snakeCaseToCamelCase, - tableName, - resultId, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - sql <- "select * from @my_schema.@table where result_id = @result_id;" - sql <- SqlRender::render(sql, - my_schema = resultSchema, - table = tableName, - result_id = resultId) - sql <- SqlRender::translate(sql, targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - result <- DatabaseConnector::querySql(connection = conn, sql = sql, snakeCaseToCamelCase = T) - return(nrow(result)>0) -} - - -#====================== -# end helpers -addStudy <- function(conn, resultSchema, targetDialect, - studyName, studyDescription, - stringAppendToTables = '', - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - result <- checkTable(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'studies', - columnNames = c('study_name', 'study_description'), - values = c(paste0("'",studyName,"'"), - paste0("'",studyDescription,"'") - ), - tempEmulationSchema = tempEmulationSchema - ) - - if(nrow(result)>0){ - ParallelLogger::logInfo('Study already exists') - } - else{ - ParallelLogger::logInfo(paste0('Adding new study: ', studyName )) - - # add my detail - sql <- "INSERT INTO @my_schema.@string_to_appendstudies(study_name, study_description) - VALUES 
('@name','@desc');" - sql <- SqlRender::render(sql, - my_schema = resultSchema, - name = studyName, - desc = studyDescription, - string_to_append = stringAppendToTables - ) - - sql <- SqlRender::translate(sql, targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - - DatabaseConnector::executeSql(conn, sql) - - result <- checkTable(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'studies', - columnNames = c('study_name', 'study_description'), - values = c(paste0("'",studyName,"'"), - paste0("'",studyDescription,"'") - ), - tempEmulationSchema = tempEmulationSchema - ) - - } - - return(result$studyId[1]) - -} - -addStudiesModel <- function(conn, - resultSchema, - targetDialect, - studyId, - modelId, - stringAppendToTables, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - - result <- checkTable(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'study_models', - columnNames = c('study_id', 'model_id'), - values = c(studyId, modelId), - tempEmulationSchema = tempEmulationSchema - ) - - if(nrow(result)>0){ - ParallelLogger::logInfo('Study and model already linked') - } - else{ - ParallelLogger::logInfo(paste0('Adding link between study: ', studyId, ' and model: ', modelId )) - - # add my detail - sql <- "INSERT INTO @my_schema.@string_to_appendstudy_models(study_id, model_id) - VALUES ('@studyid','@modelid');" - sql <- SqlRender::render(sql, - my_schema = resultSchema, - studyid = studyId, - modelid = modelId, - string_to_append = stringAppendToTables - ) - - sql <- SqlRender::translate(sql = sql, targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - - DatabaseConnector::executeSql(conn, sql) - - result <- checkTable(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = 
targetDialect, - tableName = 'study_models', - columnNames = c('study_id', 'model_id'), - values = c(studyId, modelId), - tempEmulationSchema = tempEmulationSchema - ) - - } - - return(invisible(result$studyId[1])) - -} - - -addResearcher <- function(conn, resultSchema, targetDialect, - stringAppendToTables = '', - researcherName, - researcherEmail, - researcherOrg, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - result <- checkTable(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'researchers', - columnNames = c('researcher_name', 'researcher_email', 'researcher_affiliation'), - values = c(paste0("'",researcherName,"'"), - paste0("'",researcherEmail,"'"), - paste0("'",researcherOrg,"'")), - tempEmulationSchema = tempEmulationSchema - ) - - if(nrow(result)>0){ - ParallelLogger::logInfo('Researcher already exists') - } - else{ - ParallelLogger::logInfo(paste0('Adding Researcher: ', researcherName )) - - # add my detail - sql <- "INSERT INTO @my_schema.@string_to_appendresearchers(researcher_name, researcher_email, researcher_affiliation) - VALUES ('@name','@email', '@org');" - sql <- SqlRender::render(sql, - my_schema = resultSchema, - name = researcherName, - email = researcherEmail, - org = researcherOrg, - string_to_append = stringAppendToTables - ) - - sql <- SqlRender::translate(sql = sql, targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - - DatabaseConnector::executeSql(conn, sql) - - result <- checkTable(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'researchers', - columnNames = c('researcher_name', 'researcher_email', 'researcher_affiliation'), - values = c(paste0("'",researcherName,"'"), - paste0("'",researcherEmail,"'"), - paste0("'",researcherOrg,"'")), - tempEmulationSchema = tempEmulationSchema - ) - - } - - 
return(result$researcherId[1]) - -} - - -addDatabase <- function(conn, resultSchema, targetDialect, - stringAppendToTables = '', - databaseName, - databaseAcronym, - databaseVersion = 1, - databaseDescription, - databaseType, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - result <- checkTable(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'database_details', - columnNames = c('database_name', 'database_acronym', - 'database_version', - 'database_description', 'database_type'), - values = c(paste0("'",databaseName,"'"), - paste0("'",databaseAcronym,"'"), - databaseVersion, - paste0("'",databaseDescription,"'"), - paste0("'",databaseType,"'")), - tempEmulationSchema = tempEmulationSchema - ) - - if(nrow(result)>0){ - ParallelLogger::logInfo(paste0('Database ', databaseName ,' already exists')) - } else { - - sql <- "INSERT INTO @my_schema.@string_to_appenddatabase_details(database_name, database_acronym, - database_version, - database_description, database_type) - VALUES ('@dbName','@db', @version, '@desc', '@type');" - sql <- SqlRender::render(sql, - my_schema = resultSchema, - dbName = databaseName, - db = databaseAcronym, - version = databaseVersion, - desc = databaseDescription, - type = databaseType, - string_to_append = stringAppendToTables) - sql <- SqlRender::translate(sql, targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - DatabaseConnector::executeSql(conn, sql) - - result <- checkTable(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'database_details', - columnNames = c('database_name', 'database_acronym', 'database_version', - 'database_description', 'database_type'), - values = c(paste0("'",databaseName,"'"), - paste0("'",databaseAcronym,"'"), - databaseVersion, - paste0("'",databaseDescription,"'"), - 
paste0("'",databaseType,"'")), - tempEmulationSchema = tempEmulationSchema - ) - - } - - return(result$databaseId[1]) - -} - - -addTar <- function(conn, resultSchema, targetDialect, - stringAppendToTables = '', - startDay, - startAnchor, - endDay, - endAnchor, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - result <- checkTable(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'tars', - columnNames = c('tar_start_day', 'tar_start_anchor', - 'tar_end_day', 'tar_end_anchor'), - values = c(startDay, - paste0("'",startAnchor,"'"), - endDay, - paste0("'",endAnchor,"'")), - tempEmulationSchema = tempEmulationSchema - ) - - if(nrow(result)==0){ - - ParallelLogger::logInfo('Adding TAR') - # tars - id 1 - sql <- "INSERT INTO @my_schema.@string_to_appendtars(tar_start_day, tar_start_anchor, - tar_end_day, tar_end_anchor) - VALUES (@tar_start_day, @tar_start_anchor, @tar_end_day, @tar_end_anchor);" - sql <- SqlRender::render(sql, - my_schema = resultSchema, - tar_start_day = startDay, - tar_start_anchor = paste0("'",startAnchor,"'"), - tar_end_day = endDay, - tar_end_anchor = paste0("'",endAnchor,"'"), - string_to_append = stringAppendToTables) - - sql <- SqlRender::translate(sql, targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - - DatabaseConnector::executeSql(conn, sql) - - #getId of new - result <- checkTable(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'tars', - columnNames = c('tar_start_day', 'tar_start_anchor', - 'tar_end_day', 'tar_end_anchor'), - values = c(startDay, - paste0("'",startAnchor,"'"), - endDay, - paste0("'",endAnchor,"'")), - tempEmulationSchema = tempEmulationSchema - ) - - } else { - ParallelLogger::logInfo('TAR exists') - } - - - return(result$tarId[1]) - -} - - - - - -getCohortFromList <- function(jsonList, cohortId){ - 
- #cohort_name, cohort_id and cohort_json - ParallelLogger::logInfo(paste0('Adding cohorts from input list')) - id <- which(unlist(lapply(jsonList, function(x){x$cohort_id == cohortId})))[1] - - json <- jsonList[[id]]$cohort_json - - details <- data.frame( - cohortName = jsonList[[id]]$cohort_name, - cohortId = jsonList[[id]]$cohort_id, - webApiCohortId = jsonList[[id]]$cohort_id - ) - - return(list(json = json, - cohortTocreate = details)) -} - -# this can be simplified now we use cohort id as the json file name: -getCohortFromPackage <- function(packageName, cohortId){ - - ParallelLogger::logInfo(paste0('Adding cohorts from ', packageName)) - # check packageName - if(!dir.exists(system.file(package = packageName))){ - stop('Package path not found - set pckPath input to the location of the study package you executed') - } else { - - ParallelLogger::logInfo(paste0('Extracting cohort ',cohortId,' json from ', packageName)) - # check required files: - cohortToCreateLoc <- system.file('Cohorts.csv', # updated for new skeleton - package = packageName) - - if(!file.exists(cohortToCreateLoc)){ - stop('No Cohorts.csv in package') - } - - if(!dir.exists(file.path(system.file(package = packageName), 'cohorts'))){ - stop('No cohorts in package') - } - } - - - # add the cohorts and store the map atlas_id, cohort_id, cohort_name - cohortsToCreate <- utils::read.csv(cohortToCreateLoc) - cohortTocreate <- cohortsToCreate[cohortsToCreate$cohortId == cohortId,] - - jsonFileName <- file.path(system.file(package = packageName), 'cohorts', paste0(cohortTocreate$cohortId, '.json')) - json <- readChar(jsonFileName, file.info(jsonFileName)$size) - - - return(list(json = json, - cohortTocreate = cohortTocreate)) -} - - -# adds json from package unless json is specified -addCohort <- function(conn, resultSchema, targetDialect, - stringAppendToTables = '', - jsonInput, type = 'package', - cohortId, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - if(type == 
'package'){ - object <- getCohortFromPackage(packageName = jsonInput, cohortId) - } else{ - object <- getCohortFromList(jsonList = jsonInput, cohortId) - } - - json <- object$json - cohortTocreate <- object$cohortTocreate - - # make sure the json has been converted - if(class(json)!='character'){ - ParallelLogger::logInfo('converting json to character') - json <- jsonlite::serializeJSON(json, digits = 23) - } - - # reduce the size to save - json <- substr(json, 1, 4000) # TESTING - FIX THIS [TODO] - - #check whether cohort already in table: - result <- checkTable(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'cohorts', - columnNames = c('cohort_name', 'atlas_id'), - values = c(paste0("'",cohortTocreate$cohortName[1],"'"), cohortTocreate$cohortId[1]), - tempEmulationSchema = tempEmulationSchema - ) - - addNew <- F - if(nrow(result)>0){ - addNew <- json %in% result$cohortJson - ParallelLogger::logInfo(paste0('json in jsons:', addNew)) - } - - if(addNew){ - ParallelLogger::logInfo(paste0('Cohort ',cohortTocreate$cohortName,' exists in result database with id', result$cohortId)) - } else{ - ParallelLogger::logInfo(paste0('Adding cohort ',cohortTocreate$cohortName[1])) - - data <- data.frame(cohortName = cohortTocreate$cohortName, - atlasId = cohortTocreate$cohortId, - cohortJson = json) - DatabaseConnector::insertTable(connection = conn, - databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables, 'cohorts'), - data = data, - dropTableIfExists = F, - createTable = F, - tempTable = F, - progressBar = T, - camelCaseToSnakeCase = T, - tempEmulationSchema = tempEmulationSchema - ) - - # now check and get id - result <- checkTable(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'cohorts', - columnNames = c('cohort_name', 'atlas_id'), - values = 
c(paste0("'",cohortTocreate$cohortName,"'"), cohortTocreate$cohortId), - tempEmulationSchema = tempEmulationSchema - ) - - jsonInd <- result$cohortJson %in% json - result <- result[jsonInd,] - - } - - return(result$cohortId[1]) -} - - -addPopulationSetting <- function(conn, resultSchema, targetDialect, - stringAppendToTables = '', - json, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - # process json to make it ordered... - # make sure the json has been converted - if(class(json)!='character'){ - json <- as.character(jsonlite::serializeJSON(json, digits = 23)) - } - - jsonId <- checkJson(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'population_settings', - jsonColumnName = 'populationSettingsJson', - id = 'populationSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema) - - if(is.null(jsonId)){ - ParallelLogger::logInfo('Adding new population settings') - - data <- data.frame(populationSettingsJson = json) - DatabaseConnector::insertTable(connection = conn, - databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables, 'population_settings'), - data = data, - dropTableIfExists = F, - createTable = F, - tempTable = F, - progressBar = T, - camelCaseToSnakeCase = T, - tempEmulationSchema = tempEmulationSchema - ) - - #getId of new - jsonId <- checkJson(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'population_settings', - jsonColumnName = 'populationSettingsJson', - id = 'populationSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema) - - } else{ - ParallelLogger::logInfo('Population settings exists') - } - - return(jsonId) -} - - -addCovariateSetting <- function(conn, resultSchema, targetDialect, - stringAppendToTables = '', - json, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - # 
process json to make it ordered... - # make sure the json has been converted - if(class(json)!='character'){ - # this code created character that is too long for redshfit - #json <- as.character(jsonlite::serializeJSON(json, digits = 23)) - # add attributes - if(class(json) == 'covariateSettings'){ - json <- list(json) - } - json <- lapply(json, addAttributes) - #convert - json <- jsonlite::toJSON( - x = json, - pretty = T, - digits = 23, - auto_unbox=TRUE, - null = "null" - ) - json <- as.character(json) # now convert to character - print(nchar(json)) - } - - jsonId <- checkJson(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'covariate_settings', - jsonColumnName = 'covariateSettingsJson', - id = 'covariateSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema) - - if(is.null(jsonId)){ - - ParallelLogger::logInfo('Adding new covariate settings') - - data <- data.frame(covariateSettingsJson = json) - DatabaseConnector::insertTable(connection = conn, - databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables, 'covariate_settings'), - data = data, - dropTableIfExists = F, - createTable = F, - tempTable = F, - progressBar = T, - camelCaseToSnakeCase = T, - tempEmulationSchema = tempEmulationSchema - ) - - #getId of new - jsonId <- checkJson(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'covariate_settings', - jsonColumnName = 'covariateSettingsJson', - id = 'covariateSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema) - - } else{ - ParallelLogger::logInfo('Covariate setting exists') - } - - return(jsonId) -} - - -addModelSetting <- function(conn, resultSchema, targetDialect, - stringAppendToTables = '', - modelType, json, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - # process json to make it ordered... 
- # make sure the json has been converted - if(class(json)!='character'){ - json <- as.character(jsonlite::serializeJSON(json, digits = 23)) - } - - jsonId <- checkJson(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'model_settings', - jsonColumnName = 'modelSettingsJson', - id = 'modelSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema) - - if(is.null(jsonId)){ - - ParallelLogger::logInfo('Adding new model settings') - - data <- data.frame(modelType = modelType, - modelSettingsJson = json) - DatabaseConnector::insertTable(connection = conn, - databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables, 'model_settings'), - data = data, - dropTableIfExists = F, - createTable = F, - tempTable = F, - progressBar = T, - camelCaseToSnakeCase = T, - tempEmulationSchema = tempEmulationSchema) - - #getId of new - jsonId <- checkJson(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'model_settings', - jsonColumnName = 'modelSettingsJson', - id = 'modelSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema) - - } else{ - ParallelLogger::logInfo('Model setting exists') - } - - return(jsonId) -} - -addTidySetting <- function( - conn, - resultSchema, - targetDialect, - stringAppendToTables = '', - json, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - if(class(json)!='character'){ - - #modify to make smaller but keep key part - json$deletedInfrequentCovariateIds <- c() - json$normFactors <- json$normFactors %>% dplyr::filter(.data$maxValue !=1) - - json <- as.character(jsonlite::serializeJSON(json, digits = 23)) - } - - jsonId <- checkJson(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'tidy_covariates_settings', - jsonColumnName = 
'tidyCovariatesSettingsJson', - id = 'tidyCovariatesSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema) - - if(is.null(jsonId)){ - - ParallelLogger::logInfo('Adding new tidy covariates settings') - - data <- data.frame( - tidyCovariatesSettingsJson = json - ) - - DatabaseConnector::insertTable( - connection = conn, - databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables, 'tidy_covariates_settings'), - data = data, - dropTableIfExists = F, - createTable = F, - tempTable = F, - progressBar = T, - camelCaseToSnakeCase = T, - tempEmulationSchema = tempEmulationSchema - ) - - #getId of new - jsonId <- checkJson( - conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'tidy_covariates_settings', - jsonColumnName = 'tidyCovariatesSettingsJson', - id = 'tidyCovariatesSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema - ) - - } else{ - ParallelLogger::logInfo('tidy covariates setting exists') - } - - return(jsonId) - -} - -addSampleSetting <- function( - conn, - resultSchema, - targetDialect, - stringAppendToTables = '', - json, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - if(class(json)!='character'){ - json <- as.character(jsonlite::serializeJSON(json, digits = 23)) - } - - jsonId <- checkJson( - conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'sample_settings', - jsonColumnName = 'sampleSettingsJson', - id = 'sampleSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema - ) - - if(is.null(jsonId)){ - - ParallelLogger::logInfo('Adding new sample settings') - - data <- data.frame( - sampleSettingsJson = json - ) - - DatabaseConnector::insertTable( - connection = conn, - databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables, 'sample_settings'), - data = data, - dropTableIfExists = F, 
- createTable = F, - tempTable = F, - progressBar = T, - camelCaseToSnakeCase = T, - tempEmulationSchema = tempEmulationSchema - ) - - #getId of new - jsonId <- checkJson( - conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'sample_settings', - jsonColumnName = 'sampleSettingsJson', - id = 'sampleSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema - ) - - } else{ - ParallelLogger::logInfo('sample setting exists') - } - - return(jsonId) - -} - -addPlpDataSetting <- function( - conn, - resultSchema, - targetDialect, - stringAppendToTables = '', - json, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - if(class(json)!='character'){ - json <- as.character(jsonlite::serializeJSON(json, digits = 23)) - } - - jsonId <- checkJson(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'plp_data_settings', - jsonColumnName = 'plpDataSettingsJson', - id = 'plpDataSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema) - - if(is.null(jsonId)){ - - ParallelLogger::logInfo('Adding new plp data settings') - - data <- data.frame( - plpDataSettingsJson = json - ) - - DatabaseConnector::insertTable( - connection = conn, - databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables, 'plp_data_settings'), - data = data, - dropTableIfExists = F, - createTable = F, - tempTable = F, - progressBar = T, - camelCaseToSnakeCase = T, - tempEmulationSchema = tempEmulationSchema - ) - - #getId of new - jsonId <- checkJson( - conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'plp_data_settings', - jsonColumnName = 'plpDataSettingsJson', - id = 'plpDataSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema - ) - - } else{ - ParallelLogger::logInfo('Split 
setting exists') - } - - return(jsonId) - -} - -addFESetting <- function( - conn, - resultSchema, - targetDialect, - stringAppendToTables = '', - json, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - if(class(json)!='character'){ - json <- as.character(jsonlite::serializeJSON(json, digits = 23)) - } - - jsonId <- checkJson( - conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'feature_engineering_settings', - jsonColumnName = 'featureEngineeringSettingsJson', - id = 'featureEngineeringSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema - ) - - if(is.null(jsonId)){ - - ParallelLogger::logInfo('Adding new feature_engineering settings') - - data <- data.frame( - featureEngineeringSettingsJson = json - ) - - DatabaseConnector::insertTable( - connection = conn, - databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables, 'feature_engineering_settings'), - data = data, - dropTableIfExists = F, - createTable = F, - tempTable = F, - progressBar = T, - camelCaseToSnakeCase = T, - tempEmulationSchema = tempEmulationSchema - ) - - #getId of new - jsonId <- checkJson( - conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'feature_engineering_settings', - jsonColumnName = 'featureEngineeringSettingsJson', - id = 'featureEngineeringSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema - ) - - } else{ - ParallelLogger::logInfo('feature engineering setting exists') - } - - return(jsonId) - -} - -addSplitSettings <- function( - conn, - resultSchema, - targetDialect, - stringAppendToTables = '', - json, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - if(class(json)!='character'){ - json <- as.character(jsonlite::serializeJSON(json, digits = 23)) - } - - jsonId <- checkJson( - conn = conn, - resultSchema = 
resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'split_settings', - jsonColumnName = 'splitSettingsJson', - id = 'splitSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema - ) - - if(is.null(jsonId)){ - - ParallelLogger::logInfo('Adding new split settings') - - data <- data.frame( - splitSettingsJson = json - ) - - DatabaseConnector::insertTable( - connection = conn, - databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables, 'split_settings'), - data = data, - dropTableIfExists = F, - createTable = F, - tempTable = F, - progressBar = T, - camelCaseToSnakeCase = T, - tempEmulationSchema = tempEmulationSchema - ) - - #getId of new - jsonId <- checkJson( - conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'split_settings', - jsonColumnName = 'splitSettingsJson', - id = 'splitSettingId', - json = json, - tempEmulationSchema = tempEmulationSchema - ) - - } else{ - ParallelLogger::logInfo('Split setting exists') - } - - return(jsonId) - -} - - -addModelDesign <- function( - conn, - resultSchema, targetDialect, - stringAppendToTables = stringAppendToTables, - targetId, - outcomeId, - tarId, - plpDataSettingId, - populationSettingId, - modelSettingId, - covariateSettingId, - sampleSettingId, - splitSettingId, - featureEngineeringSettingId, - tidyCovariatesSettingId, - researcherId, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - if(is.null(targetId)){ - stop('targetId is null') - } - if(is.null(outcomeId)){ - stop('outcomeId is null') - } - if(is.null(tarId)){ - stop('tarId is null') - } - - if(is.null(plpDataSettingId)){ - stop('plpDataSettingId is null') - } - if(is.null(populationSettingId)){ - stop('populationSettingId is null') - } - if(is.null(modelSettingId)){ - stop('modelSettingId is null') - } - if(is.null(covariateSettingId)){ - stop('covariateSettingId is 
null') - } - if(is.null(sampleSettingId)){ - stop('sampleSettingId is null') - } - if(is.null(splitSettingId)){ - stop('splitSettingId is null') - } - if(is.null(featureEngineeringSettingId)){ - stop('featureEngineeringSettingId is null') - } - if(is.null(tidyCovariatesSettingId)){ - stop('tidyCovariatesSettingId is null') - } - - if(is.null(researcherId)){ - stop('researcherId is null') - } - - # process json to make it ordered... - # TODO - - result <- checkTable( - conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'model_designs', - columnNames = c( - 'target_id', - 'outcome_id', - 'tar_id', - 'plp_data_setting_id', - 'population_setting_id', - 'model_setting_id', - 'covariate_setting_id', - 'sample_setting_id', - 'split_setting_id', - 'feature_engineering_setting_id', - 'tidy_covariates_setting_id', - 'researcher_id' - ), - values = c( - targetId, - outcomeId, - tarId, - plpDataSettingId, - populationSettingId, - modelSettingId, - covariateSettingId, - sampleSettingId, - splitSettingId, - featureEngineeringSettingId, - tidyCovariatesSettingId, - researcherId - ), - tempEmulationSchema = tempEmulationSchema - ) - - if(nrow(result)==0){ - # model - sql <- "INSERT INTO @my_schema.@string_to_appendmodel_designs( - target_id, - outcome_id, - tar_id, - plp_data_setting_id, - population_setting_id, - model_setting_id, - covariate_setting_id, - sample_setting_id, - split_setting_id, - feature_engineering_setting_id, - tidy_covariates_setting_id, - researcher_id - ) VALUES - ( - @target_id, - @outcome_id, - @tar_id, - @plp_data_setting_id, - @population_setting_id, - @model_setting_id, - @covariate_setting_id, - @sample_setting_id, - @split_setting_id, - @feature_engineering_setting_id, - @tidy_covariates_setting_id, - @researcher_id - )" - sql <- SqlRender::render( - sql, - my_schema = resultSchema, - target_id = targetId, - outcome_id = outcomeId, - tar_id = tarId, - 
plp_data_setting_id= plpDataSettingId, - population_setting_id = populationSettingId, - model_setting_id = modelSettingId, - covariate_setting_id = covariateSettingId, - sample_setting_id = sampleSettingId, - split_setting_id = splitSettingId, - feature_engineering_setting_id = featureEngineeringSettingId, - tidy_covariates_setting_id = tidyCovariatesSettingId, - researcher_id = researcherId, - string_to_append = stringAppendToTables - ) - sql <- SqlRender::translate(sql, targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - DatabaseConnector::executeSql(conn, sql) - - #getId of new - result <- checkTable(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'model_designs', - columnNames = c( - 'target_id', - 'outcome_id', - 'tar_id', - 'plp_data_setting_id', - 'population_setting_id', - 'model_setting_id', - 'covariate_setting_id', - 'sample_setting_id', - 'split_setting_id', - 'feature_engineering_setting_id', - 'tidy_covariates_setting_id', - 'researcher_id'), - values = c(targetId, - outcomeId, - tarId, - plpDataSettingId, - populationSettingId, - modelSettingId, - covariateSettingId, - sampleSettingId, - splitSettingId, - featureEngineeringSettingId, - tidyCovariatesSettingId, - researcherId), - tempEmulationSchema = tempEmulationSchema - ) - - } - - return(result$modelDesignId[1]) -} - -addModel <- function( - conn, - resultSchema, - targetDialect, - stringAppendToTables = stringAppendToTables, - analysisId, - modelDesignId, - researcherId, - databaseId, - hyperParamSearch, - plpModelFile, - executionDateTime, - trainingTime, - intercept, - requireDenseMatrix, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") -){ - - if(is.null(analysisId)){ - stop('analysisId is null') - } - if(is.null(modelDesignId)){ - stop('modelName is null') - } - if(is.null(researcherId)){ - stop('researcherId is null') - } - if(is.null(databaseId)){ - 
stop('databaseId is null') - } - if(is.null(plpModelFile)){ - stop('plpModelFile is null') - } - if(is.null(executionDateTime)){ - stop('executionDateTime is null') - } - if(is.null(intercept)){ - stop('intercept is null') - } - - if(!is.null(hyperParamSearch)){ - if(class(hyperParamSearch) != 'character'){ - hyperParamSearch <- as.character(jsonlite::serializeJSON(hyperParamSearch, digits = 23)) - } - }else{ - hyperParamSearch <- '' - } - - # process json to make it ordered... - # TODO - - result <- checkTable( - conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'models', - columnNames = c( - 'analysis_id', - 'model_design_id', - 'researcher_id', - 'database_id', - 'hyper_param_search', - 'plp_model_file', - 'execution_date_time', - 'training_time', - 'intercept', - 'require_dense_matrix' - ), - values = c( - enc(analysisId), - modelDesignId, - researcherId, - databaseId, - enc(hyperParamSearch), - enc(plpModelFile), - enc(executionDateTime), - enc(trainingTime), - intercept, - ifelse(requireDenseMatrix, "'T'", "'F'") - ), - tempEmulationSchema = tempEmulationSchema - ) - - if(nrow(result)==0){ - # model - sql <- "INSERT INTO @my_schema.@string_to_appendmodels( - analysis_id, - model_design_id, - researcher_id, - database_id, - hyper_param_search, - plp_model_file, - execution_date_time, - training_time, - intercept, - require_dense_matrix - ) VALUES - ('@analysis_id', - '@model_design_id', - @researcher_id, - @database_id, - '@hyper_param_search', - '@plp_model_file', - '@execution_date_time', - '@training_time', - @intercept, - '@require_dense_matrix' - )" - sql <- SqlRender::render( - sql, - my_schema = resultSchema, - analysis_id = analysisId, - model_design_id = modelDesignId, - researcher_id = researcherId, - database_id = databaseId, - hyper_param_search = hyperParamSearch, - plp_model_file = plpModelFile, - execution_date_time = executionDateTime, - training_time = 
trainingTime, - intercept = intercept, - require_dense_matrix = ifelse(requireDenseMatrix, 'T', 'F'), - string_to_append = stringAppendToTables - ) - sql <- SqlRender::translate( - sql, - targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema - ) - DatabaseConnector::executeSql(conn, sql) - - #getId of new - result <- checkTable( - conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'models', - columnNames = c( - 'analysis_id', - 'model_design_id', - 'researcher_id', - 'database_id', - 'hyper_param_search', - 'plp_model_file', - 'execution_date_time', - 'training_time', - 'intercept', - 'require_dense_matrix' - ), - values = c( - enc(analysisId), - modelDesignId, - researcherId, - databaseId, - enc(hyperParamSearch), - enc(plpModelFile), - enc(executionDateTime), - enc(trainingTime), - intercept, - ifelse(requireDenseMatrix, "'T'", "'F'") - ), - tempEmulationSchema = tempEmulationSchema - ) - - } - - return(result$modelId[1]) -} - -addResult <- function(conn, resultSchema, targetDialect, - stringAppendToTables = '', - modelId, - researcherId, - databaseId, - targetId, - outcomeId, - tarId, - restrictPlpDataSettingId, - populationSettingId, - executionDateTime, - plpVersion, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ - - result <- checkTable(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'results', - columnNames = c('model_id', - 'researcher_id', - 'database_id', - 'target_id', - 'outcome_id', - 'tar_id', - 'plp_data_setting_id', - 'population_setting_id', - 'execution_date_time', - 'plp_version'), - values = c(modelId, - researcherId, - databaseId, - targetId, - outcomeId, - tarId, - restrictPlpDataSettingId, - populationSettingId, - enc(executionDateTime), - enc(plpVersion)), - tempEmulationSchema = tempEmulationSchema - ) - - 
if(nrow(result)==0){ - # model - sql <- "INSERT INTO @my_schema.@string_to_appendresults ( - model_id, - researcher_id, - database_id, - target_id, - outcome_id, - tar_id, - plp_data_setting_id, - population_setting_id, - execution_date_time, - plp_version - ) - VALUES (@model_id, @researcher_id, @database_id, @target_id, @outcome_id, @tar_id, - @plp_data_setting_id, @population_setting_id, '@execution_date_time', '@plp_version')" - sql <- SqlRender::render(sql, - my_schema = resultSchema, - model_id = modelId, - researcher_id = researcherId, - database_id = databaseId, - target_id = targetId, - outcome_id = outcomeId, - tar_id = tarId, - plp_data_setting_id = restrictPlpDataSettingId, - population_setting_id = populationSettingId, - execution_date_time = executionDateTime, - plp_version = plpVersion, - string_to_append = stringAppendToTables) - sql <- SqlRender::translate(sql, targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - DatabaseConnector::executeSql(conn, sql) - - #getId of new - result <- checkTable(conn = conn, - resultSchema = resultSchema, - stringAppendToTables = stringAppendToTables, - targetDialect = targetDialect, - tableName = 'results', - columnNames = c('model_id', - 'researcher_id', - 'database_id', - 'target_id', - 'outcome_id', - 'tar_id', - 'plp_data_setting_id', - 'population_setting_id', - 'execution_date_time', - 'plp_version'), - values = c(modelId, - researcherId, - databaseId, - targetId, - outcomeId, - tarId, - restrictPlpDataSettingId, - populationSettingId, - enc(executionDateTime), - enc(plpVersion)), - tempEmulationSchema = tempEmulationSchema - ) - - } - - return(result$resultId[1]) -} - -# attrition -addAttrition <- function( - conn, resultSchema, targetDialect, - stringAppendToTables = '', - resultId, - attrition, - overWriteIfExists = T, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ - - value <- attrition - if(is.null(value)){ - return(NULL) - } - - # edit names - firstLower <- 
function(x) { - substr(x, 1, 1) <- tolower(substr(x, 1, 1)) - return(x) - } - colnames(value) <- sapply(colnames(value), firstLower ) - - value$resultId <- resultId - - # get column names and check all present in object - columnNames <- getColumnNames(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - tableName = paste0(stringAppendToTables,'attrition'), - tempEmulationSchema = tempEmulationSchema) - isValid <- sum(colnames(value)%in%columnNames) == length(columnNames) - - exists <- checkResultExists(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - tableName = paste0(stringAppendToTables,'attrition'), - resultId = resultId, - tempEmulationSchema = tempEmulationSchema) - - if(isValid && (!exists || overWriteIfExists)){ - - # REMOVE existing result - if(exists){ - sql <- "delete from @result_schema.@table_name where result_id = @result_id;" - sql <- SqlRender::render(sql, - result_id=resultId, - result_schema = resultSchema, - table_name = paste0(stringAppendToTables,'attrition')) - sql <- SqlRender::translate(sql, - targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - DatabaseConnector::executeSql(conn, sql) - } - - # add - ParallelLogger::logInfo(paste0('Inserting attrition for result ',resultId)) - DatabaseConnector::insertTable(connection = conn, - databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables,'attrition'), - data = value[,columnNames], - dropTableIfExists = F, createTable = F, tempTable = F, - bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T, - tempEmulationSchema = tempEmulationSchema) - } - - return(invisible(NULL)) -} - - -# evals -addEvaluation <- function(conn, resultSchema, targetDialect, - stringAppendToTables = '', - resultId, - performanceEvaluation, - overWriteIfExists = T, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ - - ParallelLogger::logInfo('Adding PredictionDistribution') - 
tryCatch({addPredictionDistribution(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect, - stringAppendToTables = stringAppendToTables, - resultId = resultId, - performanceEvaluation = performanceEvaluation, - overWriteIfExists = overWriteIfExists, - tempEmulationSchema = tempEmulationSchema)}, - error = function(e){ParallelLogger::logError(e);}) - - ParallelLogger::logInfo('Adding ThresholdSummary') - tryCatch({addThresholdSummary(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect, - stringAppendToTables = stringAppendToTables, - resultId = resultId, - performanceEvaluation = performanceEvaluation, - overWriteIfExists = overWriteIfExists, - tempEmulationSchema = tempEmulationSchema)}, - error = function(e){ParallelLogger::logError(e);}) - - ParallelLogger::logInfo('Adding EvaluationStatistics') - tryCatch({addEvaluationStatistics(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect, - stringAppendToTables = stringAppendToTables, - resultId = resultId, - performanceEvaluation = performanceEvaluation, - overWriteIfExists = overWriteIfExists, - tempEmulationSchema = tempEmulationSchema)}, - error = function(e){ParallelLogger::logError(e);}) - - ParallelLogger::logInfo('Adding CalibrationSummary') - tryCatch({addCalibrationSummary(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect, - stringAppendToTables = stringAppendToTables, - resultId = resultId, - performanceEvaluation = performanceEvaluation, - overWriteIfExists = overWriteIfExists, - tempEmulationSchema = tempEmulationSchema)}, - error = function(e){ParallelLogger::logError(e);}) - - ParallelLogger::logInfo('Adding DemographicSummary') - tryCatch({addDemographicSummary(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect, - stringAppendToTables = stringAppendToTables, - resultId = resultId, - performanceEvaluation = performanceEvaluation, - overWriteIfExists = overWriteIfExists, - tempEmulationSchema = 
tempEmulationSchema)}, - error = function(e){ParallelLogger::logError(e);}) - - return(invisible(NULL)) - -} - -addPredictionDistribution <- function(conn, resultSchema, targetDialect, - stringAppendToTables = '', - resultId, - performanceEvaluation, - overWriteIfExists = T, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ - - value <- performanceEvaluation$predictionDistribution - if(is.null(value)){ - return(NULL) - } - - # edit names - firstLower <- function(x) { - substr(x, 1, 1) <- tolower(substr(x, 1, 1)) - return(x) - } - colnames(value) <- sapply(colnames(value), firstLower ) - if(sum(colnames(value)=='class')>0){ - colnames(value)[colnames(value)=='class'] <- 'classLabel' - } - - value$resultId <- resultId - - # get column names and check all present in object - columnNames <- getColumnNames(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - tableName = paste0(stringAppendToTables,'prediction_distribution'), - tempEmulationSchema = tempEmulationSchema) - isValid <- sum(colnames(value)%in%columnNames) == length(columnNames) - - exists <- checkResultExists(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - tableName = paste0(stringAppendToTables,'prediction_distribution'), - resultId = resultId, - tempEmulationSchema = tempEmulationSchema) - - if(isValid && (!exists || overWriteIfExists)){ - - # REMOVE existing result - if(exists){ - sql <- "delete from @result_schema.@table_name where result_id = @result_id;" - sql <- SqlRender::render(sql, - result_id=resultId, - result_schema = resultSchema, - table_name = paste0(stringAppendToTables,'prediction_distribution')) - sql <- SqlRender::translate(sql, - targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - DatabaseConnector::executeSql(conn, sql) - } - - # add - ParallelLogger::logInfo(paste0('Inserting predictionDistribution for result ',resultId)) - DatabaseConnector::insertTable(connection = conn, - 
databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables,'prediction_distribution'), - data = value[,columnNames], - dropTableIfExists = F, createTable = F, tempTable = F, - bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T, - tempEmulationSchema = tempEmulationSchema) - } - - return(invisible(NULL)) -} - -addThresholdSummary <- function(conn, resultSchema, targetDialect, - stringAppendToTables = '', - resultId, - performanceEvaluation, - overWriteIfExists = T, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ - - - value <- performanceEvaluation$thresholdSummary - if(is.null(value)){ - return(NULL) - } - - # check numerical columns: - value <- cleanNum(value) - - # edit names - firstLower <- function(x) { - substr(x, 1, 1) <- tolower(substr(x, 1, 1)) - return(x) - } - colnames(value) <- sapply(colnames(value), firstLower ) - value$resultId <- resultId - - # get column names and check all present in object - columnNames <- getColumnNames(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - stringAppendToTables = stringAppendToTables, - tableName = 'threshold_summary', - tempEmulationSchema = tempEmulationSchema) - isValid <- sum(colnames(value)%in%columnNames) == length(columnNames) - - exists <- checkResultExists(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - tableName = paste0(stringAppendToTables,'threshold_summary'), - resultId = resultId, - tempEmulationSchema = tempEmulationSchema) - - if(isValid && (!exists || overWriteIfExists)){ - - # REMOVE existing result - if(exists){ - sql <- "delete from @result_schema.@table_name where result_id = @result_id;" - sql <- SqlRender::render(sql, - result_schema = resultSchema, - result_id = resultId, - table_name = paste0(stringAppendToTables,'threshold_summary')) - sql <- SqlRender::translate(sql, - targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - DatabaseConnector::executeSql(conn, sql) - } - - 
# add - ParallelLogger::logInfo(paste0('Inserting thresholdSummary for result ',resultId)) - DatabaseConnector::insertTable(connection = conn, - databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables,'threshold_summary'), - data = value[,columnNames], - dropTableIfExists = F, createTable = F, tempTable = F, - bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T, - tempEmulationSchema = tempEmulationSchema) - } - - return(invisible(NULL)) -} - - -addCalibrationSummary <- function(conn, resultSchema, targetDialect, - stringAppendToTables = '', - resultId, - performanceEvaluation, - overWriteIfExists = T, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ - - - value <- performanceEvaluation$calibrationSummary - if(is.null(value)){ - return(NULL) - } - - # check numerical columns: - value <- cleanNum(value) - - # edit names - firstLower <- function(x) { - substr(x, 1, 1) <- tolower(substr(x, 1, 1)) - return(x) - } - colnames(value) <- sapply(colnames(value), firstLower ) - - value$resultId <- resultId - - # get column names and check all present in object - columnNames <- getColumnNames(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - stringAppendToTables = stringAppendToTables, - tableName = 'calibration_summary', - tempEmulationSchema = tempEmulationSchema) - isValid <- sum(colnames(value)%in%columnNames) == length(columnNames) - - exists <- checkResultExists(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - tableName = paste0(stringAppendToTables,'calibration_summary'), - resultId = resultId, - tempEmulationSchema = tempEmulationSchema) - - if(isValid && (!exists || overWriteIfExists)){ - - # REMOVE existing result - if(exists){ - sql <- "delete from @result_schema.@table_name where result_id = @result_id;" - sql <- SqlRender::render(sql, - result_schema = resultSchema, - result_id=resultId, - table_name = paste0(stringAppendToTables,'calibration_summary')) - sql <- 
SqlRender::translate(sql, - targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - DatabaseConnector::executeSql(conn, sql) - } - - # add - ParallelLogger::logInfo(paste0('Inserting calibrationSummary for result ',resultId)) - DatabaseConnector::insertTable(connection = conn, - databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables,'calibration_summary'), - data = value[,columnNames], - dropTableIfExists = F, createTable = F, tempTable = F, - bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T, - tempEmulationSchema = tempEmulationSchema) - } - - return(invisible(NULL)) -} - -addEvaluationStatistics <- function(conn, resultSchema, targetDialect, - stringAppendToTables = '', - resultId, - performanceEvaluation, - overWriteIfExists = T, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ - - - value <- data.frame( - evaluation = unlist(performanceEvaluation$evaluationStatistics$evaluation), - metric = unlist(performanceEvaluation$evaluationStatistics$metric), - value = as.numeric(unlist(performanceEvaluation$evaluationStatistics$value)) - ) - - if(is.null(value)){ - return(NULL) - } - - # edit names - firstLower <- function(x) { - substr(x, 1, 1) <- tolower(substr(x, 1, 1)) - return(x) - } - colnames(value) <- sapply(colnames(value), firstLower ) - value$resultId <- resultId - - # get column names and check all present in object - columnNames <- getColumnNames(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - stringAppendToTables = stringAppendToTables, - tableName = 'evaluation_statistics', - tempEmulationSchema = tempEmulationSchema) - isValid <- sum(colnames(value)%in%columnNames) == length(columnNames) - - exists <- checkResultExists(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - tableName = paste0(stringAppendToTables,'evaluation_statistics'), - resultId = resultId, - tempEmulationSchema = tempEmulationSchema) - - if(isValid && (!exists || 
overWriteIfExists)){ - - # REMOVE existing result - if(exists){ - sql <- "delete from @result_schema.@table_name where result_id = @result_id;" - sql <- SqlRender::render(sql, - result_schema = resultSchema, - result_id = resultId, - table_name = paste0(stringAppendToTables,'evaluation_statistics')) - sql <- SqlRender::translate(sql, - targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - DatabaseConnector::executeSql(conn, sql) - } - - # add - ParallelLogger::logInfo(paste0('Inserting evaluationSummary for result ',resultId)) - DatabaseConnector::insertTable(connection = conn, - databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables,'evaluation_statistics'), - data = value[,columnNames], - dropTableIfExists = F, createTable = F, tempTable = F, - bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T, - tempEmulationSchema = tempEmulationSchema) - } - - return(invisible(NULL)) -} - -addDemographicSummary <- function(conn, resultSchema, targetDialect, - stringAppendToTables = '', - resultId, - performanceEvaluation, - overWriteIfExists = T, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ - - - value <- performanceEvaluation$demographicSummary - if(is.null(value)){ - return(NULL) - } - - # edit names - firstLower <- function(x) { - substr(x, 1, 1) <- tolower(substr(x, 1, 1)) - return(x) - } - colnames(value) <- sapply(colnames(value), firstLower ) - #if(sum(colnames(value)=="p50PredictedProbability")>0){ - # colnames(value)[colnames(value)=="p50PredictedProbability"] <- 'medianPredictedProbability' - #} - - value$resultId <- resultId - - # get column names and check all present in object - columnNames <- getColumnNames(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - stringAppendToTables = stringAppendToTables, - tableName = 'demographic_summary', - tempEmulationSchema = tempEmulationSchema) - isValid <- sum(colnames(value)%in%columnNames) == length(columnNames) - - exists <- 
checkResultExists(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - tableName = paste0(stringAppendToTables,'demographic_summary'), - resultId = resultId, - tempEmulationSchema = tempEmulationSchema) - - if(isValid && (!exists || overWriteIfExists)){ - - # REMOVE existing result - if(exists){ - sql <- "delete from @result_schema.@table_name where result_id = @result_id;" - sql <- SqlRender::render(sql, - result_schema = resultSchema, - result_id = resultId, - table_name = paste0(stringAppendToTables,'demographic_summary')) - sql <- SqlRender::translate(sql, - targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - DatabaseConnector::executeSql(conn, sql) - } - - # add - ParallelLogger::logInfo(paste0('Inserting demographicSummary for result ',resultId)) - DatabaseConnector::insertTable(connection = conn, - databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables,'demographic_summary'), - data = value[,columnNames], - dropTableIfExists = F, createTable = F, tempTable = F, - bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T, - tempEmulationSchema = tempEmulationSchema) - } - - return(invisible(NULL)) -} - -addCovariateSummary <- function(conn, resultSchema, targetDialect, - stringAppendToTables = '', - resultId, - covariateSummary, - restrictToIncluded = T, - overWriteIfExists = T, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ - - - value <- covariateSummary - if(is.null(value)){ - return(NULL) - } - - # edit names - firstLower <- function(x) { - substr(x, 1, 1) <- tolower(substr(x, 1, 1)) - return(x) - } - colnames(value) <- sapply(colnames(value), firstLower ) - value$resultId <- resultId - # remove _ from names - colnames(value) <- gsub('_','', colnames(value)) - - if(restrictToIncluded){ - ParallelLogger::logInfo('Restricting to covariates included in model') - value <- value[value$covariateValue!=0 & !is.na(value$covariateValue),] - } - - # get column names and check 
all present in object - columnNames <- getColumnNames(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - stringAppendToTables = stringAppendToTables, - tableName = 'covariate_summary', - tempEmulationSchema = tempEmulationSchema) - isValid <- sum(colnames(value)%in%columnNames) == length(columnNames) - - exists <- checkResultExists(conn = conn, - resultSchema = resultSchema, - targetDialect = targetDialect, - tableName = paste0(stringAppendToTables,'covariate_summary'), - resultId = resultId, - tempEmulationSchema = tempEmulationSchema) - - if(isValid && (!exists || overWriteIfExists)){ - - # REMOVE existing result - if(exists){ - ParallelLogger::logTrace('Removing existing covariateSummary') - sql <- "delete from @result_schema.@table_name where result_id = @result_id;" - sql <- SqlRender::render(sql, - result_schema = resultSchema, - result_id = resultId, - table_name = paste0(stringAppendToTables,'covariate_summary')) - sql <- SqlRender::translate(sql, - targetDialect = targetDialect, - tempEmulationSchema = tempEmulationSchema) - DatabaseConnector::executeSql(conn, sql) - } - - # add - ParallelLogger::logInfo(paste0('Inserting covariateSummary for result ',resultId)) - DatabaseConnector::insertTable(connection = conn, - databaseSchema = resultSchema, - tableName = paste0(stringAppendToTables,'covariate_summary'), - data = value[,columnNames], - dropTableIfExists = F, createTable = F, tempTable = F, - bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T, - tempEmulationSchema = tempEmulationSchema) - } - - return(invisible(NULL)) -} - diff --git a/R/uploadToDatabase.R b/R/uploadToDatabase.R new file mode 100644 index 000000000..9f94f1464 --- /dev/null +++ b/R/uploadToDatabase.R @@ -0,0 +1,1370 @@ +# @file UploadPlpDbResults.R +# +# Copyright 2021 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not 
use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +insertRunPlpToSqlite <- function( + runPlp, + externalValidatePlp = NULL, + diagnosePlp = NULL + ){ + + sqliteLocation <- tempdir() + + ensure_installed('RSQLite') + + # create sqlite database + connectionDetails <- DatabaseConnector::createConnectionDetails( + dbms = 'sqlite', + server = file.path(sqliteLocation,'databaseFile.sqlite') + ) + + createPlpResultTables( + connectionDetails = connectionDetails, + targetDialect = 'sqlite', + resultSchema = 'main', + deleteTables = T, + createTables = T, + tablePrefix = '' + ) + + #cohortDefinitions <- data.frame( + # cohortId = c(runPlp$model$modelDesign$targetId, runPlp$model$modelDesign$outcomeId), + # cohortName = c('Target', 'Outcome'), + # json = c('{}', '{}') + # ) + + addRunPlpToDatabase( + runPlp = runPlp, + connectionDetails = connectionDetails, + databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'), + cohortDefinitions = NULL,#cohortDefinitions, + databaseList = NULL, + modelSaveLocation = sqliteLocation + ) + + # add validation results if entered + if(!is.null(externalValidatePlp)){ + if(inherits(x = externalValidatePlp, what = 'list')){ + for(i in 1:length(externalValidatePlp)){ + tryCatch( + { + addRunPlpToDatabase( + runPlp = externalValidatePlp[[i]], + connectionDetails = connectionDetails, + databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'), + cohortDefinitions = NULL,#cohortDefinitions, + databaseList = NULL, + modelSaveLocation = sqliteLocation + ) + }, error = 
function(e){ParallelLogger::logError(e)} + ) + } + } + } + + # add diagnosis results if entered + if(!is.null(diagnosePlp)){ + tryCatch( + { + addDiagnosePlpToDatabase( + diagnosePlp = diagnosePlp, + connectionDetails = connectionDetails, + databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'), + cohortDefinitions = NULL,#cohortDefinitions, + databaseList = NULL + ) + }, error = function(e){ParallelLogger::logError(e)} + ) + } + + + return(file.path(sqliteLocation,'databaseFile.sqlite')) +} + +#' Create sqlite database with the results +#' @description +#' This function create an sqlite database with the PLP result schema and inserts all results +#' +#' @details +#' This function can be used upload PatientLevelPrediction results into an sqlite database +#' +#' @param resultLocation (string) location of directory where the main package results were saved +#' @param cohortDefinitions A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet() +#' @param databaseList A list created by \code{createDatabaseList} to specify the databases +#' @param sqliteLocation (string) location of directory where the sqlite database will be saved +#' +#' @return +#' Returns the location of the sqlite database file +#' +#' @export +insertResultsToSqlite <- function( + resultLocation, + cohortDefinitions, + databaseList = NULL, + sqliteLocation = file.path(resultLocation, 'sqlite') +){ + + if(!dir.exists(sqliteLocation)){ + dir.create(sqliteLocation, recursive = T) + } + + ensure_installed('RSQLite') + + # create sqlite database + connectionDetails <- DatabaseConnector::createConnectionDetails( + dbms = 'sqlite', + server = file.path(sqliteLocation,'databaseFile.sqlite') + ) + + # create tables if they dont exist + createPlpResultTables( + connectionDetails = connectionDetails, + targetDialect = 'sqlite', + resultSchema = 'main', + deleteTables = T, + createTables = T, + tablePrefix = '' + ) + + # run insert models + 
addMultipleRunPlpToDatabase( + connectionDetails = connectionDetails, + databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'), + cohortDefinitions = cohortDefinitions, + databaseList = databaseList, + resultLocation = resultLocation, + modelSaveLocation = sqliteLocation + ) + + # run insert diagnosis + addMultipleDiagnosePlpToDatabase( + connectionDetails = connectionDetails, + databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'), + cohortDefinitions = cohortDefinitions, + databaseList = databaseList, + resultLocation = resultLocation + ) + + return(file.path(sqliteLocation,'databaseFile.sqlite')) +} + +#' Create the results tables to store PatientLevelPrediction models and results into a database +#' @description +#' This function executes a large set of SQL statements to create tables that can store models and results +#' +#' @details +#' This function can be used to create (or delete) PatientLevelPrediction result tables +#' +#' @param connectionDetails The database connection details +#' @param targetDialect The database management system being used +#' @param resultSchema The name of the database schema that the result tables will be created. +#' @param deleteTables If true any existing tables matching the PatientLevelPrediction result tables names will be deleted +#' @param createTables If true the PatientLevelPrediction result tables will be created +#' @param tablePrefix A string that appends to the PatientLevelPrediction result tables +#' @param tempEmulationSchema The temp schema used when the database management system is oracle +#' +#' @param testFile (used for testing) The location of an sql file with the table creation code +#' +#' @return +#' Returns NULL but creates the required tables into the specified database schema(s). 
+#' +#' @export +createPlpResultTables <- function( + connectionDetails, + targetDialect = 'postgresql', + resultSchema, + deleteTables = T, + createTables = T, + tablePrefix = '', + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), + testFile = NULL +){ + + conn <- DatabaseConnector::connect(connectionDetails = connectionDetails) + on.exit(DatabaseConnector::disconnect(conn)) + + tablesExists <- sum(tolower(getPlpResultTables()) %in% tolower(DatabaseConnector::getTableNames(connection = conn, databaseSchema = resultSchema))) + tablesExists <- tablesExists == length(getPlpResultTables()) + + if(!tablesExists){ + ParallelLogger::logInfo('All or some PLP result tables do not exist, tables being recreated') + if(deleteTables){ + ParallelLogger::logInfo('Deleting existing tables') + + tableNames <- getPlpResultTables() + + deleteTables( + conn = conn, + databaseSchema = resultSchema, + targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema, + tableNames = tableNames, + tablePrefix = tablePrefix + ) + + } + + + if(createTables){ + ParallelLogger::logInfo('Creating PLP results tables') + + if(tablePrefix != ''){ + tablePrefix <- paste0(toupper(gsub('_','',gsub(' ','', tablePrefix))), '_') + } + + pathToSql <- system.file( + paste("sql/", targetDialect, + sep = ""), + "PlpResultTables.sql", + package = "PatientLevelPrediction" + ) + + if(!file.exists(pathToSql)){ + # if no dbms specific file use sql_server + pathToSql <- system.file( + paste("sql/", 'sql_server', + sep = ""), + "PlpResultTables.sql", + package = "PatientLevelPrediction" + ) + } + + sql <- readChar(pathToSql, file.info(pathToSql)$size) + renderedSql <- SqlRender::render( + sql = sql[1], + my_schema = resultSchema, + string_to_append = tablePrefix + ) + renderedSql <- SqlRender::translate( + sql = renderedSql, + targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema + ) + + DatabaseConnector::executeSql(conn, renderedSql) + } + + } else{ + 
ParallelLogger::logInfo('PLP result tables already exist') + } + + # then migrate + ParallelLogger::logInfo('PLP result migrration being applied') + migrateDataModel( + connectionDetails = connectionDetails, # input is connection + databaseSchema = resultSchema, + tablePrefix = tablePrefix + ) + +} + +#' Populate the PatientLevelPrediction results tables +#' @description +#' This function formats and uploads results that have been generated via an ATLAS prediction package into a database +#' +#' @details +#' This function can be used upload PatientLevelPrediction results into a database +#' +#' @param connectionDetails A connection details created by using the +#' function \code{createConnectionDetails} in the +#' \code{DatabaseConnector} package. +#' @param databaseSchemaSettings A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables +#' @param cohortDefinitions A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet() +#' @param databaseList (Optional) A list created by \code{createDatabaseList} to specify the databases +#' @param resultLocation (string) location of directory where the main package results were saved +#' @param resultLocationVector (only used when resultLocation is missing) a vector of locations with development or validation results +#' @param modelSaveLocation The location of the file system for saving the models in a subdirectory +#' +#' @return +#' Returns NULL but uploads all the results in resultLocation to the PatientLevelPrediction result tables in resultSchema +#' +#' @export +addMultipleRunPlpToDatabase <- function( + connectionDetails, + databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'), + cohortDefinitions, + databaseList = NULL, + resultLocation = NULL, + resultLocationVector, + modelSaveLocation +){ + conn <- DatabaseConnector::connect(connectionDetails = connectionDetails) + on.exit(DatabaseConnector::disconnect(conn)) + + # 
for each development result add it to the database: + + if(missing(resultLocationVector)){ + resultLocationVector <- getResultLocations(resultLocation) + } + + if(length(resultLocationVector) == 0){ + ParallelLogger::logInfo('No results found') + return(NULL) + } + + for(runPlpLocation in resultLocationVector){ + ParallelLogger::logInfo(paste0('Inserting result @ ', runPlpLocation, ' into database')) + + # TODO edit csv here + runPlp <- tryCatch( + {PatientLevelPrediction::loadPlpResult(runPlpLocation)}, + error = function(e){ParallelLogger::logInfo(e);return(NULL)} + ) + + if(!is.null(runPlp)){ + ParallelLogger::logInfo('result loaded') + + # Add runPlp to the database + tryCatch( + {addRunPlpToDatabase( + runPlp = runPlp, + connectionDetails = connectionDetails, + databaseSchemaSettings = databaseSchemaSettings, + cohortDefinitions = cohortDefinitions, + databaseList = databaseList, + modelSaveLocation = modelSaveLocation + )}, error = function(e){ + ParallelLogger::logInfo('result upload failed: '); + ParallelLogger::logInfo(e) + } + ) + + } #model not null + + } # per model + +} #end funct + + +#' Create the PatientLevelPrediction database result schema settings +#' @description +#' This function specifies where the results schema is and lets you pick a different schema for the cohorts and databases +#' +#' @details +#' This function can be used to specify the database settings used to upload PatientLevelPrediction results into a database +#' +#' @param resultSchema (string) The name of the database schema with the result tables. +#' @param tablePrefix (string) A string that appends to the PatientLevelPrediction result tables +#' @param targetDialect (string) The database management system being used +#' @param tempEmulationSchema (string) The temp schema used when the database management system is oracle +#' @param cohortDefinitionSchema (string) The name of the database schema with the cohort definition tables (defaults to resultSchema). 
+#' @param tablePrefixCohortDefinitionTables (string) A string that appends to the cohort definition tables +#' @param databaseDefinitionSchema (string) The name of the database schema with the database definition tables (defaults to resultSchema). +#' @param tablePrefixDatabaseDefinitionTables (string) A string that appends to the database definition tables +#' +#' @return +#' Returns a list of class 'plpDatabaseResultSchema' with all the database settings +#' +#' @export +createDatabaseSchemaSettings <- function( + resultSchema = 'main', + tablePrefix = '', + targetDialect = 'sqlite', + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), + cohortDefinitionSchema = resultSchema, + tablePrefixCohortDefinitionTables = tablePrefix, + databaseDefinitionSchema = resultSchema, + tablePrefixDatabaseDefinitionTables = tablePrefix +){ + + if(missing(resultSchema)){ + stop('resultSchema required') + } + if(!inherits(x = resultSchema, what = "character")){ + stop('resultSchema must be a string') + } + + if(tablePrefix != ''){ + tablePrefix <- paste0(toupper(gsub('_','',gsub(' ','', tablePrefix))), '_') + } + if(tablePrefixCohortDefinitionTables != ''){ + tablePrefixCohortDefinitionTables <- paste0(toupper(gsub('_','',gsub(' ','', tablePrefixCohortDefinitionTables))), '_') + } + if(tablePrefixDatabaseDefinitionTables != ''){ + tablePrefixDatabaseDefinitionTables <- paste0(toupper(gsub('_','',gsub(' ','', tablePrefixDatabaseDefinitionTables))), '_') + } + + result <- list( + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema, + cohortDefinitionSchema = cohortDefinitionSchema, # could be removed + tablePrefixCohortDefinitionTables = tablePrefixCohortDefinitionTables, # could be removed + databaseDefinitionSchema = databaseDefinitionSchema, # could be removed + tablePrefixDatabaseDefinitionTables = tablePrefixDatabaseDefinitionTables # could be removed + ) + + class(result) <- 
'plpDatabaseResultSchema' + return(result) +} + + +#' Create a list with the database details and database meta data entries +#' @description +#' This function creates a list with the database details and database meta data entries used in the study +#' +#' @details +#' This function is used when inserting database details into the PatientLevelPrediction database results schema +#' +#' @param cdmDatabaseSchemas (string vector) A vector of the cdmDatabaseSchemas used in the study - if the schemas are not unique per database please also specify databaseRefId +#' @param cdmDatabaseNames Sharable names for the databases +#' @param databaseRefIds (string vector) Unique database identifiers - what you specified as cdmDatabaseId in \code{PatientLevelPrediction::createDatabaseDetails()} when developing the models +#' +#' @return +#' Returns a data.frame with the database details +#' +#' @export +createDatabaseList <- function( + cdmDatabaseSchemas, + cdmDatabaseNames, + databaseRefIds = NULL +){ + if(missing(cdmDatabaseSchemas)){ + stop('Need to specify cdmDatabaseSchemas') + } + + if(is.null(databaseRefIds)){ + ParallelLogger::logInfo('No databaseRefId specified so using schema as unique database identifier') + databaseRefIds <- removeInvalidString(cdmDatabaseSchemas) + } + if(missing(cdmDatabaseNames)){ + cdmDatabaseNames <- removeInvalidString(cdmDatabaseSchemas) + } + + + result <- lapply( + 1:length(cdmDatabaseSchemas), + + function(i) list( + databaseDetails = list( + databaseMetaDataId = databaseRefIds[i] + ), + databaseMetaData = list( + databaseId = databaseRefIds[i], + cdmSourceName = cdmDatabaseSchemas[i], + cdmSourceAbbreviation = cdmDatabaseNames[i], + cdmHolder = '', # could get this from CDM_source inside runPlp in future + sourceDesciption = '', + sourceDocumentReference = '', + cdmEtlReference = '', + sourceReleaseDate = '', + cdmReleaseDate = '', + cdmVersion = '', + vocabularyVersion = '', + maxObsPeriodEndDate = '' + ) + ) + ) + + names(result) <- 
databaseRefIds #cdmDatabaseSchemas + # using id as schema may not be unique + # id uses schema if it is not set + + return(result) +} + + + +#' Function to add the run plp (development or validation) to database +#' @description +#' This function adds a runPlp or external validation result into a database +#' +#' @details +#' This function is used when inserting results into the PatientLevelPrediction database results schema +#' +#' @param runPlp An object of class \code{runPlp} or class \code{externalValidatePlp} +#' @param connectionDetails A connection details created by using the +#' function \code{createConnectionDetails} in the +#' \code{DatabaseConnector} package. +#' @param databaseSchemaSettings A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables +#' @param cohortDefinitions A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet() +#' @param modelSaveLocation The location of the directory that models will be saved to +#' @param databaseList (Optional) If you want to change the database name then used \code{createDatabaseList} to specify the database settings but use the same cdmDatabaseId was model development/validation +#' +#' @return +#' Returns a data.frame with the database details +#' +#' @export +addRunPlpToDatabase <- function( + runPlp, + connectionDetails, + databaseSchemaSettings, + cohortDefinitions, + modelSaveLocation, + databaseList = NULL +){ + + conn <- DatabaseConnector::connect(connectionDetails = connectionDetails) + on.exit(DatabaseConnector::disconnect(conn)) + + modelDesignId <- insertModelDesignInDatabase( + object = runPlp$model$modelDesign, + conn = conn, + databaseSchemaSettings = databaseSchemaSettings, + cohortDefinitions = cohortDefinitions + ) + + # Add model if runPlp + if(inherits(runPlp, 'runPlp')){ + includesModel <- T + developmentDatabase <- runPlp$model$trainDetails$developmentDatabase + validationDatabase <- 
runPlp$model$trainDetails$developmentDatabase + developmentDatabaseRefId <- runPlp$model$trainDetails$developmentDatabaseId + validationDatabaseRefId <- runPlp$model$trainDetails$developmentDatabaseId + + populationSettings <- runPlp$model$modelDesign$populationSettings + targetId <- runPlp$model$modelDesign$targetId + outcomeId <- runPlp$model$modelDesign$outcomeId + restrictPlpDataSettings <- runPlp$model$modelDesign$restrictPlpDataSettings + + modelDevelopment <- 1 #added + + attrition <- runPlp$model$trainDetails$attrition + + } else{ + includesModel <- F + developmentDatabase <- runPlp$model$validationDetails$developmentDatabase + validationDatabase <- runPlp$model$validationDetails$validationDatabase + developmentDatabaseRefId <- runPlp$model$validationDetails$developmentDatabaseId + validationDatabaseRefId <- runPlp$model$validationDetails$validationDatabaseId + + populationSettings <- runPlp$model$validationDetails$populationSettings + targetId <- runPlp$model$validationDetails$targetId + outcomeId <- runPlp$model$validationDetails$outcomeId + restrictPlpDataSettings <- runPlp$model$validationDetails$restrictPlpDataSettings + + modelDevelopment <- 0 #added + + attrition <- runPlp$model$validationDetails$attrition + + } + + # Add databases + developmentDatabaseId <- addDatabase( + conn = conn, + databaseSchemaSettings = databaseSchemaSettings, + databaseList = databaseList, + databaseSchema = developmentDatabase, + databaseId = developmentDatabaseRefId + ) + + validationDatabaseId <- addDatabase( + conn = conn, + databaseSchemaSettings = databaseSchemaSettings, + databaseList = databaseList, + databaseSchema = validationDatabase, + databaseId = validationDatabaseRefId + ) + + + # add nodel if the result contains it + if(includesModel){ + insertModelInDatabase( + model = runPlp$model, + conn = conn, + databaseSchemaSettings = databaseSchemaSettings, + databaseId = developmentDatabaseId, + modelDesignId = modelDesignId, + modelSaveLocation = modelSaveLocation 
+ ) + } + + # get validation settings + validationTarId <- addTar( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + startDay = populationSettings$riskWindowStart, + startAnchor = populationSettings$startAnchor, + endDay = populationSettings$riskWindowEnd, + endAnchor = populationSettings$endAnchor, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + + validationTargetId <- addCohort( + conn = conn, + resultSchema = databaseSchemaSettings$cohortDefinitionSchema, + targetDialect = databaseSchemaSettings$targetDialect, + cohortDefinition = getCohortDef(cohortDefinitions,targetId), + tablePrefix = databaseSchemaSettings$tablePrefixCohortDefinitionTables, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + + validationOutcomeId <- addCohort( + conn = conn, + resultSchema = databaseSchemaSettings$cohortDefinitionSchema, + targetDialect = databaseSchemaSettings$targetDialect, + cohortDefinition = getCohortDef(cohortDefinitions,outcomeId), + tablePrefix = databaseSchemaSettings$tablePrefixCohortDefinitionTables, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + + validationPopulationId <- addPopulationSetting( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + json = populationSettings, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + + validationPlpDataId <- addPlpDataSetting( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + json = restrictPlpDataSettings, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + + # Add performance + insertPerformanceInDatabase( + performanceEvaluation = 
runPlp$performanceEvaluation, + covariateSummary = runPlp$covariateSummary, + attrition = attrition, + executionDateTime = format(runPlp$executionSummary$ExecutionDateTime, format="%Y-%m-%d"), + plpVersion = runPlp$executionSummary$PackageVersion$packageVersion, + conn = conn, + databaseSchemaSettings = databaseSchemaSettings, + + modelDesignId = modelDesignId, + developmentDatabaseId = developmentDatabaseId, + + validationDatabaseId = validationDatabaseId, + validationTarId = validationTarId, + validationPopulationId= validationPopulationId, + validationPlpDataId = validationPlpDataId, + validationTargetId = validationTargetId, + validationOutcomeId = validationOutcomeId, + + modelDevelopment = modelDevelopment + ) + + return(invisible(NULL)) +} + + + +################### +# INSERT MODEL +#################### +insertModelInDatabase <- function( + model, + conn, + databaseSchemaSettings, + databaseId, + modelDesignId, + modelSaveLocation +){ + + # save the model to the file system + modelLocation <- file.path( + modelSaveLocation, + 'models', + paste0('folder-', modelDesignId, '-', databaseId) + ) + if(!dir.exists(modelLocation)){ + dir.create(modelLocation, recursive = T) + } + # savign all the model as preprocessing was too large + # for database + savePlpModel( + plpModel = model, + dirPath = modelLocation + ) + + #saveModelPart( + # model = model$model, + # savetype = attr(model, 'saveType'), + # dirPath = modelLocation + #) + + # need hyperParamSearch for shiny app but the other parts + # are too large to store into the database + trainDetails <- list(hyperParamSearch = model$trainDetails$hyperParamSearch) + + # create this function + modelId <- addModel( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + analysisId = ifelse( + is.null(model$trainDetails$analysisId), + 'missing', + model$trainDetails$analysisId + ), + modelDesignId = modelDesignId, + databaseId = databaseId, + 
modelType = model$trainDetails$modelName, + plpModelFile = modelLocation, # save the model to a location and add location here + trainDetails = as.character(ParallelLogger::convertSettingsToJson(trainDetails)), + preprocessing = "",#as.character(ParallelLogger::convertSettingsToJson(model$preprocessing)), + + executionDateTime = format(model$trainDetails$trainingDate, format="%Y-%m-%d"), + trainingTime = model$trainDetails$trainingTime, + intercept = ifelse(is.list(model$model) & attr(model, 'saveType') != 'xgboost', model$model$coefficients$betas[1], 0), # using the param useIntercept? + + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + ParallelLogger::logInfo( + paste0('modelId: ', modelId, + ' inserted for modelDesignId ', modelDesignId, + 'and databaseId ', databaseId) + ) + + return(invisible(modelId)) +} + +addModel <- function( + conn, + resultSchema, + targetDialect, + tablePrefix, + analysisId, + modelDesignId, + databaseId, + + modelType, + plpModelFile, + trainDetails, + preprocessing, + + executionDateTime, + trainingTime, + intercept, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") +){ + + if(is.null(analysisId)){ + stop('analysisId is null') + } + if(is.null(modelDesignId)){ + stop('modelDesignId is null') + } + if(is.null(databaseId)){ + stop('databaseId is null') + } + + if(is.null(plpModelFile)){ + stop('plpModelFile is null') + } + if(is.null(executionDateTime)){ + stop('executionDateTime is null') + } + if(is.null(intercept)){ + stop('intercept is null') + } + + # process json to make it ordered... 
+ # TODO + + result <- checkTable( + conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'models', + columnNames = c( + 'model_design_id', + 'database_id' + ), + values = c( + modelDesignId, + databaseId + ), + tempEmulationSchema = tempEmulationSchema + ) + + if(nrow(result)==0){ + # model + sql <- "INSERT INTO @my_schema.@string_to_appendmodels( + analysis_id, + model_design_id, + database_id, + model_type, + plp_model_file, + train_details, + preprocessing, + execution_date_time, + training_time, + intercept + ) VALUES + ('@analysis_id', + @model_design_id, + @database_id, + + '@model_type', + '@plp_model_file', + '@train_details', + '@preprocessing', + + '@execution_date_time', + '@training_time', + @intercept + )" + sql <- SqlRender::render( + sql, + my_schema = resultSchema, + analysis_id = analysisId, + model_design_id = modelDesignId, + database_id = databaseId, + + model_type = modelType, + plp_model_file = plpModelFile, + train_details = trainDetails, + preprocessing = preprocessing, + + execution_date_time = executionDateTime, + training_time = trainingTime, + intercept = intercept, + + string_to_append = tablePrefix + ) + sql <- SqlRender::translate( + sql, + targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema + ) + DatabaseConnector::executeSql(conn, sql) + + #getId of new + result <- checkTable( + conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'models', + columnNames = c( + 'model_design_id', + 'database_id' + ), + values = c( + modelDesignId, + databaseId + ), + tempEmulationSchema = tempEmulationSchema + ) + + } + + return(result$modelId[1]) +} + +#====================== +# Helpers +#====================== + +# get a vector with all the result table names +getPlpResultTables <- function(){ + return( + c( + "CALIBRATION_SUMMARY", + "COVARIATE_SUMMARY", + "DEMOGRAPHIC_SUMMARY", + 
"EVALUATION_STATISTICS", + "PREDICTION_DISTRIBUTION", + "THRESHOLD_SUMMARY", + + "ATTRITION", #new + + "DIAGNOSTIC_SUMMARY", + "DIAGNOSTIC_PARTICIPANTS", + "DIAGNOSTIC_PREDICTORS", + "DIAGNOSTIC_OUTCOMES", + "DIAGNOSTIC_DESIGNS", + + "DIAGNOSTICS", #new + "RECALIBRATIONS", #new + + "PERFORMANCES", + + "MODELS", + + "MODEL_DESIGNS", + + "MODEL_SETTINGS", + "COVARIATE_SETTINGS", + "POPULATION_SETTINGS", + "FEATURE_ENGINEERING_SETTINGS", + "SPLIT_SETTINGS", + "PLP_DATA_SETTINGS", #new + "SAMPLE_SETTINGS", + "TIDY_COVARIATES_SETTINGS", #new + "TARS", + + "DATABASE_DETAILS", + "DATABASE_META_DATA", + "COHORT_DEFINITION", + "COHORTS" + ) + ) +} + +getResultLocations <- function(resultLocation){ + # get the model locations... + + resultLocs <- dir( + resultLocation, + pattern = 'Analysis_', + full.names = T + ) + # automatically find Results folder, to handle both plpResult/ and validationResult/ + resultLocs <- file.path(resultLocs, dir(resultLocs, pattern='Result')) + + + if(dir.exists(file.path(resultLocation, 'Validation'))){ + validationDatabases <- dir(file.path(resultLocation, 'Validation')) + + valLocs <- dir( + unlist( + lapply( + validationDatabases, + function(x) dir(file.path(resultLocation, 'Validation', x), + pattern = 'Analysis_', + full.names = T + ) + ) + ), + full.names = T + ) + + resultLocs <- c(resultLocs, valLocs) + + } + return(resultLocs) + +} + +deleteTables <- function( + conn, + databaseSchema, + targetDialect, + tempEmulationSchema, + tableNames, + tablePrefix +){ + + if(tablePrefix != ''){ + tableNames <- tolower(paste0(gsub('_','',gsub(' ','', tablePrefix)), '_', tableNames)) + } + + alltables <- tolower(DatabaseConnector::getTableNames( + connection = conn, + databaseSchema = databaseSchema + )) + + + for(tb in tableNames){ + if(tb %in% alltables){ + + if(targetDialect != 'sqlite'){ + sql <- 'TRUNCATE TABLE @my_schema.@table' + sql <- SqlRender::render( + sql, + my_schema = databaseSchema, + table = tb + ) + sql <- SqlRender::translate( + 
sql, + targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema + ) + DatabaseConnector::executeSql(conn, sql) + } else{ + sql <- 'DELETE FROM @my_schema.@table' + sql <- SqlRender::render( + sql, + my_schema = databaseSchema, + table = tb + ) + sql <- SqlRender::translate( + sql, + targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema + ) + DatabaseConnector::executeSql(conn, sql) + } + sql <- 'DROP TABLE @my_schema.@table' + sql <- SqlRender::render( + sql, + my_schema = databaseSchema, + table = tb + ) + sql <- SqlRender::translate( + sql, targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema + ) + DatabaseConnector::executeSql(conn, sql) + } + + } + +} + + +## Template Helpers + +enc <- function(x){ + return(paste0("'", x, "'")) +} + +cleanNum <- function(x){ + types <- unlist(lapply(1:ncol(x), function(i) class(x[,i]))) + + ids <- which(types%in% c("numeric", "integer" )) + + for(id in ids){ + okVals <- is.finite(x[,id]) + + if(sum(okVals)!=length(okVals)){ + x[!okVals,id] <- NA + } + + } + return(x) +} + +checkTable <- function(conn, + resultSchema, + tablePrefix = '', + targetDialect, + tableName, + columnNames, + values, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") +){ + + vals <- paste0(paste0(columnNames," = ", values), collapse = " and ") + + sql <- "SELECT * from @my_schema.@string_to_append@table where @input_vals;" + sql <- SqlRender::render(sql, + my_schema = resultSchema, + table = tableName, + input_vals = vals, + string_to_append = tablePrefix) + sql <- SqlRender::translate(sql, targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema) + result <- DatabaseConnector::querySql(conn, sql, snakeCaseToCamelCase = T) + + return(result) +} + + +checkJson <- function(conn, + resultSchema, + tablePrefix = '', + targetDialect, + tableName, + jsonColumnName, + id, + json, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") +){ + + sql <- "SELECT * 
from @my_schema.@string_to_append@table;" + sql <- SqlRender::render(sql, + my_schema = resultSchema, + table = tableName, + string_to_append = tablePrefix) + sql <- SqlRender::translate(sql, targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema) + result <- DatabaseConnector::querySql(conn, sql, snakeCaseToCamelCase = T) + + resultId <- NULL + if(nrow(result)>0){ + colId <- result[,jsonColumnName] == json + if(sum(colId)>0){ + resultId <- result[colId,id][1] + } + } + + return(resultId) +} + + +getCohortDef <- function(cohortDefinitions, cohortId){ + if(!is.null(cohortDefinitions)){ + if(sum(cohortDefinitions$cohortId == cohortId) > 0){ + return(cohortDefinitions[cohortDefinitions$cohortId == cohortId, ]) + } + } + return( + data.frame( + cohortId = cohortId, + cohortName = paste0('Cohort: ', cohortId), + json = '{}' + ) + ) +} + + +# adds json from package unless json is specified +addCohort <- function( + conn, + resultSchema, + targetDialect, + tablePrefix = '', + cohortDefinition, # this is the R data.frame of the cohortDefinition + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") +){ + + + # make sure the json has been converted + json <- cohortDefinition$json + if(!inherits(x = json , what = 'character')){ + ParallelLogger::logInfo('converting json to character') + json <- as.character(json) # now convert to character + } + + # reduce the size to save + if(!targetDialect %in% c('sqlite', 'postgres')){ + json <- substr(json, 1, 4000) # TESTING - FIX THIS [TODO] + } + + #check whether cohort already in COHORT_DEFINITION table: + result <- checkTable(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'cohort_definition', + columnNames = c('cohort_name'), + values = c(paste0("'",gsub('\'', '', cohortDefinition$cohortName),"'")), + tempEmulationSchema = tempEmulationSchema + ) + + addNew <- F + if(nrow(result)>0){ + addNew <- json %in% result$json + 
ParallelLogger::logInfo(paste0('json in jsons:', addNew)) + } + + if(addNew){ + cohortDefinitionId <- result$cohortDefinitionId[result$json %in% json] + ParallelLogger::logInfo(paste0('Cohort ',gsub('\'', '', cohortDefinition$cohortName),' exists in cohort_definition with cohort id', result$cohortDefinitionId[result$json %in% json])) + } else{ + ParallelLogger::logInfo(paste0('Adding cohort ',gsub('\'', '', cohortDefinition$cohortName))) + + data <- data.frame( + cohortName = gsub('\'', '', cohortDefinition$cohortName), + cohortDefinitionId = cohortDefinition$cohortId, + json = json + ) + DatabaseConnector::insertTable( + connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix, 'cohort_definition'), + data = data, + dropTableIfExists = F, + createTable = F, + tempTable = F, + progressBar = T, + camelCaseToSnakeCase = T, + tempEmulationSchema = tempEmulationSchema + ) + + # now check and get id + result <- checkTable( + conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'cohort_definition', + columnNames = c('cohort_name', 'cohort_definition_id'), + values = c(paste0("'",gsub('\'', '', cohortDefinition$cohortName),"'"), cohortDefinition$cohortId), + tempEmulationSchema = tempEmulationSchema + ) + + jsonInd <- result$json %in% json + cohortDefinitionId <- result$cohortDefinitionId[jsonInd] + } + + # now add to cohorts table + result <- checkTable(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'cohorts', + columnNames = c('cohort_definition_id','cohort_name'), + values = c(cohortDefinitionId, paste0("'",gsub('\'', '', cohortDefinition$cohortName),"'")), + tempEmulationSchema = tempEmulationSchema + ) + + if(nrow(result)>0){ + ParallelLogger::logInfo(paste0('Cohort ',gsub('\'', '', cohortDefinition$cohortName),' exists in cohorts with cohort id', result$cohortId)) + } else{ + 
ParallelLogger::logInfo(paste0('Adding cohort ',gsub('\'', '', cohortDefinition$cohortName))) + + data <- data.frame( + cohortDefinitionId = cohortDefinitionId, + cohortName = gsub('\'', '', cohortDefinition$cohortName) + ) + DatabaseConnector::insertTable( + connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix, 'cohorts'), + data = data, + dropTableIfExists = F, + createTable = F, + tempTable = F, + progressBar = T, + camelCaseToSnakeCase = T, + tempEmulationSchema = tempEmulationSchema + ) + + # now check and get id + result <- checkTable(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'cohorts', + columnNames = c('cohort_definition_id','cohort_name'), + values = c(cohortDefinitionId, paste0("'",gsub('\'', '', cohortDefinition$cohortName),"'")), + tempEmulationSchema = tempEmulationSchema + ) + } + + return(result$cohortId[1]) +} + + + +addDatabase <- function( + conn, + databaseSchemaSettings, + databaseList = NULL, # list with the database details + databaseId = NULL, # the database id + databaseSchema # the database schema +){ + + if(is.null(databaseId)){ + databaseId <- removeInvalidString(databaseSchema) + } + + # get the database tables for the databaseId + if(is.null(databaseList)){ + databaseDataFrames <- createDatabaseList(cdmDatabaseSchemas = databaseSchema, databaseRefIds = databaseId)[[1]] + } else{ + if(databaseId %in% names(databaseList)){ + databaseDataFrames <- databaseList[[as.character(databaseId)]] + } else{ + ParallelLogger::logInfo('database ID not found in databaseList so added new entry') + databaseDataFrames <- createDatabaseList(cdmDatabaseSchemas = databaseSchema, databaseRefIds = databaseId)[[1]] + } + } + + + # check the database_meta_data + result <- checkTable(conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + tablePrefix = databaseSchemaSettings$tablePrefix, + targetDialect = databaseSchemaSettings$targetDialect, 
+ tableName = 'database_meta_data', + columnNames = c('database_id'), + values = c(paste0("'",databaseDataFrames$databaseMetaData$databaseId,"'")), + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + + if(nrow(result)>0){ + ParallelLogger::logInfo(paste0('Database meta data ', databaseDataFrames$database_meta_data$databaseId ,' already exists')) + } else { + + sql <- "INSERT INTO @my_schema.@string_to_appenddatabase_meta_data( + database_id, + cdm_source_name, + cdm_source_abbreviation + ) + VALUES ('@database_id','@cdm_source_name', '@cdm_source_abbreviation');" + sql <- SqlRender::render( + sql, + my_schema = databaseSchemaSettings$resultSchema, + database_id = databaseDataFrames$databaseMetaData$databaseId, + cdm_source_name = databaseDataFrames$databaseMetaData$cdmSourceName, + cdm_source_abbreviation = databaseDataFrames$databaseMetaData$cdmSourceAbbreviation, + string_to_append = databaseSchemaSettings$tablePrefix + ) + sql <- SqlRender::translate(sql, targetDialect = databaseSchemaSettings$targetDialect, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema) + DatabaseConnector::executeSql(conn, sql) + + result <- checkTable(conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + tablePrefix = databaseSchemaSettings$tablePrefix, + targetDialect = databaseSchemaSettings$targetDialect, + tableName = 'database_meta_data', + columnNames = c('database_id', 'cdm_source_name', + 'cdm_source_abbreviation'), + values = c(paste0("'",databaseDataFrames$databaseMetaData$databaseId,"'"), + paste0("'",databaseDataFrames$databaseMetaData$cdmSourceName,"'"), + paste0("'",databaseDataFrames$databaseMetaData$cdmSourceAbbreviation,"'")), + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + + } + + result <- checkTable(conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + tablePrefix = databaseSchemaSettings$tablePrefix, + targetDialect = databaseSchemaSettings$targetDialect, + tableName = 
'database_details', + columnNames = c('database_meta_data_id'), + values = c(paste0("'",databaseDataFrames$databaseDetails$databaseMetaDataId,"'") + ), + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + + if(nrow(result)>0){ + ParallelLogger::logInfo(paste0('Database', result$databaseId ,' already exists')) + } else { + + sql <- "INSERT INTO @my_schema.@string_to_appenddatabase_details(database_meta_data_id) + VALUES ('@database_meta_data_id');" + sql <- SqlRender::render(sql, + my_schema = databaseSchemaSettings$resultSchema, + database_meta_data_id = databaseDataFrames$databaseDetails$databaseMetaDataId, + string_to_append = databaseSchemaSettings$tablePrefix) + sql <- SqlRender::translate(sql, targetDialect = databaseSchemaSettings$targetDialect, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema) + DatabaseConnector::executeSql(conn, sql) + + result <- checkTable(conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + tablePrefix = databaseSchemaSettings$tablePrefix, + targetDialect = databaseSchemaSettings$targetDialect, + tableName = 'database_details', + columnNames = c('database_meta_data_id'), + values = c(paste0("'",databaseDataFrames$databaseDetails$databaseMetaDataId,"'") + ), + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + + } + + return(result$databaseId[1]) + +} + + + + + + + + + diff --git a/R/uploadToDatabaseDiagnostics.R b/R/uploadToDatabaseDiagnostics.R new file mode 100644 index 000000000..0d10506e0 --- /dev/null +++ b/R/uploadToDatabaseDiagnostics.R @@ -0,0 +1,369 @@ +#' Insert mutliple diagnosePlp results saved to a directory into a PLP result schema database +#' @description +#' This function inserts diagnosePlp results into the result schema +#' +#' @details +#' This function can be used to upload diagnosePlp results into a database +#' +#' @param connectionDetails A connection details created by using the +#' function \code{createConnectionDetails} in the +#' 
\code{DatabaseConnector} package. +#' @param databaseSchemaSettings A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables +#' @param cohortDefinitions (list) A list of cohortDefinitions (each list must contain: name, id) +#' @param databaseList (Optional) ... +#' @param resultLocation The location of the diagnostic results +#' +#' @return +#' Returns NULL but uploads multiple diagnosePlp results into the database schema specified in databaseSchemaSettings +#' +#' @export +addMultipleDiagnosePlpToDatabase <- function( + connectionDetails, + databaseSchemaSettings, + cohortDefinitions, + databaseList = NULL, + resultLocation +){ + + diagnosisFiles <- file.path(resultLocation, dir(resultLocation, pattern = 'Analysis_'), 'diagnosePlp.rds') + + if(length(diagnosisFiles) == 0){ + ParallelLogger::logInfo('No diagnostic results found') + return(NULL) + } + + for(diagnosisFile in diagnosisFiles){ + if(file.exists(diagnosisFile)){ + diagnosePlp <- readRDS(diagnosisFile) + addDiagnosePlpToDatabase( + diagnosePlp = diagnosePlp, + connectionDetails = connectionDetails, + databaseSchemaSettings = databaseSchemaSettings, + cohortDefinitions = cohortDefinitions, + databaseList = databaseList + ) + } + } + return(invisible(NULL)) +} + +#' Insert a diagnostic result into a PLP result schema database +#' @description +#' This function inserts a diagnostic result into the result schema +#' +#' @details +#' This function can be used to upload a diagnostic result into a database +#' +#' @param diagnosePlp An object of class \code{diagnosePlp} +#' @param connectionDetails A connection details created by using the +#' function \code{createConnectionDetails} in the +#' \code{DatabaseConnector} package. 
+#' @param databaseSchemaSettings A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables +#' @param cohortDefinitions A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet() +#' @param databaseList (Optional) If you wish to overwrite the settings in the plp object use \code{createdatabaseList} to specify the databases +#' @param overWriteIfExists (default: T) Whether to delete existing results and overwrite them +#' +#' @return +#' Returns NULL but uploads the diagnostic into the database schema specified in databaseSchemaSettings +#' +#' @export +addDiagnosePlpToDatabase <- function( + diagnosePlp, + connectionDetails, + databaseSchemaSettings, + cohortDefinitions, + databaseList = NULL, + overWriteIfExists = T +){ + + conn <- DatabaseConnector::connect(connectionDetails = connectionDetails) + on.exit(DatabaseConnector::disconnect(conn)) + + modelDesignId <- insertModelDesignInDatabase( + object = diagnosePlp$modelDesign, + conn = conn, + databaseSchemaSettings = databaseSchemaSettings, + cohortDefinitions = cohortDefinitions + ) + + databaseId <- addDatabase( + conn = conn, + databaseSchemaSettings = databaseSchemaSettings, + databaseList = databaseList, + databaseSchema = diagnosePlp$databaseSchema, + databaseId = diagnosePlp$databaseId + ) + + diagnoseId <- insertDiagnosisToDatabase( + diagnostics = diagnosePlp, + conn = conn, + databaseSchemaSettings = databaseSchemaSettings, + modelDesignId = modelDesignId, + databaseId = databaseId, + overWriteIfExists = overWriteIfExists + ) + + return(invisible(diagnoseId)) +} + + +insertDiagnosisToDatabase <- function( + diagnostics, + conn, + databaseSchemaSettings, + modelDesignId, + databaseId, + overWriteIfExists = T +){ + + diagnosticId <- addDiagnostic( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + + modelDesignId = modelDesignId, + databaseId = 
databaseId, + + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + ParallelLogger::logInfo(paste0('diagnosticId: ', diagnosticId)) + + # now add the four tables + + ParallelLogger::logInfo('Adding DiagnosticSummary') + tryCatch({ + addResultTable( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + + tableName = 'diagnostic_summary', + resultIdName = 'diagnosticId', + resultId = diagnosticId, + object = diagnostics$summary, + + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema, + overWriteIfExists = overWriteIfExists + )}, + error = function(e){ParallelLogger::logError(e);} + ) + + ParallelLogger::logInfo('Adding DiagnosticParticipants') + tryCatch({ + addResultTable( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + + tableName = 'diagnostic_participants', + resultIdName = 'diagnosticId', + resultId = diagnosticId, + object = diagnostics$participants, + + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema, + overWriteIfExists = overWriteIfExists + )}, + error = function(e){ParallelLogger::logError(e);} + ) + + ParallelLogger::logInfo('Adding DiagnosticPredictors') + tryCatch({ + addResultTable( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + + tableName = 'diagnostic_predictors', + resultIdName = 'diagnosticId', + resultId = diagnosticId, + object = diagnostics$predictors, + + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema, + overWriteIfExists = overWriteIfExists + )}, + error = function(e){ParallelLogger::logError(e);} + ) + + ParallelLogger::logInfo('Adding 
DiagnosticOutcomes') + tryCatch({ + addResultTable( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + + tableName = 'diagnostic_outcomes', + resultIdName = 'diagnosticId', + resultId = diagnosticId, + object = diagnostics$outcomes, + + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema, + overWriteIfExists = overWriteIfExists + )}, + error = function(e){ParallelLogger::logError(e);} + ) + + ParallelLogger::logInfo('Adding DiagnosticDesigns') + tryCatch({ + addResultTable( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + + tableName = 'diagnostic_designs', + resultIdName = 'diagnosticId', + resultId = diagnosticId, + object = diagnostics$designs, + + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema, + overWriteIfExists = overWriteIfExists + )}, + error = function(e){ParallelLogger::logError(e);} + ) + + return(invisible(diagnosticId)) +} + + + +addDiagnostic <- function( + conn, + resultSchema, + targetDialect, + + modelDesignId, + databaseId, + + tablePrefix, + tempEmulationSchema +){ + + result <- checkTable(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'diagnostics', + columnNames = c( + 'model_design_id', + 'database_id' + ), + values = c( + modelDesignId, + databaseId + ), + tempEmulationSchema = tempEmulationSchema + ) + + if(nrow(result)==0){ + # model + sql <- "INSERT INTO @my_schema.@string_to_appenddiagnostics ( + model_design_id, + database_id + ) + VALUES ( + @model_design_id, + @database_id + )" + sql <- SqlRender::render(sql, + my_schema = resultSchema, + model_design_id = modelDesignId, + database_id = databaseId, + string_to_append = tablePrefix) + sql <- SqlRender::translate(sql, targetDialect 
= targetDialect, + tempEmulationSchema = tempEmulationSchema) + DatabaseConnector::executeSql(conn, sql) + + #getId of new + result <- checkTable(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'diagnostics', + columnNames = c( + 'model_design_id', + 'database_id' + ), + values = c( + modelDesignId, + databaseId + ), + tempEmulationSchema = tempEmulationSchema + ) + + } + + return(result$diagnosticId[1]) +} + +# replace the performance inserts with this single function... +addResultTable <- function( + conn = conn, + resultSchema, + targetDialect, + tableName = 'diagnostic_summary', + resultIdName = 'diagnosticId', + resultId, + object, + tablePrefix, + tempEmulationSchema, + overWriteIfExists = T +){ + + object[resultIdName] <- resultId + + # get column names and check all present in object + columnNames <- getColumnNames( + conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tableName = paste0(tablePrefix,tableName), + tempEmulationSchema = tempEmulationSchema + ) + isValid <- sum(colnames(object)%in%columnNames) == length(columnNames) + + exists <- checkResultExists( + conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tableName = paste0(tablePrefix,tableName), + resultIdName = SqlRender::camelCaseToSnakeCase(resultIdName), + resultId = resultId, + tempEmulationSchema = tempEmulationSchema + ) + + if(isValid && (!exists || overWriteIfExists)){ + + # REMOVE existing result + if(exists){ + sql <- "delete from @result_schema.@table_name where @result_id_name = @result_id;" + sql <- SqlRender::render(sql, + result_id_name = SqlRender::camelCaseToSnakeCase(resultIdName), + result_id = resultId, + result_schema = resultSchema, + table_name = paste0(tablePrefix,tableName) + ) + sql <- SqlRender::translate(sql, + targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema) + DatabaseConnector::executeSql(conn, sql) + } + + # add + 
DatabaseConnector::insertTable( + connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix,tableName), + data = as.data.frame(object[,columnNames]), + dropTableIfExists = F, createTable = F, tempTable = F, + bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T, + tempEmulationSchema = tempEmulationSchema + ) + } + + return(invisible(NULL)) + } diff --git a/R/uploadToDatabaseModelDesign.R b/R/uploadToDatabaseModelDesign.R new file mode 100644 index 000000000..f99decd90 --- /dev/null +++ b/R/uploadToDatabaseModelDesign.R @@ -0,0 +1,995 @@ +#' Insert a model design into a PLP result schema database +#' @description +#' This function inserts a model design and all the settings into the result schema +#' +#' @details +#' This function can be used to upload a model design into a database +#' +#' @param object An object of class modelDesign, runPlp or externalValidatePlp +#' @param conn A connection to a database created by using the +#' function \code{connect} in the +#' \code{DatabaseConnector} package. 
+#' @param databaseSchemaSettings A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables +#' @param cohortDefinitions A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet() +#' +#' @return +#' Returns NULL but uploads the model design into the database schema specified in databaseSchemaSettings +#' +#' @export +insertModelDesignInDatabase <- function( + object, + conn, + databaseSchemaSettings, + cohortDefinitions +){ + + # REMOVE THIS + if(inherits(object, 'externalValidatePlp') | inherits(object, 'runPlp')){ + + object <- PatientLevelPrediction::createModelDesign( + targetId = object$model$modelDesign$targetId, + outcomeId = object$model$modelDesign$outcomeId, + restrictPlpDataSettings = object$model$modelDesign$restrictPlpDataSettings, + populationSettings = object$model$modelDesign$populationSettings, + covariateSettings = object$model$modelDesign$covariateSettings, + featureEngineeringSettings = object$model$modelDesign$featureEngineeringSettings, + sampleSettings = object$model$modelDesign$sampleSettings, + preprocessSettings = object$model$modelDesign$preprocessSettings, + modelSettings = object$model$modelDesign$modelSettings, + runCovariateSummary = T + ) + + } + + if(inherits(object, 'modelDesign')){ + modelDesignId <- insertModelDesignSettings( + object = object, + conn = conn, + databaseSchemaSettings = databaseSchemaSettings, + cohortDefinitions = cohortDefinitions + ) + return(modelDesignId) + } + + return(NULL) +} + + +# this function inserts all the settings for the model design +# it returns the model_design_id for the database +insertModelDesignSettings <- function( + object, + conn, + databaseSchemaSettings, + cohortDefinitions +){ + + if(!inherits(x = object, what = 'modelDesign')){ + stop('object in insertModelDesign() is not a modelDesign') + } + + + # add TAR + tarId <- addTar( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + 
targetDialect = databaseSchemaSettings$targetDialect, + startDay = object$populationSettings$riskWindowStart, + startAnchor = object$populationSettings$startAnchor, + endDay = object$populationSettings$riskWindowEnd, + endAnchor = object$populationSettings$endAnchor, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + ParallelLogger::logInfo(paste0('tarId: ', tarId)) + + tId <- addCohort( + conn = conn, + resultSchema = databaseSchemaSettings$cohortDefinitionSchema, + targetDialect = databaseSchemaSettings$targetDialect, + cohortDefinition = getCohortDef(cohortDefinitions, object$targetId), + tablePrefix = databaseSchemaSettings$tablePrefixCohortDefinitionTables, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + ParallelLogger::logInfo(paste0('tId: ', tId)) + + oId <- addCohort( + conn = conn, + resultSchema = databaseSchemaSettings$cohortDefinitionSchema, + targetDialect = databaseSchemaSettings$targetDialect, + cohortDefinition = getCohortDef(cohortDefinitions, object$outcomeId), + tablePrefix = databaseSchemaSettings$tablePrefixCohortDefinitionTables, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + ParallelLogger::logInfo(paste0('oId: ', oId)) + + popSetId <- addPopulationSetting( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + json = object$populationSettings, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + ParallelLogger::logInfo(paste0('popSetId: ', popSetId)) + + covSetId <- addCovariateSetting( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + json = object$covariateSettings, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + 
ParallelLogger::logInfo(paste0('covSetId: ', covSetId)) + + modSetId <- addModelSetting( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + modelType = attr(object$modelSettings$param, 'settings')$modelType, # make this the same as model$trainDetails$modelName? + json = object$modelSettings, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + ParallelLogger::logInfo(paste0('modSetId: ', modSetId)) + + # NEW: add plp_data_settings + plpDataSetId <- addPlpDataSetting( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + json = object$restrictPlpDataSettings, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + ParallelLogger::logInfo(paste0('plpDataSetId: ', plpDataSetId)) + + # NEW: add FE_settings + FESetId <- addFESetting( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + json = object$featureEngineeringSettings, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + ParallelLogger::logInfo(paste0('FESetId: ', FESetId)) + + # NEW: add sample_settings + sampleSetId <- addSampleSetting( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + json = object$sampleSettings, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + ParallelLogger::logInfo(paste0('sampleSetId: ', sampleSetId)) + + # NEW: add tidy_covariate_settings + tidySetId <- addTidySetting( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + json = 
object$preprocessSettings, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + ParallelLogger::logInfo(paste0('tidySetId: ', tidySetId)) + + + # this is now split setting - update this function + splitId <- addSplitSettings( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + json = object$splitSettings, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + ParallelLogger::logInfo(paste0('splitId: ', splitId)) + + # create this function + modelDesignId <- addModelDesign( # need to create + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + targetId = tId, + outcomeId = oId, + tarId = tarId, + plpDataSettingId = plpDataSetId, + populationSettingId = popSetId, + modelSettingId = modSetId, + covariateSettingId = covSetId, + sampleSettingId = sampleSetId, + splitSettingId = splitId, # changed from trainingId + featureEngineeringSettingId = FESetId, + tidyCovariatesSettingId = tidySetId, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + ParallelLogger::logInfo(paste0('modelDesignId: ', modelDesignId)) + + return(modelDesignId) +} + +addModelDesign <- function( + conn, + resultSchema, targetDialect, + tablePrefix = tablePrefix, + targetId, + outcomeId, + tarId, + plpDataSettingId, + populationSettingId, + modelSettingId, + covariateSettingId, + sampleSettingId, + splitSettingId, + featureEngineeringSettingId, + tidyCovariatesSettingId, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") +){ + + if(is.null(targetId)){ + stop('targetId is null') + } + if(is.null(outcomeId)){ + stop('outcomeId is null') + } + if(is.null(tarId)){ + stop('tarId is null') + } + + if(is.null(plpDataSettingId)){ + 
stop('plpDataSettingId is null') + } + if(is.null(populationSettingId)){ + stop('populationSettingId is null') + } + if(is.null(modelSettingId)){ + stop('modelSettingId is null') + } + if(is.null(covariateSettingId)){ + stop('covariateSettingId is null') + } + if(is.null(sampleSettingId)){ + stop('sampleSettingId is null') + } + if(is.null(splitSettingId)){ + stop('splitSettingId is null') + } + if(is.null(featureEngineeringSettingId)){ + stop('featureEngineeringSettingId is null') + } + if(is.null(tidyCovariatesSettingId)){ + stop('tidyCovariatesSettingId is null') + } + + # process json to make it ordered... + # TODO + + result <- checkTable( + conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'model_designs', + columnNames = c( + 'target_id', + 'outcome_id', + 'tar_id', + 'plp_data_setting_id', + 'population_setting_id', + 'model_setting_id', + 'covariate_setting_id', + 'sample_setting_id', + 'split_setting_id', + 'feature_engineering_setting_id', + 'tidy_covariates_setting_id' + ), + values = c( + targetId, + outcomeId, + tarId, + plpDataSettingId, + populationSettingId, + modelSettingId, + covariateSettingId, + sampleSettingId, + splitSettingId, + featureEngineeringSettingId, + tidyCovariatesSettingId + ), + tempEmulationSchema = tempEmulationSchema + ) + + if(nrow(result)==0){ + # model + sql <- "INSERT INTO @my_schema.@string_to_appendmodel_designs( + target_id, + outcome_id, + tar_id, + plp_data_setting_id, + population_setting_id, + model_setting_id, + covariate_setting_id, + sample_setting_id, + split_setting_id, + feature_engineering_setting_id, + tidy_covariates_setting_id + ) VALUES + ( + @target_id, + @outcome_id, + @tar_id, + @plp_data_setting_id, + @population_setting_id, + @model_setting_id, + @covariate_setting_id, + @sample_setting_id, + @split_setting_id, + @feature_engineering_setting_id, + @tidy_covariates_setting_id + )" + sql <- SqlRender::render( + sql, + my_schema = 
resultSchema, + target_id = targetId, + outcome_id = outcomeId, + tar_id = tarId, + plp_data_setting_id= plpDataSettingId, + population_setting_id = populationSettingId, + model_setting_id = modelSettingId, + covariate_setting_id = covariateSettingId, + sample_setting_id = sampleSettingId, + split_setting_id = splitSettingId, + feature_engineering_setting_id = featureEngineeringSettingId, + tidy_covariates_setting_id = tidyCovariatesSettingId, + string_to_append = tablePrefix + ) + sql <- SqlRender::translate(sql, targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema) + DatabaseConnector::executeSql(conn, sql) + + #getId of new + result <- checkTable(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'model_designs', + columnNames = c( + 'target_id', + 'outcome_id', + 'tar_id', + 'plp_data_setting_id', + 'population_setting_id', + 'model_setting_id', + 'covariate_setting_id', + 'sample_setting_id', + 'split_setting_id', + 'feature_engineering_setting_id', + 'tidy_covariates_setting_id' + ), + values = c(targetId, + outcomeId, + tarId, + plpDataSettingId, + populationSettingId, + modelSettingId, + covariateSettingId, + sampleSettingId, + splitSettingId, + featureEngineeringSettingId, + tidyCovariatesSettingId + ), + tempEmulationSchema = tempEmulationSchema + ) + + } + + return(result$modelDesignId[1]) +} + + +addTar <- function(conn, resultSchema, targetDialect, + tablePrefix = '', + startDay, + startAnchor, + endDay, + endAnchor, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") +){ + + result <- checkTable(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'tars', + columnNames = c('tar_start_day', 'tar_start_anchor', + 'tar_end_day', 'tar_end_anchor'), + values = c(startDay, + paste0("'",startAnchor,"'"), + endDay, + paste0("'",endAnchor,"'")), + tempEmulationSchema = tempEmulationSchema + 
) + + if(nrow(result)==0){ + + ParallelLogger::logInfo('Adding TAR') + # tars - id 1 + sql <- "INSERT INTO @my_schema.@string_to_appendtars(tar_start_day, tar_start_anchor, + tar_end_day, tar_end_anchor) + VALUES (@tar_start_day, @tar_start_anchor, @tar_end_day, @tar_end_anchor);" + sql <- SqlRender::render(sql, + my_schema = resultSchema, + tar_start_day = startDay, + tar_start_anchor = paste0("'",startAnchor,"'"), + tar_end_day = endDay, + tar_end_anchor = paste0("'",endAnchor,"'"), + string_to_append = tablePrefix) + + sql <- SqlRender::translate(sql, targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema) + + DatabaseConnector::executeSql(conn, sql) + + #getId of new + result <- checkTable(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'tars', + columnNames = c('tar_start_day', 'tar_start_anchor', + 'tar_end_day', 'tar_end_anchor'), + values = c(startDay, + paste0("'",startAnchor,"'"), + endDay, + paste0("'",endAnchor,"'")), + tempEmulationSchema = tempEmulationSchema + ) + + } else { + ParallelLogger::logInfo('TAR exists') + } + + + return(result$tarId[1]) + +} + +addPopulationSetting <- function(conn, resultSchema, targetDialect, + tablePrefix = '', + json, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") +){ + + # process json to make it ordered... 
+ # make sure the json has been converted + if(!inherits(x = json, what = 'character')){ + json <- orderJson(json) # to ensure attributes are alphabetic order + json <- ParallelLogger::convertSettingsToJson(json) + json <- as.character(json) # now convert to character + } + + jsonId <- checkJson(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'population_settings', + jsonColumnName = 'populationSettingsJson', + id = 'populationSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema) + + if(is.null(jsonId)){ + ParallelLogger::logInfo('Adding new population settings') + + data <- data.frame(populationSettingsJson = json) + DatabaseConnector::insertTable(connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix, 'population_settings'), + data = data, + dropTableIfExists = F, + createTable = F, + tempTable = F, + progressBar = T, + camelCaseToSnakeCase = T, + tempEmulationSchema = tempEmulationSchema + ) + + #getId of new + jsonId <- checkJson(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'population_settings', + jsonColumnName = 'populationSettingsJson', + id = 'populationSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema) + + } else{ + ParallelLogger::logInfo('Population settings exists') + } + + return(jsonId) +} + + +addCovariateSetting <- function(conn, resultSchema, targetDialect, + tablePrefix = '', + json, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") +){ + + # process json to make it ordered... 
+ # make sure the json has been converted + if(!inherits(x = json, what = 'character')){ + json <- orderJson(json) # to ensure attributes are alphabetic order + json <- ParallelLogger::convertSettingsToJson(json) + json <- as.character(json) # now convert to character + } + + jsonId <- checkJson( + conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'covariate_settings', + jsonColumnName = 'covariateSettingsJson', + id = 'covariateSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema + ) + + if(is.null(jsonId)){ + + ParallelLogger::logInfo('Adding new covariate settings') + + data <- data.frame(covariateSettingsJson = json) + DatabaseConnector::insertTable(connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix, 'covariate_settings'), + data = data, + dropTableIfExists = F, + createTable = F, + tempTable = F, + progressBar = T, + camelCaseToSnakeCase = T, + tempEmulationSchema = tempEmulationSchema + ) + + #getId of new + jsonId <- checkJson(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'covariate_settings', + jsonColumnName = 'covariateSettingsJson', + id = 'covariateSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema) + + } else{ + ParallelLogger::logInfo('Covariate setting exists') + } + + return(jsonId) +} + + +addModelSetting <- function(conn, resultSchema, targetDialect, + tablePrefix = '', + modelType, json, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") +){ + + # process json to make it ordered... 
+ # make sure the json has been converted + if(!inherits(x = json, what = 'character')){ + json <- orderJson(json) # to ensure attributes are alphabetic order + json <- ParallelLogger::convertSettingsToJson(json) + json <- as.character(json) # now convert to character + } + + jsonId <- checkJson(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'model_settings', + jsonColumnName = 'modelSettingsJson', + id = 'modelSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema) + + if(is.null(jsonId)){ + + ParallelLogger::logInfo('Adding new model settings') + + data <- data.frame(modelType = modelType, + modelSettingsJson = json) + DatabaseConnector::insertTable(connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix, 'model_settings'), + data = data, + dropTableIfExists = F, + createTable = F, + tempTable = F, + progressBar = T, + camelCaseToSnakeCase = T, + tempEmulationSchema = tempEmulationSchema) + + #getId of new + jsonId <- checkJson(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'model_settings', + jsonColumnName = 'modelSettingsJson', + id = 'modelSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema) + + } else{ + ParallelLogger::logInfo('Model setting exists') + } + + return(jsonId) +} + +addTidySetting <- function( + conn, + resultSchema, + targetDialect, + tablePrefix = '', + json, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") +){ + + if(!inherits(x = json, what = 'character')){ + json <- orderJson(json) # to ensure attributes are alphabetic order + json <- ParallelLogger::convertSettingsToJson(json) + json <- as.character(json) # now convert to character + } + + jsonId <- checkJson(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'tidy_covariates_settings', + 
jsonColumnName = 'tidyCovariatesSettingsJson', + id = 'tidyCovariatesSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema) + + if(is.null(jsonId)){ + + ParallelLogger::logInfo('Adding new tidy covariates settings') + + data <- data.frame( + tidyCovariatesSettingsJson = json + ) + + DatabaseConnector::insertTable( + connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix, 'tidy_covariates_settings'), + data = data, + dropTableIfExists = F, + createTable = F, + tempTable = F, + progressBar = T, + camelCaseToSnakeCase = T, + tempEmulationSchema = tempEmulationSchema + ) + + #getId of new + jsonId <- checkJson( + conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'tidy_covariates_settings', + jsonColumnName = 'tidyCovariatesSettingsJson', + id = 'tidyCovariatesSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema + ) + + } else{ + ParallelLogger::logInfo('tidy covariates setting exists') + } + + return(jsonId) + +} + +addSampleSetting <- function( + conn, + resultSchema, + targetDialect, + tablePrefix = '', + json, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") +){ + + if(!inherits(x = json, what = 'character')){ + json <- orderJson(json) # to ensure attributes are alphabetic order + json <- ParallelLogger::convertSettingsToJson(json) + json <- as.character(json) # now convert to character + } + + jsonId <- checkJson( + conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'sample_settings', + jsonColumnName = 'sampleSettingsJson', + id = 'sampleSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema + ) + + if(is.null(jsonId)){ + + ParallelLogger::logInfo('Adding new sample settings') + + data <- data.frame( + sampleSettingsJson = json + ) + + DatabaseConnector::insertTable( + connection = conn, + databaseSchema = resultSchema, + tableName 
= paste0(tablePrefix, 'sample_settings'), + data = data, + dropTableIfExists = F, + createTable = F, + tempTable = F, + progressBar = T, + camelCaseToSnakeCase = T, + tempEmulationSchema = tempEmulationSchema + ) + + #getId of new + jsonId <- checkJson( + conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'sample_settings', + jsonColumnName = 'sampleSettingsJson', + id = 'sampleSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema + ) + + } else{ + ParallelLogger::logInfo('sample setting exists') + } + + return(jsonId) + +} + +addPlpDataSetting <- function( + conn, + resultSchema, + targetDialect, + tablePrefix = '', + json, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") +){ + + if(!inherits(x = json, what = 'character')){ + json <- orderJson(json) # to ensure attributes are alphabetic order + json <- ParallelLogger::convertSettingsToJson(json) + json <- as.character(json) # now convert to character + } + + jsonId <- checkJson(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'plp_data_settings', + jsonColumnName = 'plpDataSettingsJson', + id = 'plpDataSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema) + + if(is.null(jsonId)){ + + ParallelLogger::logInfo('Adding new plp data settings') + + data <- data.frame( + plpDataSettingsJson = json + ) + + DatabaseConnector::insertTable( + connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix, 'plp_data_settings'), + data = data, + dropTableIfExists = F, + createTable = F, + tempTable = F, + progressBar = T, + camelCaseToSnakeCase = T, + tempEmulationSchema = tempEmulationSchema + ) + + #getId of new + jsonId <- checkJson( + conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'plp_data_settings', + jsonColumnName = 
'plpDataSettingsJson', + id = 'plpDataSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema + ) + + } else{ + ParallelLogger::logInfo('Split setting exists') + } + + return(jsonId) + +} + +addFESetting <- function( + conn, + resultSchema, + targetDialect, + tablePrefix = '', + json, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") +){ + + if(!inherits(x = json, what = 'character')){ + json <- orderJson(json) # to ensure attributes are alphabetic order + json <- ParallelLogger::convertSettingsToJson(json) + json <- as.character(json) # now convert to character + } + + jsonId <- checkJson( + conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'feature_engineering_settings', + jsonColumnName = 'featureEngineeringSettingsJson', + id = 'featureEngineeringSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema + ) + + if(is.null(jsonId)){ + + ParallelLogger::logInfo('Adding new feature_engineering settings') + + data <- data.frame( + featureEngineeringSettingsJson = json + ) + + DatabaseConnector::insertTable( + connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix, 'feature_engineering_settings'), + data = data, + dropTableIfExists = F, + createTable = F, + tempTable = F, + progressBar = T, + camelCaseToSnakeCase = T, + tempEmulationSchema = tempEmulationSchema + ) + + #getId of new + jsonId <- checkJson( + conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'feature_engineering_settings', + jsonColumnName = 'featureEngineeringSettingsJson', + id = 'featureEngineeringSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema + ) + + } else{ + ParallelLogger::logInfo('feature engineering setting exists') + } + + return(jsonId) + +} + +addSplitSettings <- function( + conn, + resultSchema, + targetDialect, + tablePrefix = '', + json, + 
tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") +){ + + if(!inherits(x = json, what = 'character')){ + json <- orderJson(json) # to ensure attributes are alphabetic order + json <- ParallelLogger::convertSettingsToJson(json) + json <- as.character(json) # now convert to character + } + + jsonId <- checkJson( + conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'split_settings', + jsonColumnName = 'splitSettingsJson', + id = 'splitSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema + ) + + if(is.null(jsonId)){ + + ParallelLogger::logInfo('Adding new split settings') + + data <- data.frame( + splitSettingsJson = json + ) + + DatabaseConnector::insertTable( + connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix, 'split_settings'), + data = data, + dropTableIfExists = F, + createTable = F, + tempTable = F, + progressBar = T, + camelCaseToSnakeCase = T, + tempEmulationSchema = tempEmulationSchema + ) + + #getId of new + jsonId <- checkJson( + conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'split_settings', + jsonColumnName = 'splitSettingsJson', + id = 'splitSettingId', + json = json, + tempEmulationSchema = tempEmulationSchema + ) + + } else{ + ParallelLogger::logInfo('Split setting exists') + } + + return(jsonId) + +} + + + +# the ParallelLogger conversion orders attributes - use this for consistency +orderJson <- function(x){ +newx <- ParallelLogger::convertJsonToSettings( + ParallelLogger::convertSettingsToJson( + x + ) + ) + +return(newx) +} diff --git a/R/uploadToDatabasePerformance.R b/R/uploadToDatabasePerformance.R new file mode 100644 index 000000000..4b36af84c --- /dev/null +++ b/R/uploadToDatabasePerformance.R @@ -0,0 +1,835 @@ +insertPerformanceInDatabase <- function( + performanceEvaluation = NULL, + covariateSummary = NULL, + attrition = NULL, + 
executionDateTime, + plpVersion, + + conn, + databaseSchemaSettings, + + modelDesignId, + developmentDatabaseId, + + validationDatabaseId, + validationTarId, + validationPopulationId, + validationPlpDataId, + validationTargetId, + validationOutcomeId, + + modelDevelopment #added + +){ + + ParallelLogger::logInfo(paste0('Inserting performance...')) + + # add the results + performanceId <- addPerformance( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + + modelDesignId = modelDesignId, + developmentDatabaseId = developmentDatabaseId, + + validationDatabaseId = validationDatabaseId, + validationTargetId = validationTargetId, + validationOutcomeId = validationOutcomeId, + validationTarId = validationTarId, + validationPlpDataId = validationPlpDataId, + validationPopulationId = validationPopulationId, + + modelDevelopment = modelDevelopment, + executionDateTime = executionDateTime, + plpVersion = plpVersion, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + ParallelLogger::logInfo(paste0('performanceId: ', performanceId)) + + # add attrition + if(!is.null(attrition)){ + addAttrition( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + performanceId = performanceId, + attrition = attrition, + overWriteIfExists = T, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + } + + # add performance + #============= + if(!is.null(performanceEvaluation)){ + addEvaluation( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + performanceId = performanceId, + performanceEvaluation = performanceEvaluation, + overWriteIfExists = T, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = 
databaseSchemaSettings$tempEmulationSchema + ) + } + + if(!is.null(covariateSummary)){ + addCovariateSummary( + conn = conn, + resultSchema = databaseSchemaSettings$resultSchema, + targetDialect = databaseSchemaSettings$targetDialect, + performanceId = performanceId, + covariateSummary = covariateSummary, + restrictToIncluded = T, + overWriteIfExists = T, + tablePrefix = databaseSchemaSettings$tablePrefix, + tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema + ) + } + + return(invisible(performanceId)) + +} + + +addPerformance <- function( + conn, + resultSchema, + targetDialect, + + modelDesignId, + developmentDatabaseId, + + validationDatabaseId, + validationTargetId, + validationOutcomeId, + validationTarId, + validationPlpDataId, + validationPopulationId, + + modelDevelopment, + executionDateTime, + plpVersion, + tablePrefix, + tempEmulationSchema + +){ + + result <- checkTable(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'performances', + columnNames = c( + 'model_design_id', + 'development_database_id', + + 'validation_database_id', + 'target_id', + 'outcome_id', + 'tar_id', + 'plp_data_setting_id', + 'population_setting_id', + 'model_development' + ), + values = c( + modelDesignId, + developmentDatabaseId, + + validationDatabaseId, + validationTargetId, + validationOutcomeId, + validationTarId, + validationPlpDataId, + validationPopulationId, + modelDevelopment + ), + tempEmulationSchema = tempEmulationSchema + ) + + if(nrow(result)==0){ + # model + sql <- "INSERT INTO @my_schema.@string_to_appendperformances ( + model_design_id, + development_database_id, + + validation_database_id, + target_id, + outcome_id, + tar_id, + plp_data_setting_id, + population_setting_id, + + model_development, + + execution_date_time, + plp_version + ) + VALUES ( + @model_design_id, @development_database_id, + @validation_database_id, @validation_target_id, @validation_outcome_id, 
@validation_tar_id, + @validation_plp_data_setting_id, @validation_population_setting_id, + @model_development, + '@execution_date_time', '@plp_version')" + sql <- SqlRender::render(sql, + my_schema = resultSchema, + model_design_id = modelDesignId, + development_database_id = developmentDatabaseId, + + validation_database_id = validationDatabaseId, + validation_target_id = validationTargetId, + validation_outcome_id = validationOutcomeId, + validation_tar_id = validationTarId, + validation_plp_data_setting_id = validationPlpDataId, + validation_population_setting_id = validationPopulationId, + + model_development = modelDevelopment, + + execution_date_time = executionDateTime, + plp_version = plpVersion, + string_to_append = tablePrefix) + sql <- SqlRender::translate(sql, targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema) + DatabaseConnector::executeSql(conn, sql) + + #getId of new + result <- checkTable(conn = conn, + resultSchema = resultSchema, + tablePrefix = tablePrefix, + targetDialect = targetDialect, + tableName = 'performances', + columnNames = c( + 'model_design_id', + 'development_database_id', + + 'validation_database_id', + 'target_id', + 'outcome_id', + 'tar_id', + 'plp_data_setting_id', + 'population_setting_id', + + 'model_development' + ), + values = c( + modelDesignId, + developmentDatabaseId, + + validationDatabaseId, + validationTargetId, + validationOutcomeId, + validationTarId, + validationPlpDataId, + validationPopulationId, + + modelDevelopment + ), + tempEmulationSchema = tempEmulationSchema + ) + + } + + return(result$performanceId[1]) +} + +# attrition +addAttrition <- function( + conn, resultSchema, targetDialect, + tablePrefix = '', + performanceId, + attrition, + overWriteIfExists = T, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ + + value <- attrition + if(is.null(value)){ + return(NULL) + } + + # edit names + firstLower <- function(x) { + substr(x, 1, 1) <- tolower(substr(x, 1, 1)) + 
return(x) + } + colnames(value) <- sapply(colnames(value), firstLower ) + + value$performanceId <- performanceId + + # get column names and check all present in object + columnNames <- getColumnNames(conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tableName = paste0(tablePrefix,'attrition'), + tempEmulationSchema = tempEmulationSchema) + isValid <- sum(colnames(value)%in%columnNames) == length(columnNames) + + exists <- checkResultExists(conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tableName = paste0(tablePrefix,'attrition'), + resultId = performanceId, + tempEmulationSchema = tempEmulationSchema) + + if(isValid && (!exists || overWriteIfExists)){ + + # REMOVE existing result + if(exists){ + sql <- "delete from @result_schema.@table_name where performance_id = @performance_id;" + sql <- SqlRender::render(sql, + performance_id = performanceId, + result_schema = resultSchema, + table_name = paste0(tablePrefix,'attrition')) + sql <- SqlRender::translate(sql, + targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema) + DatabaseConnector::executeSql(conn, sql) + } + + # add + ParallelLogger::logInfo(paste0('Inserting attrition for performance ',performanceId)) + DatabaseConnector::insertTable(connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix,'attrition'), + data = value[,columnNames], + dropTableIfExists = F, createTable = F, tempTable = F, + bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T, + tempEmulationSchema = tempEmulationSchema) + } + + return(invisible(NULL)) +} + + +# evals +addEvaluation <- function(conn, resultSchema, targetDialect, + tablePrefix = '', + performanceId, + performanceEvaluation, + overWriteIfExists = T, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ + + ParallelLogger::logInfo('Adding PredictionDistribution') + tryCatch({addPredictionDistribution(conn = conn, resultSchema = resultSchema, 
targetDialect = targetDialect, + tablePrefix = tablePrefix, + performanceId = performanceId, + performanceEvaluation = performanceEvaluation, + overWriteIfExists = overWriteIfExists, + tempEmulationSchema = tempEmulationSchema)}, + error = function(e){ParallelLogger::logError(e);}) + + ParallelLogger::logInfo('Adding ThresholdSummary') + tryCatch({addThresholdSummary(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect, + tablePrefix = tablePrefix, + performanceId = performanceId, + performanceEvaluation = performanceEvaluation, + overWriteIfExists = overWriteIfExists, + tempEmulationSchema = tempEmulationSchema)}, + error = function(e){ParallelLogger::logError(e);}) + + ParallelLogger::logInfo('Adding EvaluationStatistics') + tryCatch({addEvaluationStatistics(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect, + tablePrefix = tablePrefix, + performanceId = performanceId, + performanceEvaluation = performanceEvaluation, + overWriteIfExists = overWriteIfExists, + tempEmulationSchema = tempEmulationSchema)}, + error = function(e){ParallelLogger::logError(e);}) + + ParallelLogger::logInfo('Adding CalibrationSummary') + tryCatch({addCalibrationSummary(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect, + tablePrefix = tablePrefix, + performanceId = performanceId, + performanceEvaluation = performanceEvaluation, + overWriteIfExists = overWriteIfExists, + tempEmulationSchema = tempEmulationSchema)}, + error = function(e){ParallelLogger::logError(e);}) + + ParallelLogger::logInfo('Adding DemographicSummary') + tryCatch({addDemographicSummary(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect, + tablePrefix = tablePrefix, + performanceId = performanceId, + performanceEvaluation = performanceEvaluation, + overWriteIfExists = overWriteIfExists, + tempEmulationSchema = tempEmulationSchema)}, + error = function(e){ParallelLogger::logError(e);}) + + return(invisible(NULL)) + +} + 
+addPredictionDistribution <- function(conn, resultSchema, targetDialect, + tablePrefix = '', + performanceId, + performanceEvaluation, + overWriteIfExists = T, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ + + value <- performanceEvaluation$predictionDistribution + if(is.null(value)){ + return(NULL) + } + + # edit names + firstLower <- function(x) { + substr(x, 1, 1) <- tolower(substr(x, 1, 1)) + return(x) + } + colnames(value) <- sapply(colnames(value), firstLower ) + if(sum(colnames(value)=='class')>0){ + colnames(value)[colnames(value)=='class'] <- 'classLabel' + } + + value$performanceId <- performanceId + + # get column names and check all present in object + columnNames <- getColumnNames(conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tableName = paste0(tablePrefix,'prediction_distribution'), + tempEmulationSchema = tempEmulationSchema) + isValid <- sum(colnames(value)%in%columnNames) == length(columnNames) + + exists <- checkResultExists(conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tableName = paste0(tablePrefix,'prediction_distribution'), + resultIdName = 'performance_id', + resultId = performanceId, + tempEmulationSchema = tempEmulationSchema) + + if(isValid && (!exists || overWriteIfExists)){ + + # REMOVE existing result + if(exists){ + sql <- "delete from @result_schema.@table_name where performance_id = @performance_id;" + sql <- SqlRender::render(sql, + performance_id = performanceId, + result_schema = resultSchema, + table_name = paste0(tablePrefix,'prediction_distribution')) + sql <- SqlRender::translate(sql, + targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema) + DatabaseConnector::executeSql(conn, sql) + } + + # add + ParallelLogger::logInfo(paste0('Inserting predictionDistribution for performance ', performanceId)) + DatabaseConnector::insertTable(connection = conn, + databaseSchema = resultSchema, + tableName = 
paste0(tablePrefix,'prediction_distribution'), + data = value[,columnNames], + dropTableIfExists = F, createTable = F, tempTable = F, + bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T, + tempEmulationSchema = tempEmulationSchema) + } + + return(invisible(NULL)) +} + +addThresholdSummary <- function(conn, resultSchema, targetDialect, + tablePrefix = '', + performanceId, + performanceEvaluation, + overWriteIfExists = T, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ + + + value <- performanceEvaluation$thresholdSummary + if(is.null(value)){ + return(NULL) + } + + # check numerical columns: + value <- cleanNum(value) + + # edit names + firstLower <- function(x) { + substr(x, 1, 1) <- tolower(substr(x, 1, 1)) + return(x) + } + colnames(value) <- sapply(colnames(value), firstLower ) + value$performanceId <- performanceId + + # get column names and check all present in object + columnNames <- getColumnNames(conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tablePrefix = tablePrefix, + tableName = 'threshold_summary', + tempEmulationSchema = tempEmulationSchema) + isValid <- sum(colnames(value)%in%columnNames) == length(columnNames) + + exists <- checkResultExists(conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tableName = paste0(tablePrefix,'threshold_summary'), + resultIdName = 'performance_id', + resultId = performanceId, + tempEmulationSchema = tempEmulationSchema) + + if(isValid && (!exists || overWriteIfExists)){ + + # REMOVE existing result + if(exists){ + sql <- "delete from @result_schema.@table_name where performance_id = @performance_id;" + sql <- SqlRender::render(sql, + result_schema = resultSchema, + performance_id = performanceId, + table_name = paste0(tablePrefix,'threshold_summary')) + sql <- SqlRender::translate(sql, + targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema) + DatabaseConnector::executeSql(conn, sql) + } + + # add + 
ParallelLogger::logInfo(paste0('Inserting thresholdSummary for performance ',performanceId)) + DatabaseConnector::insertTable(connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix,'threshold_summary'), + data = value[,columnNames], + dropTableIfExists = F, createTable = F, tempTable = F, + bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T, + tempEmulationSchema = tempEmulationSchema) + } + + return(invisible(NULL)) +} + + +addCalibrationSummary <- function(conn, resultSchema, targetDialect, + tablePrefix = '', + performanceId, + performanceEvaluation, + overWriteIfExists = T, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ + + + value <- performanceEvaluation$calibrationSummary + if(is.null(value)){ + return(NULL) + } + + # check numerical columns: + value <- cleanNum(value) + + # edit names + firstLower <- function(x) { + substr(x, 1, 1) <- tolower(substr(x, 1, 1)) + return(x) + } + colnames(value) <- sapply(colnames(value), firstLower ) + + value$performanceId <- performanceId + + # get column names and check all present in object + columnNames <- getColumnNames(conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tablePrefix = tablePrefix, + tableName = 'calibration_summary', + tempEmulationSchema = tempEmulationSchema) + isValid <- sum(colnames(value)%in%columnNames) == length(columnNames) + + exists <- checkResultExists(conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tableName = paste0(tablePrefix,'calibration_summary'), + resultIdName = 'performance_id', + resultId = performanceId, + tempEmulationSchema = tempEmulationSchema) + + if(isValid && (!exists || overWriteIfExists)){ + + # REMOVE existing result + if(exists){ + sql <- "delete from @result_schema.@table_name where performance_id = @performance_id;" + sql <- SqlRender::render(sql, + result_schema = resultSchema, + performance_id= performanceId, + table_name = 
paste0(tablePrefix,'calibration_summary')) + sql <- SqlRender::translate(sql, + targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema) + DatabaseConnector::executeSql(conn, sql) + } + + # add + ParallelLogger::logInfo(paste0('Inserting calibrationSummary for performance ', performanceId)) + DatabaseConnector::insertTable(connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix,'calibration_summary'), + data = value[,columnNames], + dropTableIfExists = F, createTable = F, tempTable = F, + bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T, + tempEmulationSchema = tempEmulationSchema) + } + + return(invisible(NULL)) +} + +addEvaluationStatistics <- function(conn, resultSchema, targetDialect, + tablePrefix = '', + performanceId, + performanceEvaluation, + overWriteIfExists = T, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ + + + value <- data.frame( + evaluation = unlist(performanceEvaluation$evaluationStatistics$evaluation), + metric = unlist(performanceEvaluation$evaluationStatistics$metric), + value = as.numeric(unlist(performanceEvaluation$evaluationStatistics$value)) + ) + + if(is.null(value)){ + return(NULL) + } + + # edit names + firstLower <- function(x) { + substr(x, 1, 1) <- tolower(substr(x, 1, 1)) + return(x) + } + colnames(value) <- sapply(colnames(value), firstLower ) + value$performanceId <- performanceId + + # get column names and check all present in object + columnNames <- getColumnNames(conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tablePrefix = tablePrefix, + tableName = 'evaluation_statistics', + tempEmulationSchema = tempEmulationSchema) + isValid <- sum(colnames(value)%in%columnNames) == length(columnNames) + + exists <- checkResultExists(conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tableName = paste0(tablePrefix,'evaluation_statistics'), + resultIdName = 'performance_id', + resultId = performanceId, + 
tempEmulationSchema = tempEmulationSchema) + + if(isValid && (!exists || overWriteIfExists)){ + + # REMOVE existing result + if(exists){ + sql <- "delete from @result_schema.@table_name where performance_id = @performance_id;" + sql <- SqlRender::render(sql, + result_schema = resultSchema, + performance_id = performanceId, + table_name = paste0(tablePrefix,'evaluation_statistics')) + sql <- SqlRender::translate(sql, + targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema) + DatabaseConnector::executeSql(conn, sql) + } + + # add + ParallelLogger::logInfo(paste0('Inserting evaluationSummary for performance ',performanceId)) + DatabaseConnector::insertTable(connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix,'evaluation_statistics'), + data = value[,columnNames], + dropTableIfExists = F, createTable = F, tempTable = F, + bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T, + tempEmulationSchema = tempEmulationSchema) + } + + return(invisible(NULL)) +} + +addDemographicSummary <- function(conn, resultSchema, targetDialect, + tablePrefix = '', + performanceId, + performanceEvaluation, + overWriteIfExists = T, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ + + + value <- performanceEvaluation$demographicSummary + if(is.null(value)){ + return(NULL) + } + + # edit names + firstLower <- function(x) { + substr(x, 1, 1) <- tolower(substr(x, 1, 1)) + return(x) + } + colnames(value) <- sapply(colnames(value), firstLower ) + #if(sum(colnames(value)=="p50PredictedProbability")>0){ + # colnames(value)[colnames(value)=="p50PredictedProbability"] <- 'medianPredictedProbability' + #} + + value$performanceId <- performanceId + + # get column names and check all present in object + columnNames <- getColumnNames(conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tablePrefix = tablePrefix, + tableName = 'demographic_summary', + tempEmulationSchema = tempEmulationSchema) + isValid <- 
sum(colnames(value)%in%columnNames) == length(columnNames) + + exists <- checkResultExists(conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tableName = paste0(tablePrefix,'demographic_summary'), + resultIdName = 'performance_id', + resultId = performanceId, + tempEmulationSchema = tempEmulationSchema) + + if(isValid && (!exists || overWriteIfExists)){ + + # REMOVE existing result + if(exists){ + sql <- "delete from @result_schema.@table_name where performance_id = @performance_id;" + sql <- SqlRender::render(sql, + result_schema = resultSchema, + performance_id = performanceId, + table_name = paste0(tablePrefix,'demographic_summary')) + sql <- SqlRender::translate(sql, + targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema) + DatabaseConnector::executeSql(conn, sql) + } + + # add + ParallelLogger::logInfo(paste0('Inserting demographicSummary for performance ',performanceId)) + DatabaseConnector::insertTable(connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix,'demographic_summary'), + data = value[,columnNames], + dropTableIfExists = F, createTable = F, tempTable = F, + bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T, + tempEmulationSchema = tempEmulationSchema) + } + + return(invisible(NULL)) +} + +addCovariateSummary <- function(conn, resultSchema, targetDialect, + tablePrefix = '', + performanceId, + covariateSummary, + restrictToIncluded = T, + overWriteIfExists = T, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){ + + + value <- covariateSummary + if(is.null(value)){ + return(NULL) + } + + # edit names + firstLower <- function(x) { + substr(x, 1, 1) <- tolower(substr(x, 1, 1)) + return(x) + } + colnames(value) <- sapply(colnames(value), firstLower ) + value$performanceId <- performanceId + # remove _ from names + colnames(value) <- gsub('_','', colnames(value)) + + if(restrictToIncluded){ + ParallelLogger::logInfo('Restricting to covariates included 
in model') + value <- value[value$covariateValue!=0 & !is.na(value$covariateValue),] + } + + # get column names and check all present in object + columnNames <- getColumnNames(conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tablePrefix = tablePrefix, + tableName = 'covariate_summary', + tempEmulationSchema = tempEmulationSchema) + isValid <- sum(colnames(value)%in%columnNames) == length(columnNames) + + exists <- checkResultExists(conn = conn, + resultSchema = resultSchema, + targetDialect = targetDialect, + tableName = paste0(tablePrefix,'covariate_summary'), + resultIdName = 'performance_id', + resultId = performanceId, + tempEmulationSchema = tempEmulationSchema) + + if(isValid && (!exists || overWriteIfExists)){ + + # REMOVE existing result + if(exists){ + ParallelLogger::logTrace('Removing existing covariateSummary') + sql <- "delete from @result_schema.@table_name where performance_id = @performance_id;" + sql <- SqlRender::render(sql, + result_schema = resultSchema, + performance_id = performanceId, + table_name = paste0(tablePrefix,'covariate_summary')) + sql <- SqlRender::translate(sql, + targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema) + DatabaseConnector::executeSql(conn, sql) + } + + # add + ParallelLogger::logInfo(paste0('Inserting covariateSummary for result ', performanceId)) + DatabaseConnector::insertTable(connection = conn, + databaseSchema = resultSchema, + tableName = paste0(tablePrefix,'covariate_summary'), + data = value[,columnNames], + dropTableIfExists = F, createTable = F, tempTable = F, + bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T, + tempEmulationSchema = tempEmulationSchema) + } + + return(invisible(NULL)) +} + + + +#==================== +# Helpers +#==================== + +# gets the column names in camelCase of a table +getColumnNames <- function(conn, resultSchema, targetDialect, tableName, tablePrefix = '', + tempEmulationSchema = 
getOption("sqlRenderTempEmulationSchema") +){ + sql <- "select top 1 * from @my_schema.@string_to_append@table;" + sql <- SqlRender::render(sql, + my_schema = resultSchema, + table = tableName, + string_to_append = tablePrefix) + sql <- SqlRender::translate(sql, targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema) + result <- DatabaseConnector::querySql(connection = conn, sql = sql, snakeCaseToCamelCase = T) + + return(colnames(result)) +} + +# True/False check whether results exist in table +checkResultExists <- function(conn, resultSchema, targetDialect, + snakeCaseToCamelCase, + tableName, + resultIdName = 'performance_id', + resultId, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") +){ + + sql <- "select * from @my_schema.@table where @result_id_name = @result_id;" + sql <- SqlRender::render(sql, + my_schema = resultSchema, + table = tableName, + result_id_name = resultIdName, + result_id = resultId) + sql <- SqlRender::translate(sql, targetDialect = targetDialect, + tempEmulationSchema = tempEmulationSchema) + result <- DatabaseConnector::querySql(connection = conn, sql = sql, snakeCaseToCamelCase = T) + return(nrow(result)>0) +} \ No newline at end of file diff --git a/README.md b/README.md index 5ee60d9f6..2126ec474 100644 --- a/README.md +++ b/README.md @@ -18,11 +18,11 @@ Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. [Design and implementati The figure below illustrates the prediction problem we address. Among a population at risk, we aim to predict which patients at a defined moment in time (t = 0) will experience some outcome during a time-at-risk. Prediction is done using only information about the patients in an observation window prior to that moment in time. -![](vignettes/Figure1.png) +![](vignettes/Figure1.webp) To define a prediction problem we have to define t=0 by a Target Cohort (T), the outcome we like to predict by an outcome cohort (O), and the time-at-risk (TAR). 
Furthermore, we have to make design choices for the model we like to develop, and determine the observational datasets to perform internal and external validation. This conceptual framework works for all type of prediction problems, for example those presented below (T=green, O=red). -![](vignettes/problems.png) +![](vignettes/problems.webp) Features ======== @@ -30,16 +30,18 @@ Features - Allows for multiple prediction design options. - Extracts the necessary data from a database in OMOP Common Data Model format for multiple covariate settings. - Uses a large set of covariates including for example all drugs, diagnoses, procedures, as well as age, comorbidity indexes, and custom covariates. -- Includes a large number of state-of-the-art machine learning algorithms that can be used to develop predictive models, including Regularized logistic regression, Random forest, Gradient boosting machines, Decision tree, Naive Bayes, K-nearest neighbours, Neural network and Deep learning (Convolutional neural networks, Recurrent neural network and Deep nets). +- Allows you to add custom covariates or cohort covariates. +- Includes a large number of state-of-the-art machine learning algorithms that can be used to develop predictive models, including Regularized logistic regression, Random forest, Gradient boosting machines, Decision tree, Naive Bayes, K-nearest neighbours, Neural network, AdaBoost and Support vector machines. - Allows you to add custom algorithms. +- Allows you to add custom feature engineering +- Allows you to add custom under/over sampling (or any other sampling) [note: based on existing research this is not recommended] - Contains functionality to externally validate models. - Includes functions to plot and explore model performance (ROC + Calibration). -- Includes a shiny app to interactively view and explore results. -- Implements existing models. -- Builds ensemble models. -- Builds Deep Learning models. 
+- Build ensemble models using EnsemblePatientLevelPrediction. +- Build Deep Learning models using DeepPatientLevelPrediction. - Generates learning curves. -- Automatically creates a word document containing all the study results. +- Includes a shiny app to interactively view and explore results. +- In the shiny app you can create a html file document (report or protocol) containing all the study results. Screenshots @@ -49,11 +51,11 @@ Screenshots -Calibration plot +Calibration plot -ROC plot +ROC plot @@ -64,20 +66,19 @@ Screenshots Demo of the Shiny Apps can be found here: -- [Single Prediction Viewer Shiny App](http://data.ohdsi.org/PredictionViewer/) - [Multiple Prediction Viewer Shiny App](http://data.ohdsi.org/smokingPhenotypeExplorer/) Technology ========== -PatientLevelPrediction is an R package, with some functions implemented in C++ and python. +PatientLevelPrediction is an R package, with some functions using python through reticulate. System Requirements =================== -Requires R (version 3.3.0 or higher). Installation on Windows requires [RTools](http://cran.r-project.org/bin/windows/Rtools/). Libraries used in PatientLevelPrediction require Java and Python. +Requires R (version 4.0 or higher). Installation on Windows requires [RTools](http://cran.r-project.org/bin/windows/Rtools/). Libraries used in PatientLevelPrediction require Java and Python. The python installation is required for some of the machine learning algorithms. We advise to -install Python 3.7 using Anaconda (https://www.continuum.io/downloads). +install Python 3.8 or higher using Anaconda (https://www.continuum.io/downloads). 
Getting Started =============== @@ -99,8 +100,8 @@ In addition we have created vignettes that describe advanced functionality in mo - [Building Multiple Patient-Level Predictive Models](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingMultiplePredictiveModels.pdf) - [Implementing Existing Patient-Level Predictive Models](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/ImplementingExistingModels.pdf) - [Adding Custom Machine Learning Algorithms](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomAlgorithms.pdf) -- [Building Deep Learning Models](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingDeepLearningModels.pdf) -- [Building Ensemble Models](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingEnsembleModels.pdf) +- [Building Deep Learning Models](https://github.com/OHDSI/DeepPatientLevelPrediction) +- [Building Ensemble Models](https://github.com/OHDSI/EnsemblePatientLevelPrediction) - [Creating Learning Curves](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/CreatingLearningCurves.pdf) Package manual: [PatientLevelPrediction.pdf](https://github.com/OHDSI/PatientLevelPrediction/blob/main/extras/PatientLevelPrediction.pdf) @@ -128,10 +129,9 @@ Development =========== PatientLevelPrediction is being developed in R Studio. -Beta # Acknowledgements - The package is maintained by Jenna Reps and Peter Rijnbeek and has been developed with major contributions from Martijn Schuemie, Patrick Ryan, and Marc Suchard. -- We like to thank the following persons for their contributions to the package: Seng Chan You, Ross Williams, Henrik John, Xiaoyong Pan, James Wiggins. 
+- We like to thank the following persons for their contributions to the package: Seng Chan You, Ross Williams, Henrik John, Xiaoyong Pan, James Wiggins, Egill Fridgeirsson, Alex Rekkas - This project is supported in part through the National Science Foundation grant IIS 1251151. diff --git a/_pkgdown.yml b/_pkgdown.yml index 8e3b50682..763c23504 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,7 +1,11 @@ template: params: bootswatch: cosmo - + +development: + mode: auto + development: docs/dev + home: links: - text: Ask a question @@ -16,7 +20,10 @@ navbar: - reference - articles - tutorial + - benchmarks + - predictors - bestpractice + - clinicalmodels - news right: [hades, github] components: @@ -35,6 +42,15 @@ navbar: bestpractice: text: Best Practices href: articles/BestPractices.html + clinicalmodels: + text: Clinical Models + href: articles/ClinicalModels.html + benchmarks: + text: Benchmarks + href: articles/BenchmarkTasks.html + predictors: + text: Predictors + href: articles/ConstrainedPredictors.html news: text: Changelog href: news/index.html @@ -57,7 +73,8 @@ reference: - createRestrictPlpDataSettings - getPlpData - savePlpData - - loadPlpData + - loadPlpData + - getCohortCovariateData - title: "Settings for designing a prediction models" desc: > Design settings required when developing a model. 
@@ -67,12 +84,29 @@ reference: - createSampleSettings - createFeatureEngineeringSettings - createPreprocessSettings + - title: "Optional design settings" + desc: > + Settings for optional steps that can be used in the PLP pipeline + contents: + - createCohortCovariateSettings + - createRandomForestFeatureSelection + - createUnivariateFeatureSelection + - createSplineSettings + - createStratifiedImputationSettings + - title: "External validation" + contents: + - createValidationDesign + - validateExternal + - createValidationSettings + - recalibratePlp + - recalibratePlpRefit - title: "Execution settings when developing a model" desc: > Execution settings required when developing a model. contents: - createLogSettings - createExecuteSettings + - createDefaultExecuteSettings - title: "Binary Classification Models" desc: > Functions for setting binary classifiers and their hyper-parameter search. @@ -86,6 +120,8 @@ reference: - setNaiveBayes - setRandomForest - setSVM + - setIterativeHardThresholding + - setLightGBM - title: "Survival Models" desc: > Functions for setting survival models and their hyper-parameter search. @@ -101,6 +137,7 @@ reference: - loadPlpModel - savePlpResult - loadPlpResult + - diagnosePlp - title: "Multiple Patient-Level Prediction Models" desc: > Functions for training mutliple patient-level-prediction model in an efficient way. @@ -110,12 +147,34 @@ reference: - validateMultiplePlp - savePlpAnalysesJson - loadPlpAnalysesJson + - diagnoseMultiplePlp + - title: "Individual pipeline functions" + desc: > + Functions for running parts of the PLP workflow + contents: + - createStudyPopulation + - splitData + - preprocessData + - fitPlp + - predictPlp + - evaluatePlp + - covariateSummary - title: "Saving results into database" desc: > Functions for saving the prediction model and performances into a database. 
contents: + - insertResultsToSqlite - createPlpResultTables - - populatePlpResultTables + - addMultipleRunPlpToDatabase + - addRunPlpToDatabase + - createDatabaseSchemaSettings + - createDatabaseList + - addDiagnosePlpToDatabase + - addMultipleDiagnosePlpToDatabase + - extractDatabaseToCsv + - insertCsvToDatabase + - insertModelDesignInDatabase + - migrateDataModel - title: "Shiny Viewers" desc: > Functions for viewing results via a shiny app @@ -140,6 +199,7 @@ reference: - plotPreferencePDF - plotPredictionDistribution - plotVariableScatterplot + - outcomeSurvivalPlot - title: "Learning Curves" desc: > Functions for creating and plotting learning curves @@ -151,3 +211,67 @@ reference: Functions for simulating cohort method data objects. contents: - simulatePlpData + - plpDataSimulationProfile + - title: "Data manipulation functions" + desc: > + Functions for manipulating data + contents: + - toSparseM + - MapIds + - title: "Helper/utility functions" + contents: + - listAppend + - listCartesian + - createTempModelLoc + - configurePython + - setPythonEnvironment + - title: "Evaluation measures" + contents: + - accuracy + - averagePrecision + - brierScore + - calibrationLine + - computeAuc + - f1Score + - falseDiscoveryRate + - falseNegativeRate + - falseOmissionRate + - falsePositiveRate + - ici + - modelBasedConcordance + - negativeLikelihoodRatio + - negativePredictiveValue + - positiveLikelihoodRatio + - positivePredictiveValue + - sensitivity + - specificity + - computeGridPerformance + - diagnosticOddsRatio + - getCalibrationSummary + - getDemographicSummary + - getThresholdSummary + - getThresholdSummary_binary + - getPredictionDistribution + - getPredictionDistribution_binary + - title: "Saving/loading models as json" + desc: > + Functions for saving or loading models as json + contents: + - sklearnFromJson + - sklearnToJson + - title: "Load/save for sharing" + desc: > + Functions for loading/saving objects for sharing + contents: + - savePlpShareable + - 
loadPlpShareable + - loadPrediction + - savePrediction + - title: "Feature importance" + contents: + - pfi + - title: "Other functions" + contents: + - predictCyclops + + diff --git a/docs/404.html b/docs/404.html index c63753b85..7dbf2ed28 100644 --- a/docs/404.html +++ b/docs/404.html @@ -1,66 +1,27 @@ - - - - + + + + - Page not found (404) • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + - - - - - + + + - - -
+
+
-
- +
+ + - - diff --git a/docs/articles/ATLAS_O.png b/docs/articles/ATLAS_O.png deleted file mode 100644 index 3cda2abf7..000000000 Binary files a/docs/articles/ATLAS_O.png and /dev/null differ diff --git a/docs/articles/ATLAS_T.png b/docs/articles/ATLAS_T.png deleted file mode 100644 index 8be57dc9e..000000000 Binary files a/docs/articles/ATLAS_T.png and /dev/null differ diff --git a/docs/articles/AddingCustomAlgorithms.html b/docs/articles/AddingCustomAlgorithms.html deleted file mode 100644 index d9dbffc64..000000000 --- a/docs/articles/AddingCustomAlgorithms.html +++ /dev/null @@ -1,527 +0,0 @@ - - - - - - - -Adding Custom Patient-Level Prediction Algorithms • PatientLevelPrediction - - - - - - - - - - -
-
- - - - -
-
- - - - - -
-

-Introduction

-

This vignette describes how you can add your own custom algorithms in the Observational Health Data Sciencs and Informatics (OHDSI) PatientLevelPrediction package. This allows you to fully leverage the OHDSI PatientLevelPrediction framework for model development and validation. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the BuildingPredictiveModels vignette.

-

We invite you to share your new algorithms with the OHDSI community through our GitHub repository.

-
-
-

-Algorithm Code Structure

-

Each algorithm in the package should be implemented in its own <Name>.R file, e.g. KNN.R, containing a set<Name> function and a fit<Name> function. Furthermore, a corresponding predict function in predict.R is needed (if there isn’t one available that would work, see example at the end of the document). We will now describe each of these functions in more detail below.

-
-

-Set

-

The set<Name> is a function that takes as input the different hyper-parameter values to do a grid search when training. The output of the functions needs to be a list as class modelSettings containing:

-
    -
  • param - all the combinations of the hyper-parameter values input
  • -
  • model - a string specifying what function to call to fit the model
  • -
  • name - a string containing the name of the model.
  • -
-

For example, if you were adding a model called madeUp that has two hyper-parameters then the set function should be:

-
-setMadeUp <- function(a=1, b=2, seed=NULL){
-  # add input checks here...
-  
-  # now create list of all combinations:
-  result <- list(model='fitMadeUp', # this will be called to train the made up model
-                 param= split(expand.grid(a=a, 
-                                          b=b,
-                                          seed=ifelse(is.null(seed),'NULL', seed)),
-                              1:(length(a)*length(b)  )),
-                 name='Made Up Algorithm'
-  )
-  class(result) <- 'modelSettings' 
-  
-  return(result)
-}
-
-
-

-Fit

-

This function should train your custom model for each parameter entry, pick the best parameters and train a final model for that setting.

-

The fit<Model> should have as inputs:

-
    -
  • population - the study popualation the model is being developed on
  • -
  • plpData - the plpData object
  • -
  • param - the hyper-parameters as a list of all combinations
  • -
  • quiet - T or F indicating whether to output progress
  • -
  • outcomeId - the outcome id
  • -
  • cohortId - the target population id
  • -
-

The fit function should return a list of class plpModel with the following objects:

-
    -
  • model - a trained model
  • -
  • modelSettings - a list containing the model and input param
  • -
  • trainCVAuc - a value with the train AUC value
  • -
  • hyperParamSearch - a dataframe with the hyperparameter grid and corresponding AUCs
  • -
  • metaData - the metaData from the plpData object
  • -
  • populationSettings - the settings used to create the population and define the time-at-risk
  • -
  • outcomeId - the outcomeId being predicted
  • -
  • cohortId - the cohortId corresponding to the target cohort
  • -
  • varImp - a dataframe with the covaraites and a measure of importance
  • -
  • trainingTime - how long it took to develop/evaluate the model
  • -
  • covariateMap - if the plpData are converted to a matrix for model compatibility this tells us what covariate each row in the matrix correpsonds to and is need when implementing the model on new data
  • -
-

The plpModel returned by fit also has a type attribute, this points to the predict function, for example attr(result, 'type') <- 'madeup' means when the model is applied to new data, the ‘predict.madeup’ function in Predict.R is called. if this doesnt exist, then the model will fail. Another attribute is the predictionType attr(result, 'predictionType') <- 'binary' this is currently not needed but may be important in the future when we expand to regression or multiclass classification.

-

For example:

-
-fitMadeUp <- function(population, plpData, param, quiet=F,
-                        outcomeId, cohortId, ...){
-  
-  # **************** code to train the model here
-  # trainedModel <- this code should apply each hyper-parameter using the cross validation
-  #                 then pick out the best hyper-parameter setting
-  #                 and finally fit a model on the whole train data using the 
-  #                 optimal hyper-parameter settings
-  # ****************
-  
-  # construct the standard output for a model:
-  result <- list(model = trainedModel,
-                 modelSettings = list(model='made_up', modelParameters=param),
-                 trainCVAuc = NULL,
-                 hyperParamSearch = hyperSummary,
-                 metaData = plpData$metaData,
-                 populationSettings = attr(population, 'metaData'),
-                 outcomeId=outcomeId,# can use populationSettings$outcomeId?
-                 cohortId=cohortId,
-                 varImp = NULL,
-                 trainingTime=comp,
-                 covariateMap=result$map
-  )
-  class(result) <- 'plpModel'
-  attr(result, 'type') <- 'madeup'
-  attr(result, 'predictionType') <- 'binary'
-  return(result)
-    
-}
-

You could make the fitMadeUp function cleaner by adding helper function in the MadeUp.R file that are called by the fit function. As the end of the fit function specified attr(result, 'type') <- 'madeup' we also need to make sure there is a predict.madeup function in Predict.R:

-
-
-

-Predict

-

The prediction function takes as input the plpModel returned by fit, a population and corresponding plpData. It returns a data.frame with the columns:

-
    -
  • rowId - the id for each person in the population
  • -
  • value - the predicted risk from the plpModel
  • -
-

If the population contains the columns outcomeCount and indexes, then these are also in the output.

-

For example:

-
-predict.madeup <- function(plpModel,population, plpData, ...){ 
-
-  # ************* code to do prediction for each rowId in population
-  # prediction <- code to do prediction here returning columns: rowId 
-  #               and value (predicted risk)
-  #**************
-  
-  prediction <- merge(population, prediction, by='rowId')
-  prediction <- prediction[,colnames(prediction)%in%c('rowId','outcomeCount',
-                                                      'indexes', 'value')] 
-  attr(prediction, "metaData") <- list(predictionType = "binary") 
-  return(prediction)
-  
-}
-
-
-
-

-Algorithm Example

-

Below a fully functional algorithm example is given, however we highly recommend you to have a look at the available algorithms in the package.

-
-

-Set

-
setMadeUp <- function(a=1, b=2, seed=NULL){
-  # check a is valid positive value
-  if(missing(a)){
-    stop('a must be input')
-  }
-  if(!class(a)%in%c('numeric','integer'){
-    stop('a must be numeric')
-  }
-  if(a < 0){
-    stop('a must be positive')
-  }
-  # check b is numeric
-  if(missing(b)){
-    stop('b must be input')
-  }
-  if(!class(b)%in%c('numeric','integer'){
-    stop('b must be numeric')
-  }
-  
-  # now create list of all combinations:
-  result <- list(model='fitMadeUp', 
-                 param= split(expand.grid(a=a, 
-                                          b=b,
-                                          seed=ifelse(is.null(seed),'NULL', seed)),
-                              1:(length(a)*length(b)  )),
-                 name='Made Up Algorithm'
-  )
-  class(result) <- 'modelSettings' 
-  
-  return(result)
-    
-  
-}
-
-
-

-Fit

-
-fitMadeUp <- function(population, plpData, param, quiet=F,
-                        outcomeId, cohortId, ...){
-    if(!quiet)
-    writeLines('Training Made Up model')
-  
-  if(param[[1]]$seed!='NULL')
-    set.seed(param[[1]]$seed)
-  
-    # check plpData is coo format:
-  if(!'ffdf'%in%class(plpData$covariates) )
-    stop('This algorithm requires plpData in coo format')
-  
-  metaData <- attr(population, 'metaData')
-  if(!is.null(population$indexes))
-    population <- population[population$indexes>0,]
-  attr(population, 'metaData') <- metaData
-  
-  # convert data into sparse R Matrix:
-  result <- toSparseM(plpData,population,map=NULL)
-  data <- result$data
-  
-  data <- data[population$rowId,]
-  
-  # set test/train sets (for printing performance as it trains)
-  if(!quiet)
-    writeLines(paste0('Training made up model on train set containing ', nrow(population), 
-                      ' people with ',sum(population$outcomeCount>0), ' outcomes'))
-  start <- Sys.time()
-  
-  #============= STEP 1 ======================================
-  # pick the best hyper-params and then do final training on all data...
-  writeLines('train')
-  datas <- list(population=population, data=data)
-  param.sel <- lapply(param, function(x) do.call(made_up_model, c(x,datas)  ))
-  hyperSummary <- do.call(rbind, lapply(param.sel, function(x) x$hyperSum))
-  hyperSummary <- as.data.frame(hyperSummary)
-  hyperSummary$auc <- unlist(lapply(param.sel, function(x) x$auc)) 
-  param.sel <- unlist(lapply(param.sel, function(x) x$auc))
-  param <- param[[which.max(param.sel)]]
-  
-  # set this so you do a final model train 
-  param$final=T
-  
-  writeLines('final train')
-  trainedModel <- do.call(made_up_model, c(param,datas)  )$model
-  
-  comp <- Sys.time() - start
-  if(!quiet)
-    writeLines(paste0('Model Made Up trained - took:',  format(comp, digits=3)))
-  
-  # construct the standard output for a model:
-  result <- list(model = trainedModel,
-                 modelSettings = list(model='made_up', modelParameters=param),
-                 trainCVAuc = NULL,
-                 hyperParamSearch = hyperSummary,
-                 metaData = plpData$metaData,
-                 populationSettings = attr(population, 'metaData'),
-                 outcomeId=outcomeId,# can use populationSettings$outcomeId?
-                 cohortId=cohortId,
-                 varImp = NULL,
-                 trainingTime=comp,
-                 covariateMap=result$map
-  )
-  class(result) <- 'plpModel'
-  attr(result, 'type') <- 'madeup'
-  attr(result, 'predictionType') <- 'binary'
-  return(result)
-    
-}
-
-
-

-Helpers

-

In the fit model a helper function made_up_model is called, this is the function that trains a model given the data and population (where the popualtion contains a column outcomeCount corresponding to the outcome). Both the data and population are ordered the same way:

-
-made_up_model <- function(data, population,
-                       a=1,b=1, final=F, ...){
-  
-  writeLines(paste('Training Made Up model with ',length(unique(population$indexes)),
-                   ' fold CV'))
-  if(!is.null(population$indexes) && final==F){
-    index_vect <- unique(population$indexes)
-    perform <- c()
-    
-    # create prediction matrix to store all predictions
-    predictionMat <- population
-    predictionMat$value <- 0
-    attr(predictionMat, "metaData") <- list(predictionType = "binary")
-    
-    for(index in 1:length(index_vect )){
-      writeLines(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),
-                       'train rows'))
-      model <- madeup::model(x = data[population$indexes!=index,],
-                             y= population$outcomeCount[population$indexes!=index],
-                                  a=a, b=b)
-      
-      pred <- stats::predict(model, data[population$indexes==index,])
-      prediction <- population[population$indexes==index,]
-      prediction$value <- pred
-      attr(prediction, "metaData") <- list(predictionType = "binary")
-      aucVal <- computeAuc(prediction)
-      perform <- c(perform,aucVal)
-      
-      # add the fold predictions and compute AUC after loop
-      predictionMat$value[population$indexes==index] <- pred
-      
-     }
-    ##auc <- mean(perform) # want overal rather than mean
-    auc <- computeAuc(predictionMat)
-    
-    foldPerm <- perform
-  } else {
-    model <- madeup::model(x= data, 
-                                y= population$outcomeCount,
-                                a=a,b=b)
-    
-    pred <- stats::predict(model, data)
-    prediction <- population
-    prediction$value <- pred
-    attr(prediction, "metaData") <- list(predictionType = "binary") 
-    auc <- computeAuc(prediction)
-    foldPerm <- auc
-  }
-  
-  result <- list(model=model,
-                 auc=auc,
-                 hyperSum = unlist(list(a = a, b = b, fold_auc=foldPerm))
-  )
-  return(result)
-}
-
-
-

-Predict

-

The final step is to create a predict function for the model. This gets added to the predict.R file. In the example above the type attr(result, 'type') <- 'madeup' was madeup, so a predict.madeup function is required to be added into the predict.R. The predict function needs to take as input the plpModel returned by the fit function, the population to apply the model on and the plpData specifying the covariates of the population.

-
-predict.madeup <- function(plpModel,population, plpData, ...){ 
-  result <- toSparseM(plpData, population, map=plpModel$covariateMap)
-  data <- result$data[population$rowId,]
-  prediction <- data.frame(rowId=population$rowId, 
-                           value=stats::predict(plpModel$model, data)
-                           )
-  
-  prediction <- merge(population, prediction, by='rowId')
-  prediction <- prediction[,colnames(prediction)%in%
-                           c('rowId','outcomeCount','indexes', 'value')] # need to fix no index issue
-  attr(prediction, "metaData") <- list(predictionType = "binary") 
-  return(prediction)
-  
-}
-

As the madeup model uses the standard R prediction, it has the same prediction function as xgboost, so we could have not added a new prediction function and instead made the type of the result returned by fitMadeUpModel to attr(result, 'type') <- 'xgboost'.

-
-
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

-
-
- - - -
- - - -
- -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - diff --git a/docs/articles/AddingCustomAlgorithms_files/header-attrs-2.7/header-attrs.js b/docs/articles/AddingCustomAlgorithms_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/AddingCustomAlgorithms_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/AddingCustomFeatureEngineering.html b/docs/articles/AddingCustomFeatureEngineering.html index ba6f86c94..b7261a0cc 100644 --- a/docs/articles/AddingCustomFeatureEngineering.html +++ b/docs/articles/AddingCustomFeatureEngineering.html @@ -19,6 +19,8 @@ + +
+
@@ -146,138 +161,206 @@

2022-03-09

-
-

-Introduction

-

This vignette describes how you can add your own custom function for feature engineering in the Observational Health Data Sciencs and Informatics (OHDSI) PatientLevelPrediction package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the BuildingPredictiveModels vignette.

-

We invite you to share your new feature engineering functions with the OHDSI community through our GitHub repository.

+
+

Introduction +

+

This vignette describes how you can add your own custom function for +feature engineering in the Observational Health Data Sciences and +Informatics (OHDSI) PatientLevelPrediction +package. This vignette assumes you have read and are comfortable with +building single patient level prediction models as described in the BuildingPredictiveModels +vignette.

+

We invite you to share your new feature engineering functions +with the OHDSI community through our GitHub +repository.

-
-

-Feature Engineering Function Code Structure

-

To make a custom feature engineering function that can be used within PatientLevelPrediction you need to write two different functions. The ‘create’ function and the ‘implement’ function.

-

The ‘create’ function, e.g., create<FeatureEngineeringFunctionName>, takes the parameters of the feature engineering ‘implement’ function as input, checks these are valid and outputs these as a list of class ‘featureEngineeringSettings’ with the ‘fun’ attribute specifying the ‘implement’ function to call.

-

The ‘implement’ function, e.g., implement<FeatureEngineeringFunctionName>, must take as input: * trainData - a list containing: - covariateData: the plpData$covariateData restricted to the training patients - labels: a data frame that contain rowId (patient identifier) and outcomeCount (the class labels) - folds: a data.frame that contains rowId (patient identifier) and index (the cross validation fold) * featureEngineeringSettings - the output of your create<FeatureEngineeringFunctionName>

-

The ‘implement’ function can then do any manipulation of the trainData (adding new features or removing features) but must output a trainData object containing the new covariateData, labels and folds for the training data patients.

+
+

Feature Engineering Function Code Structure +

+

To make a custom feature engineering function that can be used within +PatientLevelPrediction you need to write two different functions. The +‘create’ function and the ‘implement’ function.

+

The ‘create’ function, e.g., +create<FeatureEngineeringFunctionName>, takes the parameters of +the feature engineering ‘implement’ function as input, checks these are +valid and outputs these as a list of class ‘featureEngineeringSettings’ +with the ‘fun’ attribute specifying the ‘implement’ function to +call.

+

The ‘implement’ function, e.g., +implement<FeatureEngineeringFunctionName>, must take as input:

+
    +
  • +

    trainData - a list containing:

    +
      +
    • covariateData: the +plpData$covariateDatarestricted to the training +patients

    • +
    • labels: a data frame that contain +rowId(patient identifier) and outcomeCount +(the class labels)

    • +
    • folds: a data.frame that contains rowId +(patient identifier) and index (the cross validation +fold)

    • +
    +
  • +
  • featureEngineeringSettings - the output of your +create<FeatureEngineeringFunctionName>

  • +
+

The ‘implement’ function can then do any manipulation of the +trainData (adding new features or removing features) but +must output a trainData object containing the new +covariateData, labels and folds +for the training data patients.

-
-

-Example

-

Let’s consider the situation where we wish to create an age spline feature. To make this custom feature engineering function we need to write the ‘create’ and ‘implement’ R functions.

-
-

-Create function

-

Our age spline feature function will create a new feature using the plpData$cohorts ageYear column. We will implement a restricted cubic spline that requires specifying the number of knots. . Therefore, the inputs for this are: * knots an integer/double specifying the number of knots

+
+

Example +

+

Let’s consider the situation where we wish to create an age spline +feature. To make this custom feature engineering function we need to +write the ‘create’ and ‘implement’ R functions.

+
+

Create function +

+

Our age spline feature function will create a new feature using the +plpData$cohorts$ageYear column. We will implement a +restricted cubic spline that requires specifying the number of knots. +Therefore, the inputs for this are: knots - an +integer/double specifying the number of knots.

-createAgeSpine <- function(
-                     knots = 5
-                     ){
-  
-  # add input checks
-  checkIsClass(knots, c('numeric','integer'))
-  checkHigher(knots,0)
-  
-  # create list of inputs to implement function
-  featureEngineeringSettings <- list(
-    knots = knots
-    )
-  
-  # specify the function that will implement the sampling
-  attr(featureEngineeringSettings, "fun") <- "implementAgeSpine"
-
-  # make sure the object returned is of class "sampleSettings"
-  class(featureEngineeringSettings) <- "featureEngineeringSettings"
-  return(featureEngineeringSettings)
-  
-}
-

We now need to create the ‘implement’ function implementAgeSpine()

+createAgeSpline <- function( + knots = 5 + ){ + + # create list of inputs to implement function + featureEngineeringSettings <- list( + knots = knots + ) + + # specify the function that will implement the sampling + attr(featureEngineeringSettings, "fun") <- "implementAgeSplines" + + # make sure the object returned is of class "sampleSettings" + class(featureEngineeringSettings) <- "featureEngineeringSettings" + return(featureEngineeringSettings) + +}
+

We now need to create the ‘implement’ function +implementAgeSplines()

-
-

-Implement function

-

All ‘implement’ functions must take as input the trainData and the featureEngineeringSettings (this is the output of the ‘create’ function). They must return a trainData object containing the new covariateData, labels and folds.

-

In our example, the createAgeSpine() will return a list with ‘knots’. The featureEngineeringSettings therefore contains this.

+
+

Implement function +

+

All ‘implement’ functions must take as input the +trainData and the featureEngineeringSettings +(this is the output of the ‘create’ function). They must return a +trainData object containing the new +covariateData, labels and +folds.

+

In our example, the createAgeSpline() will return a list +with ‘knots’. The featureEngineeringSettings therefore +contains this.

-implementAgeSpine <- function(trainData, featureEngineeringSettings){
-
-  # currently not used
-  knots <- featureEngineeringSettings$knots
-  
-  
-  # age in in trainData$labels as ageYear
-  ageData <- trainData$labels
-  
-  # now implement the code to do your desired feature engineering
-  
-  data <- Matrix::sparseMatrix(
-    i = 1:length(ageData$rowId),
-    j = rep(1, length(ageData$rowId)),
-    x = ageData$ageYear,
-    dims=c(length(ageData$rowId),1)
-  )
-  
-  data <- as.matrix(data)
-  x <- data[,1]
-  y <- ageData$outcomeCount
-  
-mRCS <- rms::ols(
-  y~rms::rcs(x, 
-             stats::quantile(
-               x, 
-               c(0, .05, .275, .5, .775, .95, 1),
-               include.lowest = TRUE
-               )
-             )
-  )
-
-newData <- data.frame(
-  rowId = ageData$rowId,
-  covariateId = 2002,
-  covariateValue = mRCS$fitted.values
-  )
-
-# add new data
-Andromeda::appendToTable(tbl = trainData$covariateData$covariates, 
-                         data = newData)
-  
-  # return the updated trainData
-  return(trainData)
-}
+implementAgeSplines <- function(trainData, featureEngineeringSettings, model=NULL) { + # if there is a model, it means this function is called through applyFeatureengineering, meaning it # should apply the model fitten on training data to the test data + if (is.null(model)) { + knots <- featureEngineeringSettings$knots + ageData <- trainData$labels + y <- ageData$outcomeCount + X <- ageData$ageYear + model <- mgcv::gam( + y ~ s(X, bs='cr', k=knots, m=2) + ) + newData <- data.frame( + rowId = ageData$rowId, + covariateId = 2002, + covariateValue = model$fitted.values + ) + } + else { + ageData <- trainData$labels + X <- trainData$labels$ageYear + y <- ageData$outcomeCount + newData <- data.frame(y=y, X=X) + yHat <- predict(model, newData) + newData <- data.frame( + rowId = trainData$labels$rowId, + covariateId = 2002, + covariateValue = yHat + ) + } + + # remove existing age if in covariates + trainData$covariateData$covariates <- trainData$covariateData$covariates |> + dplyr::filter(!covariateId %in% c(1002)) + + # update covRef + Andromeda::appendToTable(trainData$covariateData$covariateRef, + data.frame(covariateId=2002, + covariateName='Cubic restricted age splines', + analysisId=2, + conceptId=2002)) + + # update covariates + Andromeda::appendToTable(trainData$covariateData$covariates, newData) + + featureEngineering <- list( + funct = 'implementAgeSplines', + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + model = model + ) + ) + + attr(trainData$covariateData, 'metaData')$featureEngineering = listAppend( + attr(trainData$covariateData, 'metaData')$featureEngineering, + featureEngineering + ) + + return(trainData) +}
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

+
+

Acknowledgments +

+

Considerable work has been dedicated to provide the +PatientLevelPrediction package.

-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

+citation("PatientLevelPrediction")
+
## 
+## To cite PatientLevelPrediction in publications use:
+## 
+##   Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
+##   and implementation of a standardized framework to generate and
+##   evaluate patient-level prediction models using observational
+##   healthcare data." _Journal of the American Medical Informatics
+##   Association_, *25*(8), 969-975.
+##   <https://doi.org/10.1093/jamia/ocy032>.
+## 
+## A BibTeX entry for LaTeX users is
+## 
+##   @Article{,
+##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
+##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
+##     journal = {Journal of the American Medical Informatics Association},
+##     volume = {25},
+##     number = {8},
+##     pages = {969-975},
+##     year = {2018},
+##     url = {https://doi.org/10.1093/jamia/ocy032},
+##   }
+

Please reference this paper if you use the PLP Package in +your work:

+

Reps JM, Schuemie +MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a +standardized framework to generate and evaluate patient-level prediction +models using observational healthcare data. J Am Med Inform Assoc. +2018;25(8):969-975.

+

This work is supported in part through the National Science +Foundation grant IIS 1251151.

@@ -292,11 +375,13 @@

-

Site built with pkgdown 1.6.1.

+

+

Site built with pkgdown 2.0.7.

@@ -305,5 +390,7 @@

+ + diff --git a/docs/articles/AddingCustomFeatureEngineering_files/header-attrs-2.11/header-attrs.js b/docs/articles/AddingCustomFeatureEngineering_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/AddingCustomFeatureEngineering_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/AddingCustomModels.html b/docs/articles/AddingCustomModels.html index f588c269e..3cafc9a9e 100644 --- a/docs/articles/AddingCustomModels.html +++ b/docs/articles/AddingCustomModels.html @@ -19,6 +19,8 @@ + +
+
@@ -146,472 +162,589 @@

2022-03-09

-
-

-Introduction

-

This vignette describes how you can add your own custom algorithms in the Observational Health Data Sciencs and Informatics (OHDSI) PatientLevelPrediction package. This allows you to fully leverage the OHDSI PatientLevelPrediction framework for model development and validation. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the BuildingPredictiveModels vignette.

-

We invite you to share your new algorithms with the OHDSI community through our GitHub repository.

+
+

Introduction +

+

This vignette describes how you can add your own custom algorithms in +the Observational Health Data Sciencs and Informatics (OHDSI) PatientLevelPrediction +package. This allows you to fully leverage the OHDSI +PatientLevelPrediction framework for model development and validation. +This vignette assumes you have read and are comfortable with building +single patient level prediction models as described in the BuildingPredictiveModels +vignette.

+

We invite you to share your new algorithms with the OHDSI +community through our GitHub +repository.

-
-

-Algorithm Code Structure

-

Each algorithm in the package should be implemented in its own <Name>.R file, e.g. KNN.R, containing a set<Name> function, a fit<Name> function and a predict<Name> function. Occasionally the fit and prediction functions may be reused (if using an R classifier see RClassifier.R or if using a scikit-learn classifier see SklearnClassifier.R). We will now describe each of these functions in more detail below.

-
-

-Set

-

The set<Name> is a function that takes as input the different hyper-parameter values to do a grid search when training. The output of the functions needs to be a list as class modelSettings containing:

+
+

Algorithm Code Structure +

+

Each algorithm in the package should be implemented in its own +<Name>.R file, e.g. KNN.R, containing a set<Name> function, +a fit<Name> function and a predict<Name> function. +Occasionally the fit and prediction functions may be reused (if using an +R classifier see RClassifier.R or if using a scikit-learn classifier see +SklearnClassifier.R). We will now describe each of these functions in +more detail below.

+
+

Set +

+

The set<Name> is a function that takes as input the different +hyper-parameter values to do a grid search when training. The output of +the functions needs to be a list as class modelSettings +containing:

    -
  • param - all the combinations of the hyper-parameter values input
  • -
  • fitFunction - a string specifying what function to call to fit the model
  • +
  • param - all the combinations of the hyper-parameter values +input
  • +
  • fitFunction - a string specifying what function to call to fit the +model
-

The param object can have a setttings attribute containing any extra settings. For example to specify the model name and the seed used for reproducibility:

+

The param object can have a setttings attribute containing any extra +settings. For example to specify the model name and the seed used for +reproducibility:

-attr(param, 'settings') <- list(
-  seed = 12,
-  modelName = "Special classifier"
-  )
-

For example, if you were adding a model called madeUp that has two hyper-parameters then the set function should be:

+attr(param, 'settings') <- list( + seed = 12, + modelName = "Special classifier" + )
+

For example, if you were adding a model called madeUp that has two +hyper-parameters then the set function should be:

-setMadeUp <- function(a=c(1,4,10), b=2, seed=NULL){
-  # add input checks here...
-  
-  param <- split(
-    expand.grid(
-      a=a, 
-      b=b
-    ),
-    1:(length(a)*length(b))
-    )
-  
-  attr(param, 'settings') <- list(
-    modelName = "Made Up",
-    requiresDenseMatrix = TRUE,
-    seed = seed
-    )
-  
-  # now create list of all combinations:
-  result <- list(
-    fitFunction = 'fitMadeUp', # this will be called to train the made up model
-    param = param
-  )
-  class(result) <- 'modelSettings' 
-  
-  return(result)
-}
+setMadeUp <- function(a=c(1,4,10), b=2, seed=NULL){ + # add input checks here... + + param <- split( + expand.grid( + a=a, + b=b + ), + 1:(length(a)*length(b)) + ) + + attr(param, 'settings') <- list( + modelName = "Made Up", + requiresDenseMatrix = TRUE, + seed = seed + ) + + # now create list of all combinations: + result <- list( + fitFunction = 'fitMadeUp', # this will be called to train the made up model + param = param + ) + class(result) <- 'modelSettings' + + return(result) +}
-
-

-Fit

-

This function should train your custom model for each parameter entry, pick the best parameters and train a final model for that setting.

+
+

Fit +

+

This function should train your custom model for each parameter +entry, pick the best parameters and train a final model for that +setting.

The fit<Model> should have as inputs:

    -
  • trainData - a list containing the covariateData, labels and folds for the training population
  • +
  • trainData - a list containing the covariateData, labels and folds +for the training population
  • param - the hyper-parameters as a list of all combinations
  • search - the type of hyper-parameter search
  • analysisId - an identifier for the analysis
-

The fit function should return a list of class plpModel with the following objects:

+

The fit function should return a list of class plpModel +with the following objects:

+
    +
  • model - a trained model (or location of the model if it is not an R +object)
  • +
  • prediction - a data.frame object with the trainData$labels plus an +extra column with the name ‘value’ corresponding to the predicted risk +of having the outcome during the time-at-risk.
  • +
  • preprocessing - the settings required to preprocess the data when +applying the model
      -
    • model - a trained model (or location of the model if it is not an R object)
    • -
    • prediction - a data.frame object with the trainData$labels plus an extra column with the name ‘value’ corresponding to the predicted risk of having the outcome during the time-at-risk.
    • -
    • settings - a list containing: +
    • featureEngineering - the feature engineering settings e.g., +attr(trainData\(covariateData, +"metaData")\)featureEngineering,
    • +
    • tidyCovariates - the preprocessing settings e.g., +attr(trainData\(covariateData, +"metaData")\)tidyCovariateDataSettings,
    • +
    • requireDenseMatrix - does the model require a dense matrix? e.g., +attr(param, ‘settings’)$requiresDenseMatrix,
    • +
    +
  • +
  • modelDesign - a list containing:
      -
    • plpDataSettings - the plpData settings e.g., attr(trainData, “metaData”)$plpDataSettings
    • -
    • covariateSettings - the covariate settings e.g., attr(trainData, “metaData”)$covariateSettings
    • -
    • populationSettings - the population settings e.g., attr(trainData, “metaData”)$populationSettings,
    • -
    • featureEngineering - the feature engineering settings e.g., attr(trainData\(covariateData, "metaData")\)featureEngineering,
    • -
    • tidyCovariates - the preprocessing settings e.g., attr(trainData\(covariateData, "metaData")\)tidyCovariateDataSettings,
    • -
    • requireDenseMatrix - does the model require a dense matrix? e.g., attr(param, ‘settings’)$requiresDenseMatrix,
    • -
    • modelSettings = a list containing: model (model name), param (the hyper-parameter search list), finalModelParameters (the final model hyper-parameters), extraSettings (any extra settings)
    • -
    • splitSettings - the split settings e.g., attr(trainData, “metaData”)$splitSettings,
    • -
    • sampleSettings - the sample settings e.g., attr(trainData, “metaData”)$sampleSettings
    • +
    • targetId - the id of the target cohort
    • +
    • outcomeId - the id of the outcome cohort
    • +
    • plpDataSettings - the plpData settings e.g., attr(trainData, +“metaData”)$plpDataSettings
    • +
    • covariateSettings - the covariate settings e.g., attr(trainData, +“metaData”)$covariateSettings
    • +
    • populationSettings - the population settings e.g., attr(trainData, +“metaData”)$populationSettings,
    • +
    • featureEngineeringSettings - the feature engineering settings e.g., +attr(trainData\(covariateData, +"metaData")\)featureEngineeringSettings,
    • +
    • preprocessSettings - the preprocessing settings e.g., +attr(trainData\(covariateData, +"metaData")\)preprocessSettings,
    • +
    • modelSettings = a list containing: model (model name), param (the +hyper-parameter search list), finalModelParameters (the final model +hyper-parameters), extraSettings (any extra settings)
    • +
    • splitSettings - the split settings e.g., attr(trainData, +“metaData”)$splitSettings,
    • +
    • sampleSettings - the sample settings e.g., attr(trainData, +“metaData”)$sampleSettings
  • trainDetails - a list containing:
    • analysisId - the identifier for the analysis
    • -
    • cdmDatabaseSchema - the database used to develop the model
    • -
    • outcomeId - the outcome id
    • -
    • cohortId - the target population id
    • +
    • developmentDatabase - the database used to develop the model
    • attrition - the attrition
    • trainingTime - how long it took to train the model
    • trainingDate - date of model training
    • -
    • hyperParamSearch - the hyper-parameter search used to train the model
    • +
    • hyperParamSearch - the hyper-parameter search used to train the +model
    • +
    • any other objects specific to training
  • -
  • covariateImportance - a data.frame containing the columns ‘covariateId’, ‘covariateValue’ (the variable importance) and ‘columnId’ (the column number that the variable need to be mapped to when implementing the model)
  • +
  • covariateImportance - a data.frame containing the columns +‘covariateId’, ‘covariateValue’ (the variable importance) and ‘columnId’ +(the column number that the variable need to be mapped to when +implementing the model)

In additon the plpModel requires two attributes:

    -
  • predictionFunction - the name of the function used to make predictions
  • +
  • predictionFunction - the name of the function used to make +predictions
  • modelType - whether the model is ‘binary’ or ‘survival’
-

For example attr(result, 'predictionFunction') <- 'madeupPrediction' means when the model is applied to new data, the ‘madeupPrediction’ function is called to make predictions. If this doesnt exist, then the model will fail. The other attribute is the modelType attr(result, 'modelType') <- 'binary' this is needed when evaluating the model to ensure the correct evaluation is applied. Currently the evaluation supports ‘binary’ and ‘survival’ modelType.

-

Note: If a new modelType is desired, then the evalaution code within PatientLevelPrediction must be updated to specify how the new type is evaluated. This requires making edits to PatientLevelPrediction and then making a pull request to the PatientLevelPrediction github. The evaluation cannot have one off customization because the evaluation must be standardized to enable comparison across similar models.

+

For example +attr(result, 'predictionFunction') <- 'madeupPrediction' +means when the model is applied to new data, the ‘madeupPrediction’ +function is called to make predictions. If this doesnt exist, then the +model will fail. The other attribute is the modelType +attr(result, 'modelType') <- 'binary' this is needed +when evaluating the model to ensure the correct evaluation is applied. +Currently the evaluation supports ‘binary’ and ‘survival’ modelType.

+

Note: If a new modelType is desired, then the evalaution code within +PatientLevelPrediction must be updated to specify how the new type is +evaluated. This requires making edits to PatientLevelPrediction and then +making a pull request to the PatientLevelPrediction github. The +evaluation cannot have one off customization because the evaluation must +be standardized to enable comparison across similar models.

A full example of a custom ‘binary’ classifier fit function is:

-fitMadeUp <- function(trainData, param, search, analysisId){
-  
-  # **************** code to train the model here
-  # trainedModel <- this code should apply each hyper-parameter combination   
-  # (param[[i]]) using the specified search (e.g., cross validation)
-  #                 then pick out the best hyper-parameter setting
-  #                 and finally fit a model on the whole train data using the 
-  #                 optimal hyper-parameter settings
-  # ****************
-  
-  # **************** code to apply the model to trainData
-  # prediction <- code to apply trainedModel to trainData
-  # ****************
-  
-  # **************** code to get variable importance (if possible)
-  # varImp <- code to get importance of each variable in trainedModel
-  # ****************
-  
-  
-  # construct the standard output for a model:
-  result <- list(model = trainedModel,
-                 prediction = prediction, # the train and maybe the cross validation predictions for the trainData
-    settings = list(
-      plpDataSettings = attr(trainData, "metaData")$plpDataSettings,
-      covariateSettings = attr(trainData, "metaData")$covariateSettings,
-      populationSettings = attr(trainData, "metaData")$populationSettings,
-      featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering,
-      tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, 
-      requireDenseMatrix = attr(param, 'settings')$requiresDenseMatrix,
-      modelSettings = list(
-        model = attr(param, 'settings')$modelName, # the model name
-        param = param,
-        finalModelParameters = param[[bestInd]], # best hyper-parameters
-        extraSettings = attr(param, 'settings')
-      ),
-      splitSettings = attr(trainData, "metaData")$splitSettings,
-      sampleSettings = attr(trainData, "metaData")$sampleSettings
-    ),
-    
-    trainDetails = list(
-      analysisId = analysisId,
-      cdmDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema,
-      outcomeId = attr(trainData, "metaData")$outcomeId,
-      cohortId = attr(trainData, "metaData")$cohortId,
-      attrition = attr(trainData, "metaData")$attrition, 
-      trainingTime = timeToTrain, # how long it took to train the model
-      trainingDate = Sys.Date(),
-      hyperParamSearch = hyperSummary # the hyper-parameters and performance data.frame
-    ),
-    covariateImportance = merge(trainData$covariateData$covariateRef, varImp, by='covariateId') # add variable importance to covariateRef if possible
-  )
-  class(result) <- 'plpModel'
-  attr(result, 'predictionFunction') <- 'madeupPrediction'
-  attr(result, 'modelType') <- 'binary'
-  return(result)
-    
-}
-

You could make the fitMadeUp function cleaner by adding helper function in the MadeUp.R file that are called by the fit function (for example a function to run cross validation). It is important to ensure there is a valid prediction function (the one specified by attr(result, 'predictionFunction') <- 'madeupPrediction' is madeupPrediction()) as specified below.

+fitMadeUp <- function(trainData, modelSettings, search, analysisId){ + + param <- modelSettings$param + + # **************** code to train the model here + # trainedModel <- this code should apply each hyper-parameter combination + # (param[[i]]) using the specified search (e.g., cross validation) + # then pick out the best hyper-parameter setting + # and finally fit a model on the whole train data using the + # optimal hyper-parameter settings + # **************** + + # **************** code to apply the model to trainData + # prediction <- code to apply trainedModel to trainData + # **************** + + # **************** code to get variable importance (if possible) + # varImp <- code to get importance of each variable in trainedModel + # **************** + + + # construct the standard output for a model: + result <- list(model = trainedModel, + prediction = prediction, # the train and maybe the cross validation predictions for the trainData + preprocessing = list( + featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, + tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, + requireDenseMatrix = attr(param, 'settings')$requiresDenseMatrix, + + ), + modelDesign = list( + outcomeId = attr(trainData, "metaData")$outcomeId, + targetId = attr(trainData, "metaData")$targetId, + plpDataSettings = attr(trainData, "metaData")$plpDataSettings, + covariateSettings = attr(trainData, "metaData")$covariateSettings, + populationSettings = attr(trainData, "metaData")$populationSettings, + featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings, + prerocessSettings = attr(trainData$covariateData, "metaData")$prerocessSettings, + modelSettings = list( + model = attr(param, 'settings')$modelName, # the model name + param = param, + finalModelParameters = param[[bestInd]], # best hyper-parameters + extraSettings = attr(param, 'settings') + ), + splitSettings = attr(trainData, 
"metaData")$splitSettings, + sampleSettings = attr(trainData, "metaData")$sampleSettings + ), + + trainDetails = list( + analysisId = analysisId, + developmentDatabase = attr(trainData, "metaData")$cdmDatabaseSchema, + attrition = attr(trainData, "metaData")$attrition, + trainingTime = timeToTrain, # how long it took to train the model + trainingDate = Sys.Date(), + hyperParamSearch = hyperSummary # the hyper-parameters and performance data.frame + ), + covariateImportance = merge(trainData$covariateData$covariateRef, varImp, by='covariateId') # add variable importance to covariateRef if possible + ) + class(result) <- 'plpModel' + attr(result, 'predictionFunction') <- 'madeupPrediction' + attr(result, 'modelType') <- 'binary' + return(result) + +}
+

You could make the fitMadeUp function cleaner by adding helper +function in the MadeUp.R file that are called by the fit function (for +example a function to run cross validation). It is important to ensure +there is a valid prediction function (the one specified by +attr(result, 'predictionFunction') <- 'madeupPrediction' +is madeupPrediction()) as specified below.

-
-

-Predict

-

The prediction function takes as input the plpModel returned by fit, new data and a corresponding cohort. It returns a data.frame with the same columns as cohort but with an additional column:

+
+

Predict +

+

The prediction function takes as input the plpModel returned by fit, +new data and a corresponding cohort. It returns a data.frame with the +same columns as cohort but with an additional column:

    -
  • value - the predicted risk from the plpModel for each patient in the cohort
  • +
  • value - the predicted risk from the plpModel for each patient in the +cohort

For example:

-madeupPrediction <- function(plpModel, data, cohort){ 
-
-  # ************* code to do prediction for each rowId in cohort
-  # predictionValues <- code to do prediction here returning the predicted risk
-  #               (value) for each rowId in cohort 
-  #**************
-  
-  prediction <- merge(cohort, predictionValues, by='rowId')
-  attr(prediction, "metaData") <- list(modelType = attr(plpModel, 'modelType')) 
-  return(prediction)
-  
-}
+madeupPrediction <- function(plpModel, data, cohort){ + + # ************* code to do prediction for each rowId in cohort + # predictionValues <- code to do prediction here returning the predicted risk + # (value) for each rowId in cohort + #************** + + prediction <- merge(cohort, predictionValues, by='rowId') + attr(prediction, "metaData") <- list(modelType = attr(plpModel, 'modelType')) + return(prediction) + +}
-
-

-Algorithm Example

-

Below a fully functional algorithm example is given, however we highly recommend you to have a look at the available algorithms in the package (see GradientBoostingMachine.R for the set function, RClassifier.R for the fit and prediction function for R classifiers).

-
-

-Set

+
+

Algorithm Example +

+

Below a fully functional algorithm example is given, however we +highly recommend you to have a look at the available algorithms in the +package (see GradientBoostingMachine.R for the set function, +RClassifier.R for the fit and prediction function for R +classifiers).

+
+

Set +

-setMadeUp <- function(a=c(1,4,6), b=2, seed=NULL){
-  # add input checks here...
-  
-  if(is.null(seed)){
-    seed <- sample(100000,1)
-  }
-  
-  param <- split(
-    expand.grid(
-      a=a, 
-      b=b
-    ),
-    1:(length(a)*length(b))
-    )
-  
-  attr(param, 'settings') <- list(
-    modelName = "Made Up",
-    requiresDenseMatrix = TRUE,
-    seed = seed
-    )
-  
-  # now create list of all combinations:
-  result <- list(
-    fitFunction = 'fitMadeUp', # this will be called to train the made up model
-    param = param
-  )
-  class(result) <- 'modelSettings' 
-  
-  return(result)
-}
+setMadeUp <- function(a=c(1,4,6), b=2, seed=NULL){ + # add input checks here... + + if(is.null(seed)){ + seed <- sample(100000,1) + } + + param <- split( + expand.grid( + a=a, + b=b + ), + 1:(length(a)*length(b)) + ) + + attr(param, 'settings') <- list( + modelName = "Made Up", + requiresDenseMatrix = TRUE, + seed = seed + ) + + # now create list of all combinations: + result <- list( + fitFunction = 'fitMadeUp', # this will be called to train the made up model + param = param + ) + class(result) <- 'modelSettings' + + return(result) +}
-
-

-Fit

-
-fitMadeUp <- function(trainData, param, search, analysisId){
-
-  # set the seed for reproducibility
-  set.seed(attr(param, 'settings')$seed)
-  
-  # add folds to labels:
-  trainData$labels <- merge(trainData$labels, trainData$folds, by= 'rowId')
-  # convert data into sparse R Matrix:
-  mappedData <- toSparseM(trainData,map=NULL)
-  matrixData <- mappedData$dataMatrix
-  labels <- mappedData$labels
-  covariateRef <- mappedData$covariateRef
-
-  #============= STEP 1 ======================================
-  # pick the best hyper-params and then do final training on all data...
-  writeLines('Cross validation')
-  param.sel <- lapply(
-    param, 
-    function(x){
-      do.call(
-        made_up_model, 
-        list(
-          param = x, 
-          final = F, 
-          data = matrixData, 
-          labels = labels
-          )  
-      )
-      }
-    )
-  hyperSummary <- do.call(rbind, lapply(param.sel, function(x) x$hyperSum))
-  hyperSummary <- as.data.frame(hyperSummary)
-  hyperSummary$auc <- unlist(lapply(param.sel, function(x) x$auc)) 
-  param.sel <- unlist(lapply(param.sel, function(x) x$auc))
-  bestInd <- which.max(param.sel)
-  
-  #get cross val prediction for best hyper-parameters
-  prediction <- param.sel[[bestInd]]$prediction
-  prediction$evaluationType <- 'CV'
-  
-  writeLines('final train')
-  finalResult <- do.call(
-    made_up_model, 
-    list(
-      param = param[[bestInd]], 
-      final = T, 
-      data = matrixData, 
-      labels = labels
-      )  
-    )
-  
-  trainedModel <- finalResult$model
-  
-  # prediction risk on training data:
-  finalResult$prediction$evaluationType <- 'Train'
-  
-  # get CV and train prediction
-  prediction <- rbind(prediction, finalResult$prediction)
-  
-  varImp <- covariateRef %>% dplyr::collect()
-  # no feature importance available
-  vqrImp$covariateValue <- 0 
-  
- timeToTrain <- Sys.time() - start
-
-  # construct the standard output for a model:
-  result <- list(model = trainedModel,
-                 prediction = prediction, 
-    settings = list(
-      plpDataSettings = attr(trainData, "metaData")$plpDataSettings,
-      covariateSettings = attr(trainData, "metaData")$covariateSettings,
-      populationSettings = attr(trainData, "metaData")$populationSettings,
-      featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering,
-      tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, 
-      requireDenseMatrix = attr(param, 'settings')$requiresDenseMatrix,
-      modelSettings = list(
-        model = attr(param, 'settings')$modelName, # the model name
-        param = param,
-        finalModelParameters = param[[bestInd]], # best hyper-parameters
-        extraSettings = attr(param, 'settings')
-      ),
-      splitSettings = attr(trainData, "metaData")$splitSettings,
-      sampleSettings = attr(trainData, "metaData")$sampleSettings
-    ),
-    
-    trainDetails = list(
-      analysisId = analysisId,
-      cdmDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema,
-      outcomeId = attr(trainData, "metaData")$outcomeId,
-      cohortId = attr(trainData, "metaData")$cohortId,
-      attrition = attr(trainData, "metaData")$attrition, 
-      trainingTime = timeToTrain, # how long it took to train the model
-      trainingDate = Sys.Date(),
-      hyperParamSearch = hyperSummary # the hyper-parameters and performance data.frame
-    ),
-    covariateImportance = varImp
-  )
-  class(result) <- 'plpModel'
-  attr(result, 'predictionFunction') <- 'madeupPrediction'
-  attr(result, 'modelType') <- 'binary'
-  return(result)
-    
-}
+
+

Fit +

+
fitMadeUp <- function(trainData, modelSettings, search, analysisId){
+
+  # set the seed for reproducibility
+  param <- modelSettings$param
+  set.seed(attr(param, 'settings')$seed)
+  
+  # add folds to labels:
+  trainData$labels <- merge(trainData$labels, trainData$folds, by= 'rowId')
+  # convert data into sparse R Matrix:
+  mappedData <- toSparseM(trainData,map=NULL)
+  matrixData <- mappedData$dataMatrix
+  labels <- mappedData$labels
+  covariateRef <- mappedData$covariateRef
+
+  #============= STEP 1 ======================================
+  # pick the best hyper-params and then do final training on all data...
+  writeLines('Cross validation')
+  param.sel <- lapply(
+    param, 
+    function(x){
+      do.call(
+        made_up_model, 
+        list(
+          param = x, 
+          final = F, 
+          data = matrixData, 
+          labels = labels
+          )  
+      )
+      }
+    )
+  hyperSummary <- do.call(rbind, lapply(param.sel, function(x) x$hyperSum))
+  hyperSummary <- as.data.frame(hyperSummary)
+  hyperSummary$auc <- unlist(lapply(param.sel, function(x) x$auc)) 
+  param.sel <- unlist(lapply(param.sel, function(x) x$auc))
+  bestInd <- which.max(param.sel)
+  
+  #get cross val prediction for best hyper-parameters
+  prediction <- param.sel[[bestInd]]$prediction
+  prediction$evaluationType <- 'CV'
+  
+  writeLines('final train')
+  finalResult <- do.call(
+    made_up_model, 
+    list(
+      param = param[[bestInd]], 
+      final = T, 
+      data = matrixData, 
+      labels = labels
+      )  
+    )
+  
+  trainedModel <- finalResult$model
+  
+  # prediction risk on training data:
+  finalResult$prediction$evaluationType <- 'Train'
+  
+  # get CV and train prediction
+  prediction <- rbind(prediction, finalResult$prediction)
+  
+  varImp <- covariateRef %>% dplyr::collect()
+  # no feature importance available
+  vqrImp$covariateValue <- 0 
+  
+ timeToTrain <- Sys.time() - start
+
+  # construct the standard output for a model:
+  result <- list(model = trainedModel,
+                 prediction = prediction, 
+    preprocessing = list(
+                   featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering,
+      tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, 
+      requireDenseMatrix = attr(param, 'settings')$requiresDenseMatrix,
+      
+                 ),
+    modelDesign = list(
+      outcomeId = attr(trainData, "metaData")$outcomeId,
+      targetId = attr(trainData, "metaData")$targetId,
+      plpDataSettings = attr(trainData, "metaData")$plpDataSettings,
+      covariateSettings = attr(trainData, "metaData")$covariateSettings,
+      populationSettings = attr(trainData, "metaData")$populationSettings,
+      featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings,
+      prerocessSettings = attr(trainData$covariateData, "metaData")$prerocessSettings, 
+      modelSettings = list(
+        model = attr(param, 'settings')$modelName, # the model name
+        param = param,
+        finalModelParameters = param[[bestInd]], # best hyper-parameters
+        extraSettings = attr(param, 'settings')
+      ),
+      splitSettings = attr(trainData, "metaData")$splitSettings,
+      sampleSettings = attr(trainData, "metaData")$sampleSettings
+    ),
+    
+    trainDetails = list(
+      analysisId = analysisId,
+      developmentDatabase = attr(trainData, "metaData")$cdmDatabaseSchema,
+      attrition = attr(trainData, "metaData")$attrition, 
+      trainingTime = timeToTrain, # how long it took to train the model
+      trainingDate = Sys.Date(),
+      hyperParamSearch = hyperSummary # the hyper-parameters and performance data.frame
+    ),
+    covariateImportance = merge(trainData$covariateData$covariateRef, varImp, by='covariateId') # add variable importance to covariateRef if possible
+  ),
+    covariateImportance = varImp
+  )
+  class(result) <- 'plpModel'
+  attr(result, 'predictionFunction') <- 'madeupPrediction'
+  attr(result, 'modelType') <- 'binary'
+  return(result)
+    
+}
-
-

-Helpers

-

In the fit model a helper function made_up_model is called, this is the function that trains a model given the data, labels and hyper-parameters.

+
+

Helpers +

+

In the fit model a helper function made_up_model is +called, this is the function that trains a model given the data, labels +and hyper-parameters.

-made_up_model <- function(param, data, final=F, labels){
-  
-  if(final==F){
-    # add value column to store all predictions
-    labels$value <- rep(0, nrow(labels))
-    attr(labels, "metaData") <- list(modelType = "binary")
-    
-    foldPerm <- c() # this holds CV aucs
-    for(index in 1:max(labels$index)){
-      model <- madeup::model(
-        x = data[labels$index!=index,], # remove left out fold
-        y = labels$outcomeCount[labels$index!=index],
-        a = param$a, 
-        b = param$b
-      )
-      
-      # predict on left out fold
-      pred <- stats::predict(model, data[labels$index==index,])
-      labels$value[labels$index==index] <- pred
-      
-      # calculate auc on help out fold  
-      aucVal <- computeAuc(labels[labels$index==index,])
-      foldPerm<- c(foldPerm,aucVal)    
-    }
-    auc <- computeAuc(labels) # overal AUC
-
-  } else {
-    model <- madeup::model(
-      x = data, 
-      y = labels$outcomeCount,
-      a = param$a,
-      b = param$b
-      )
-    
-    pred <- stats::predict(model, data)
-    labels$value <- pred
-    attr(labels, "metaData") <- list(modelType = "binary") 
-    auc <- computeAuc(labels)
-    foldPerm <- auc
-  }
-  
-  result <- list(
-    model = model,
-    auc = auc,
-    prediction = labels,
-    hyperSum = c(a = a, b = b, fold_auc = foldPerm)
-  )
-  
-  return(result)
-}
+made_up_model <- function(param, data, final=F, labels){ + + if(final==F){ + # add value column to store all predictions + labels$value <- rep(0, nrow(labels)) + attr(labels, "metaData") <- list(modelType = "binary") + + foldPerm <- c() # this holds CV aucs + for(index in 1:max(labels$index)){ + model <- madeup::model( + x = data[labels$index!=index,], # remove left out fold + y = labels$outcomeCount[labels$index!=index], + a = param$a, + b = param$b + ) + + # predict on left out fold + pred <- stats::predict(model, data[labels$index==index,]) + labels$value[labels$index==index] <- pred + + # calculate auc on held out fold + aucVal <- computeAuc(labels[labels$index==index,]) + foldPerm<- c(foldPerm,aucVal) + } + auc <- computeAuc(labels) # overall AUC + + } else { + model <- madeup::model( + x = data, + y = labels$outcomeCount, + a = param$a, + b = param$b + ) + + pred <- stats::predict(model, data) + labels$value <- pred + attr(labels, "metaData") <- list(modelType = "binary") + auc <- computeAuc(labels) + foldPerm <- auc + } + + result <- list( + model = model, + auc = auc, + prediction = labels, + hyperSum = c(a = param$a, b = param$b, fold_auc = foldPerm) + ) + + return(result) +}
-
-

-Predict

-

The final step is to create a predict function for the model. In the example above the predeiction function attr(result, 'predictionFunction') <- 'madeupPrediction' was madeupPrediction, so a madeupPrediction function is required when applying the model. The predict function needs to take as input the plpModel returned by the fit function, new data to apply the model on and the cohort specifying the patients of interest to make the prediction for.

+
+

Predict +

+

+The final step is to create a predict function for the model. In the +example above the prediction function +attr(result, 'predictionFunction') <- 'madeupPrediction' +was madeupPrediction, so a madeupPrediction function is +required when applying the model. The predict function needs to take as +input the plpModel returned by the fit function, new data to apply the +model on and the cohort specifying the patients of interest to make the +prediction for.

-madeupPrediction <- function(plpModel, data , cohort){ 
-  
-  if(class(data) == 'plpData'){
-    # convert
-    matrixObjects <- toSparseM(
-      plpData = data, 
-      cohort = cohort,
-      map = plpModel$covariateImportance %>% 
-        dplyr::select(.data$columnId, .data$covariateId)
-    )
-    
-    newData <- matrixObjects$dataMatrix
-    cohort <- matrixObjects$labels
-    
-  }else{
-    newData <- data
-  }
-  
-  if(class(plpModel) == 'plpModel'){
-    model <- plpModel$model
-  } else{
-    model <- plpModel
-  }
-  
-  cohort$value <- stats::predict(model, data)
-  
-  # fix the rowIds to be the old ones
-  # now use the originalRowId and remove the matrix rowId
-  cohort <- cohort %>% 
-    dplyr::select(-.data$rowId) %>%
-    dplyr::rename(rowId = .data$originalRowId)
-  
-  attr(cohort, "metaData") <- list(modelType = attr(plpModel, 'modelType')) 
-  return(cohort)
-  
-}
-

As the madeup model uses the standard R prediction, it has the same prediction function as xgboost, so we could have not added a new prediction function and instead made the predictionFunction of the result returned by fitMadeUpModel to attr(result, 'predictionFunction') <- 'predictXgboost'.

+madeupPrediction <- function(plpModel, data , cohort){ + + if(class(data) == 'plpData'){ + # convert + matrixObjects <- toSparseM( + plpData = data, + cohort = cohort, + map = plpModel$covariateImportance %>% + dplyr::select("columnId", "covariateId") + ) + + newData <- matrixObjects$dataMatrix + cohort <- matrixObjects$labels + + }else{ + newData <- data + } + + if(class(plpModel) == 'plpModel'){ + model <- plpModel$model + } else{ + model <- plpModel + } + + cohort$value <- stats::predict(model, newData) + + # fix the rowIds to be the old ones + # now use the originalRowId and remove the matrix rowId + cohort <- cohort %>% + dplyr::select(-"rowId") %>% + dplyr::rename(rowId = "originalRowId") + + attr(cohort, "metaData") <- list(modelType = attr(plpModel, 'modelType')) + return(cohort) + +}
+

As the madeup model uses the standard R prediction, it has the same +prediction function as xgboost, so we could have not added a new +prediction function and instead made the predictionFunction of the +result returned by fitMadeUpModel to +attr(result, 'predictionFunction') <- 'predictXgboost'.

-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

+
+

Acknowledgments +

+

Considerable work has been dedicated to provide the +PatientLevelPrediction package.

-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

+citation("PatientLevelPrediction")
+
## 
+## To cite PatientLevelPrediction in publications use:
+## 
+##   Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
+##   and implementation of a standardized framework to generate and
+##   evaluate patient-level prediction models using observational
+##   healthcare data." _Journal of the American Medical Informatics
+##   Association_, *25*(8), 969-975.
+##   <https://doi.org/10.1093/jamia/ocy032>.
+## 
+## A BibTeX entry for LaTeX users is
+## 
+##   @Article{,
+##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
+##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
+##     journal = {Journal of the American Medical Informatics Association},
+##     volume = {25},
+##     number = {8},
+##     pages = {969-975},
+##     year = {2018},
+##     url = {https://doi.org/10.1093/jamia/ocy032},
+##   }
+

Please reference this paper if you use the PLP Package in +your work:

+

Reps JM, Schuemie +MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a +standardized framework to generate and evaluate patient-level prediction +models using observational healthcare data. J Am Med Inform Assoc. +2018;25(8):969-975.

+

This work is supported in part through the National Science +Foundation grant IIS 1251151.

@@ -626,11 +759,13 @@

-

Site built with pkgdown 1.6.1.

+

+

Site built with pkgdown 2.0.7.

@@ -639,5 +774,7 @@

+ + diff --git a/docs/articles/AddingCustomModels_files/header-attrs-2.11/header-attrs.js b/docs/articles/AddingCustomModels_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/AddingCustomModels_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/AddingCustomSamples.html b/docs/articles/AddingCustomSamples.html index c6a73fa11..84ccdde27 100644 --- a/docs/articles/AddingCustomSamples.html +++ b/docs/articles/AddingCustomSamples.html @@ -19,6 +19,8 @@ + +
+
@@ -146,137 +160,178 @@

2022-03-09

-
-

-Introduction

-

This vignette describes how you can add your own custom function for sampling the target population in the Observational Health Data Sciencs and Informatics (OHDSI) PatientLevelPrediction package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the BuildingPredictiveModels vignette.

-

We invite you to share your new sample functions with the OHDSI community through our GitHub repository.

+
+

Introduction +

+

+This vignette describes how you can add your own custom function for +sampling the target population in the Observational Health Data Sciences +and Informatics (OHDSI) PatientLevelPrediction +package. This vignette assumes you have read and are comfortable with +building single patient level prediction models as described in the BuildingPredictiveModels +vignette.

+

We invite you to share your new sample functions with the +OHDSI community through our GitHub +repository.

-
-

-Sample Function Code Structure

-

To make a sampling function that can be used within PatientLevelPrediction you need to write two different functions. The ‘create’ function and the ‘implement’ function.

-

The ‘create’ function, e.g., create<SampleFunctionName>, takes the parameters of the sample ‘implement’ function as input, checks these are valid and outputs these as a list of class ‘sampleSettings’ with the ‘fun’ attribute specifying the ‘implement’ function to call.

-

The ‘implement’ function, e.g., implement<SampleFunctionName>, must take as input: * trainData - a list containing: - covariateData: the plpData$covariateData restricted to the training patients - labels: a data frame that contain rowId (patient identifier) and outcomeCount (the class labels) - folds: a data.frame that contains rowId (patient identifier) and index (the cross validation fold) * sampleSettings - the output of your create<SampleFunctionName>

-

The ‘implement’ function can then do any manipulation of the trainData (such as undersampling or oversampling) but must output a trainData object containing the covariateData, labels and folds for the new training data sample.

+
+

Sample Function Code Structure +

+

To make a sampling function that can be used within +PatientLevelPrediction you need to write two different functions. The +‘create’ function and the ‘implement’ function.

+

The ‘create’ function, e.g., create<SampleFunctionName>, takes +the parameters of the sample ‘implement’ function as input, checks these +are valid and outputs these as a list of class ‘sampleSettings’ with the +‘fun’ attribute specifying the ‘implement’ function to call.

+

+The ‘implement’ function, e.g., implement<SampleFunctionName>, +must take as input: * trainData - a list containing: - covariateData: +the plpData$covariateData restricted to the training patients - labels: +a data frame that contains rowId (patient identifier) and outcomeCount +(the class labels) - folds: a data.frame that contains rowId (patient +identifier) and index (the cross validation fold) * sampleSettings - the +output of your create<SampleFunctionName>

+

The ‘implement’ function can then do any manipulation of the +trainData (such as undersampling or oversampling) but must output a +trainData object containing the covariateData, labels and folds for the +new training data sample.

-
-

-Example

-

Let’s consider the situation where we wish to take a random sample of the training data population. To make this custom sampling function we need to write the ‘create’ and ‘implement’ R functions.

-
-

-Create function

-

Our random sampling function will randomly sample n patients from the trainData. Therefore, the inputs for this are: * n an integer/double specifying the number of patients to sample * sampleSeed an integer/double specifying the seed for reproducibility

+
+

Example +

+

Let’s consider the situation where we wish to take a random sample of +the training data population. To make this custom sampling function we +need to write the ‘create’ and ‘implement’ R functions.

+
+

Create function +

+

Our random sampling function will randomly sample n +patients from the trainData. Therefore, the inputs for this are: * +n an integer/double specifying the number of patients to +sample * sampleSeed an integer/double specifying the seed +for reproducibility

-createRandomSampleSettings <- function(
-                     n = 10000,
-                     sampleSeed = sample(10000,1)
-                     ){
-  
-  # add input checks
-  checkIsClass(n, c('numeric','integer'))
-  checkHigher(n,0)
-  checkIsClass(sampleSeed, c('numeric','integer'))
-  
-  # create list of inputs to implement function
-  sampleSettings <- list(
-    n = n,
-    sampleSeed  = sampleSeed 
-    )
-  
-  # specify the function that will implement the sampling
-  attr(sampleSettings, "fun") <- "implementRandomSampleSettings"
-
-  # make sure the object returned is of class "sampleSettings"
-  class(sampleSettings) <- "sampleSettings"
-  return(sampleSettings)
-  
-}
-

We now need to create the ‘implement’ function implementRandomSampleSettings()

+createRandomSampleSettings <- function( + n = 10000, + sampleSeed = sample(10000,1) + ){ + + # add input checks + checkIsClass(n, c('numeric','integer')) + checkHigher(n,0) + checkIsClass(sampleSeed, c('numeric','integer')) + + # create list of inputs to implement function + sampleSettings <- list( + n = n, + sampleSeed = sampleSeed + ) + + # specify the function that will implement the sampling + attr(sampleSettings, "fun") <- "implementRandomSampleSettings" + + # make sure the object returned is of class "sampleSettings" + class(sampleSettings) <- "sampleSettings" + return(sampleSettings) + +}
+

We now need to create the ‘implement’ function +implementRandomSampleSettings()

-
-

-Implement function

-

All ‘implement’ functions must take as input the trainData and the sampleSettings (this is the output of the ‘create’ function). They must return a trainData object containing the covariateData, labels and folds.

-

In our example, the createRandomSampleSettings() will return a list with ‘n’ and ‘sampleSeed’. The sampleSettings therefore contains these.

+
+

Implement function +

+

All ‘implement’ functions must take as input the trainData and the +sampleSettings (this is the output of the ‘create’ function). They must +return a trainData object containing the covariateData, labels and +folds.

+

In our example, the createRandomSampleSettings() will +return a list with ‘n’ and ‘sampleSeed’. The sampleSettings therefore +contains these.

-implementRandomSampleSettings <- function(trainData, sampleSettings){
-
-  n <- sampleSetting$n
-  sampleSeed <- sampleSetting$sampleSeed
-  
-  if(n > nrow(trainData$labels)){
-    stop('Sample n bigger than training population')
-  }
-  
-  # set the seed for the randomization
-  set.seed(sampleSeed)
-  
-  # now implement the code to do your desired sampling
-  
-  sampleRowIds <- sample(trainData$labels$rowId, n)
-  
-  sampleTrainData <- list()
-  
-  sampleTrainData$labels <- trainData$labels %>% 
-    dplyr::filter(.data$rowId %in% sampleRowIds) %>% 
-    dplyr::collect()
-  
-  sampleTrainData$folds <- trainData$folds %>% 
-    dplyr::filter(.data$rowId %in% sampleRowIds) %>% 
-    dplyr::collect()
-  
-  sampleTrainData$covariateData <- Andromeda::andromeda()
-  sampleTrainData$covariateData$covariateRef <-trainData$covariateData$covariateRef
-  sampleTrainData$covariateData$covariates <- trainData$covariateData$covariates %>% dplyr::filter(.data$rowId %in% sampleRowIds)
-  
-  #update metaData$populationSize 
-  metaData <- attr(trainData$covariateData, 'metaData')
-  metaData$populationSize = n
-  attr(sampleTrainData$covariateData, 'metaData') <- metaData
-  
-  # make the cocvariateData the correct class
-  class(sampleTrainData$covariateData) <- 'CovariateData'
-  
-  # return the updated trainData
-  return(sampleTrainData)
-}
+implementRandomSampleSettings <- function(trainData, sampleSettings){ + + n <- sampleSettings$n + sampleSeed <- sampleSettings$sampleSeed + + if(n > nrow(trainData$labels)){ + stop('Sample n bigger than training population') + } + + # set the seed for the randomization + set.seed(sampleSeed) + + # now implement the code to do your desired sampling + + sampleRowIds <- sample(trainData$labels$rowId, n) + + sampleTrainData <- list() + + sampleTrainData$labels <- trainData$labels %>% + dplyr::filter(.data$rowId %in% sampleRowIds) %>% + dplyr::collect() + + sampleTrainData$folds <- trainData$folds %>% + dplyr::filter(.data$rowId %in% sampleRowIds) %>% + dplyr::collect() + + sampleTrainData$covariateData <- Andromeda::andromeda() + sampleTrainData$covariateData$covariateRef <-trainData$covariateData$covariateRef + sampleTrainData$covariateData$covariates <- trainData$covariateData$covariates %>% dplyr::filter(.data$rowId %in% sampleRowIds) + + #update metaData$populationSize + metaData <- attr(trainData$covariateData, 'metaData') + metaData$populationSize = n + attr(sampleTrainData$covariateData, 'metaData') <- metaData + + # make the covariateData the correct class + class(sampleTrainData$covariateData) <- 'CovariateData' + + # return the updated trainData + return(sampleTrainData) +}
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

+
+

Acknowledgments +

+

Considerable work has been dedicated to provide the +PatientLevelPrediction package.

-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

+citation("PatientLevelPrediction")
+
## 
+## To cite PatientLevelPrediction in publications use:
+## 
+##   Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
+##   and implementation of a standardized framework to generate and
+##   evaluate patient-level prediction models using observational
+##   healthcare data." _Journal of the American Medical Informatics
+##   Association_, *25*(8), 969-975.
+##   <https://doi.org/10.1093/jamia/ocy032>.
+## 
+## A BibTeX entry for LaTeX users is
+## 
+##   @Article{,
+##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
+##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
+##     journal = {Journal of the American Medical Informatics Association},
+##     volume = {25},
+##     number = {8},
+##     pages = {969-975},
+##     year = {2018},
+##     url = {https://doi.org/10.1093/jamia/ocy032},
+##   }
+

Please reference this paper if you use the PLP Package in +your work:

+

Reps JM, Schuemie +MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a +standardized framework to generate and evaluate patient-level prediction +models using observational healthcare data. J Am Med Inform Assoc. +2018;25(8):969-975.

+

This work is supported in part through the National Science +Foundation grant IIS 1251151.

@@ -291,11 +346,13 @@

-

Site built with pkgdown 1.6.1.

+

+

Site built with pkgdown 2.0.7.

@@ -304,5 +361,7 @@

+ + diff --git a/docs/articles/AddingCustomSamples_files/header-attrs-2.11/header-attrs.js b/docs/articles/AddingCustomSamples_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/AddingCustomSamples_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/AddingCustomSplitting.html b/docs/articles/AddingCustomSplitting.html index 24683120c..6f56760db 100644 --- a/docs/articles/AddingCustomSplitting.html +++ b/docs/articles/AddingCustomSplitting.html @@ -19,6 +19,8 @@ + +
+
@@ -146,103 +160,144 @@

2022-03-09

-
-

-Introduction

-

This vignette describes how you can add your own custom function for splitting the labelled data into training data and validation data in the Observational Health Data Sciencs and Informatics (OHDSI) PatientLevelPrediction package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the BuildingPredictiveModels vignette.

-

We invite you to share your new data splitting functions with the OHDSI community through our GitHub repository.

+
+

Introduction +

+

+This vignette describes how you can add your own custom function for +splitting the labelled data into training data and validation data in +the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction +package. This vignette assumes you have read and are comfortable with +building single patient level prediction models as described in the BuildingPredictiveModels +vignette.

+

We invite you to share your new data splitting functions with +the OHDSI community through our GitHub +repository.

-
-

-Data Splitting Function Code Structure

-

To make a custom data splitting function that can be used within PatientLevelPrediction you need to write two different functions. The ‘create’ function and the ‘implement’ function.

-

The ‘create’ function, e.g., create<DataSplittingFunction>, takes the parameters of the data splitting ‘implement’ function as input, checks these are valid and outputs these as a list of class ‘splitSettings’ with the ‘fun’ attribute specifying the ‘implement’ function to call.

-

The ‘implement’ function, e.g., implement<DataSplittingFunction>, must take as input: * population: a data frame that contain rowId (patient identifier), ageYear, gender and outcomeCount (the class labels) * splitSettings - the output of your create<DataSplittingFunction>

-

The ‘implement’ function then needs to implement code to assign each rowId in the population to a splitId (<0 means in the train data, 0 means not used and >0 means in the training data with the value defining the cross validation fold).

+
+

Data Splitting Function Code Structure +

+

To make a custom data splitting function that can be used within +PatientLevelPrediction you need to write two different functions. The +‘create’ function and the ‘implement’ function.

+

The ‘create’ function, e.g., create<DataSplittingFunction>, +takes the parameters of the data splitting ‘implement’ function as +input, checks these are valid and outputs these as a list of class +‘splitSettings’ with the ‘fun’ attribute specifying the ‘implement’ +function to call.

+

+The ‘implement’ function, e.g., +implement<DataSplittingFunction>, must take as input: * +population: a data frame that contains rowId (patient identifier), +ageYear, gender and outcomeCount (the class labels) * splitSettings - +the output of your create<DataSplittingFunction>

+

The ‘implement’ function then needs to implement code to assign each +rowId in the population to a splitId (<0 means in the train data, 0 +means not used and >0 means in the training data with the value +defining the cross validation fold).

-
-

-Example

-

Let’s consider the situation where we wish to create a split where females are used to train a model but males are used to evaluate the model.

-
-

-Create function

-

Our gender split function requires a single parameter, the number of folds used in cross validation. Therefore create a function with a single nfold input that returns a list of class ‘splitSettings’ with the ‘fun’ attribute specifying the ‘implement’ function we will use.

+
+

Example +

+

Let’s consider the situation where we wish to create a split where +females are used to train a model but males are used to evaluate the +model.

+
+

Create function +

+

Our gender split function requires a single parameter, the number of +folds used in cross validation. Therefore create a function with a +single nfold input that returns a list of class ‘splitSettings’ with the +‘fun’ attribute specifying the ‘implement’ function we will use.

-createGenderSplit <- function(nfold)
-  {
-  
-  # create list of inputs to implement function
-  splitSettings <- list(nfold = nfold)
-  
-  # specify the function that will implement the sampling
-  attr(splitSettings, "fun") <- "implementGenderSplit"
-
-  # make sure the object returned is of class "sampleSettings"
-  class(splitSettings) <- "splitSettings"
-  return(splitSettings)
-  
-}
-

We now need to create the ‘implement’ function implementGenderSplit()

+createGenderSplit <- function(nfold) + { + + # create list of inputs to implement function + splitSettings <- list(nfold = nfold) + + # specify the function that will implement the sampling + attr(splitSettings, "fun") <- "implementGenderSplit" + + # make sure the object returned is of class "splitSettings" + class(splitSettings) <- "splitSettings" + return(splitSettings) + +}
+

We now need to create the ‘implement’ function +implementGenderSplit()

-
-

-Implement function

-

All ‘implement’ functions for data splitting must take as input the population and the splitSettings (this is the output of the ‘create’ function). They must return a data.frame containing columns: rowId and index.

-

The index is used to determine whether the patient (identifed by the rowId) is in the test set (index = -1) or train set (index > 0). In in the train set, the value corresponds to the cross validation fold. For example, if rowId 2 is assigned index 5, then it means the patient with the rowId 2 is used to train the model and is in fold 5.

+
+

Implement function +

+

All ‘implement’ functions for data splitting must take as input the +population and the splitSettings (this is the output of the ‘create’ +function). They must return a data.frame containing columns: rowId and +index.

+

+The index is used to determine whether the patient (identified by the +rowId) is in the test set (index = -1) or train set (index > 0). In +the train set, the value corresponds to the cross validation fold. +For example, if rowId 2 is assigned index 5, then it means the patient +with the rowId 2 is used to train the model and is in fold 5.

-implementGenderSplit <- function(population, splitSettings){
-
-  # find the people who are male:
-  males <- population$rowId[population$gender == 8507]
-  females <- population$rowId[population$gender == 8532]
-  
-  splitIds <- data.frame(
-    rowId = c(males, females),
-    index = c(
-      rep(-1, length(males)),
-      sample(1:splitSettings$nfold, length(females), replace = T)
-    )
-  )
-  
-  # return the updated trainData
-  return(splitIds)
-}
+implementGenderSplit <- function(population, splitSettings){ + + # find the people who are male: + males <- population$rowId[population$gender == 8507] + females <- population$rowId[population$gender == 8532] + + splitIds <- data.frame( + rowId = c(males, females), + index = c( + rep(-1, length(males)), + sample(1:splitSettings$nfold, length(females), replace = T) + ) + ) + + # return the updated trainData + return(splitIds) +}
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

+
+

Acknowledgments +

+

Considerable work has been dedicated to provide the +PatientLevelPrediction package.

-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

+citation("PatientLevelPrediction")
+
## 
+## To cite PatientLevelPrediction in publications use:
+## 
+##   Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
+##   and implementation of a standardized framework to generate and
+##   evaluate patient-level prediction models using observational
+##   healthcare data." _Journal of the American Medical Informatics
+##   Association_, *25*(8), 969-975.
+##   <https://doi.org/10.1093/jamia/ocy032>.
+## 
+## A BibTeX entry for LaTeX users is
+## 
+##   @Article{,
+##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
+##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
+##     journal = {Journal of the American Medical Informatics Association},
+##     volume = {25},
+##     number = {8},
+##     pages = {969-975},
+##     year = {2018},
+##     url = {https://doi.org/10.1093/jamia/ocy032},
+##   }
+

Please reference this paper if you use the PLP Package in +your work:

+

Reps JM, Schuemie +MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a +standardized framework to generate and evaluate patient-level prediction +models using observational healthcare data. J Am Med Inform Assoc. +2018;25(8):969-975.

+

This work is supported in part through the National Science +Foundation grant IIS 1251151.

@@ -257,11 +312,13 @@

-

Site built with pkgdown 1.6.1.

+

+

Site built with pkgdown 2.0.7.

@@ -270,5 +327,7 @@

+ + diff --git a/docs/articles/AddingCustomSplitting_files/header-attrs-2.11/header-attrs.js b/docs/articles/AddingCustomSplitting_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/AddingCustomSplitting_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/AddingExistingModels.html b/docs/articles/AddingExistingModels.html deleted file mode 100644 index 6e493db0d..000000000 --- a/docs/articles/AddingExistingModels.html +++ /dev/null @@ -1,362 +0,0 @@ - - - - - - - -Implementing Existing Prediction Models using the OHDSI PatientLevelPrediction framework • PatientLevelPrediction - - - - - - - - - -
-
- - - -
-
- - - - - - -
-

-Introduction

-

This vignette describes how you can implement existing logistic regression models in the PatientLevelPrediction framework. This allows you to for example externally validate them at scale in the OHDSI data network.

-

As an example we are going to implement the CHADS2 model:

-

Gage BF, Waterman AD, Shannon W, Boechler M, Rich MW, Radford MJ. Validation of clinical classification schemes for predicting stroke: results from the National Registry of Atrial Fibrillation. JAMA. 2001 Jun 13;285(22):2864-70

-

To implement the model you need to create three tables: the model table, the covariate table, and the intercept table. The model table specifies the modelId (sequence number), the modelCovariateId (sequence number) and the covariateValue (beta for the covariate). The covariate table specifies the mapping between the covariates from the published model and the standard Patient Level Prediction framework covariates, i.e. its maps to a combination of an analysisid and a concept_id (see below). The intercept table specifies per modelId the intercept.

-
-
-

-Model implementation

-
-

-Define the model

-

The CHADS2 is a score based model with:

-
##   Points                        Covariate
-## 1      1         Congestive heart failure
-## 2      1                     Hypertension
-## 3      1                  Age >= 75 years
-## 4      1                Diabetes mellitus
-## 5      2 Stroke/transient ischemic attack
-

The model table should therefore be defined as:

-
##   modelId modelCovariateId covariateValue
-## 1       1                1              1
-## 2       1                2              1
-## 3       1                3              1
-## 4       1                4              1
-## 5       1                5              2
-

The covariateTable will then specify what standard covariates need to be included in the model.

-

In this case we choose the following Standard SNOMED concept_ids: 319835 for congestive heart failure, 316866 for hypertensive disorder, 201820 for diabetes, and 381591 for cerebrovascular disease. It is allowed to add multiple concept_ids as seperate rows for the same modelCovariateId if concept sets are needed. These concept_ids can be found using the vocabulary search in ATLAS.

-

The Patient Level Prediction standard covariates are of the form: conceptid*1000 + analysisid. The analysisid specifies the domain of the covariate and its lookback window. Examples can be found here: https://github.com/OHDSI/FeatureExtraction/blob/master/inst/csv/PrespecAnalyses.csv

-

Our example of CHADS2 uses agegroup and conditions in the full history. Therefore we need to define the standard covariates using the FeatureExtraction::createCovariateSettings as follows:

-
library(PatientLevelPrediction)
-covSet <- FeatureExtraction::createCovariateSettings(useDemographicsAgeGroup = T,                             
-                                                     useConditionOccurrenceLongTerm = T,
-                                                     includedCovariateIds = c(),
-                                                     longTermStartDays = -9999, 
-                                                     endDays = 0)
-

In the above code we used the useConditionOccurrenceLongTerm (these have an analysis id of 102) and we defined the longTermStartDays to be -9999 days relative to index (so we get the full history). We include the index date in our lookback period by specifying endDays = 0. The includeCovariateIds is set to 0, but this will be updated when you run the next code to pick out the standard covariates of interest. As we picked analysis id 102, the standard covariate for anytime prior congestive heart failure is 319835102, the same logic follows for the other conditions, so the covariate table will be:

-
##   modelCovariateId covariateId
-## 1                1   319835102
-## 2                2   316866102
-## 3                3       15003
-## 4                3       16003
-## 5                3       17003
-## 6                3       18003
-## 7                3       19003
-## 8                4   201820102
-## 9                5   381591102
-

modelCovariateId 3 was age>= 75, as the standard covariate age groups are in 5 year groups, we needed to add the age groups 75-80, 80-85, 85-90, 90-95 and 95-100, these correspond to the covaraiteIds 15003, 16003, 17003, 18003 and 19003 respectively.

-

To create the tables in R for CHADS2 you need to make the following dataframes:

-
model_table <- data.frame(modelId = c(1,1,1,1,1),
-                          modelCovariateId = 1:5, 
-                          coefficientValue = c(1, 1, 1, 1, 2)
-                          )
-
-covariate_table <- data.frame(modelCovariateId = c(1,2,3,3,3,3,3,4,5),
-                              covariateId = c(319835102, 316866102, 
-                                            15003, 16003, 17003, 18003, 19003, 
-                                            201820102, 381591102)
-                              )
-
-interceptTable <-  data.frame(modelId = 1, 
-                              interceptValue = 0)
-
-
-

-Create the model

-

Now you have everything in place actually create the existing model. First specify the current environment as executing createExistingModelSql creates two functions for running the existing model into the specificed environment. Next a few additional settings are needed: as some models require an intercept, there is an option for this (set it to 0 if an intercept isn’t needed), also the type specifies the final mapping (either logistic or linear/score), in our example we are calculating a score. We finally need to specify the analysisId for the newly created CHADS2 covariate.

-
e <- environment()
-PatientLevelPrediction::createExistingModelSql(modelTable = model_table, 
-                       modelNames = 'CHADS2', 
-                       interceptTable = data.frame(modelId = 1, interceptValue = 0),
-                       covariateTable = covariate_table, 
-                       type = 'score',
-                       analysisId = 112, covariateSettings = covSettings, e = e)
-

Once run you will find two new functions in your environment:

-
    -
  • createExistingmodelsCovariateSettings()
  • -
  • getExistingmodelsCovariateSettings()
  • -
-
-
-
-

-Run the model

-

Now you can use the functions you previously created to extract the existing model risk scores for a target population:

-
plpData <- PatientLevelPrediction::getPlpData(connectionDetails, 
-                      cdmDatabaseSchema = 'databasename.dbo',
-                      cohortId = 1,
-                      outcomeIds = 2, 
-                      cohortDatabaseSchema = 'databasename.dbo', 
-                      cohortTable =  'cohort' , 
-                      outcomeDatabaseSchema = 'databasename.dbo', 
-                      outcomeTable = 'cohort', 
-                      covariateSettings =  createExistingmodelsCovariateSettings(),
-                      sampleSize = 20000
-                      )
-

To implement and evaluate an existing model you can use the function:

-

PatientLevelPrediction::evaluateExistingModel()

-

with the following parameters:

-
    -
  • modelTable - a data.frame containing the columns: modelId, covariateId and coefficientValue
  • -
  • covariateTable - a data.frame containing the columns: covariateId and standardCovariateId - this provides a set of standardCovariateId to define each model covariate.
  • -
  • interceptTable - a data.frame containing the columns modelId and interceptValue or NULL if the model doesn’t have an intercept (equal to zero).
  • -
  • type - the type of model (either: score or logistic)
  • -
  • covariateSettings - this is used to determine the startDay and endDay for the standard covariates
  • -
  • customCovariates - a data.frame with the covariateId and sql to generate the covariate value.
  • -
  • riskWindowStart - the time at risk starts at target cohort start date + riskWindowStart
  • -
  • addExposureDaysToEnd - if true then the time at risk window ends a the cohort end date + riskWindowEnd rather than cohort start date + riskWindowEnd
  • -
  • riskWindowEnd - the time at risk ends at target cohort start/end date + riskWindowStart
  • -
  • requireTimeAtRisk - whether to add a constraint to the number of days observed during the time at risk period in including people into the study
  • -
  • minTimeAtRisk - the minimum number of days observation during the time at risk a target population person needs to be included
  • -
  • includeAllOutcomes - Include outcomes even if they do not satisfy the minTimeAtRisk? (useful if the outcome is associated to death or rare)
  • -
  • removeSubjectsWithPriorOutcome - remove target population people who have the outcome prior to the time at tisk period?
  • -
  • connectionDetails - the connection to the CDM database
  • -
-

Finally you need to add the settings for downloading the new data:

-
    -
  • cdmDatabaseSchema
  • -
  • cohortDatabaseSchema
  • -
  • cohortTable
  • -
  • cohortId
  • -
  • outcomeDatabaseSchema
  • -
  • outcomeTable
  • -
  • outcomeId
  • -
  • oracleTempSchema
  • -
-

To run the external validation of an existing model where the target population are those in the cohort table with id 1 and the outcome is those in the cohort table with id 2 and we are looking to predict first time occurrance of the outcome 1 day to 365 days after the target cohort start date (asusming you have the modelTable, covariateTable and interceptTable in the format explained above):

-
# if the existing model uses gender and condition groups looking back 200 days:
-covSet <- FeatureExtraction::createCovariateSettings(useDemographicsGender = T,
-                                                     useConditionGroupEraMediumTerm = T, 
-                                                     mediumTermStartDays = -200)
-result <- evaluateExistingModel(modelTable = modelTable,
-                                covariateTable = covariateTable,
-                                interceptTable = NULL,
-                                type = 'score', 
-                                covariateSettings =  covSet,
-                                riskWindowStart = 1, 
-                                addExposureDaysToEnd = F, 
-                                riskWindowEnd = 365, 
-                                requireTimeAtRisk = T, 
-                                minTimeAtRisk = 364, 
-                                includeAllOutcomes = T, 
-                                removeSubjectsWithPriorOutcome = T, 
-                                connectionDetails = connectionDetails, 
-                                cdmDatabaseSchema = 'databasename.dbo',
-                                cohortId = 1,
-                                outcomeId = 2, 
-                                cohortDatabaseSchema = 'databasename.dbo', 
-                                cohortTable =  'cohort' , 
-                                outcomeDatabaseSchema = 'databasename.dbo', 
-                                outcomeTable = 'cohort'
-                      )
-
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

-
citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018).
-## "Design and implementation of a standardized framework to generate
-## and evaluate patient-level prediction models using observational
-## healthcare data." _Journal of the American Medical Informatics
-## Association_, *25*(8), 969-975. <URL:
-## https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

This work is supported in part through the National Science Foundation grant IIS 1251151.

-
-
- - - -
- - -
- -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - diff --git a/docs/articles/BenchmarkTasks.html b/docs/articles/BenchmarkTasks.html new file mode 100644 index 000000000..423651309 --- /dev/null +++ b/docs/articles/BenchmarkTasks.html @@ -0,0 +1,343 @@ + + + + + + + +Benchmark Tasks • PatientLevelPrediction + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + + +
+

Benchmark Tasks For Large-Scale Empirical Analyses +

+

Here we provide a set of diverse prediction tasks that can be used +when evaluating the impact of the model design choice when developing +models using observational data.

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Target Cohort (index)OutcomeTime-at-riskLink
Patients with an outpatient visit in 2017 with no prior cancer +(first visit in 2017)Lung cancer1 day - 3 years after index
Patients newly diagnosed with major depressive disorder (date of +first record)Bipolar1 day - 365 day after index
Patients with an outpatient visit in 2019Dementia1 day - 3 years after index
Patients with an outpatient visit and a positive COVID testHospitalization with pneumonia1 day - 30 days after index
Patients with an outpatient visit and a positive COVID testHospitalization with pneumonia that required intensive services +(ventilation, intubation, tracheotomy, or extracorporeal membrane +oxygenation) or death1 day - 30 days after index
Patients with an outpatient visit and a positive COVID testDeath1 day - 30 days after index
Patients with T2DM who were treated with metformin and who became +new adult users of one of sulfonylureas, thiazolidinediones, dipeptidyl +peptidase-4 inhibitors, glucagon-like peptide-1 receptor agonists, or +sodium-glucose co-transporter-2 inhibitors (date of secondary drug). +Patients with HF or patients treated with insulin on or prior to the +index date were excluded from the analysis. Patients were required to +have been enrolled for at least 365 days before cohort entry.Heart Failure1 to 365 days
Patients newly diagnosed with atrial fibrilation (date of initial +afib record)Ischemic stroke1 to 365 days
Patients undergoing elective major non-cardiac surgery (date of +surgery). Patients were required to have been enrolled for at least 365 +days before cohort entry.Earliest of AMI cardiac arrest or death (MACE)O to 30 days
Patients starting intravitreal Anti-VEGF (date of +administration)Kidney Failure1 to 365 days
Pregnancy women (start of pregnancy)PreeclampsiaDuring pregnancy
Pregnancy women (start of pregnancy)Still birthDuring pregnancy
Patients with COPD (first record)Cardiovascular event and death1-30 days and 1-90 days
Patients starting menopause (first record)Depression1 day - 3-years
Patients with anemia (date of first anemia record)Colorectal cancer1 day - 1-year
Patients with quadriplegia (date of first quadriplegia record)Death1 day - 1-year
Patient undergoing
+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.7.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/BestPractices.html b/docs/articles/BestPractices.html index c0490c754..49fc4e02f 100644 --- a/docs/articles/BestPractices.html +++ b/docs/articles/BestPractices.html @@ -19,6 +19,8 @@ + +
+
@@ -149,9 +164,10 @@

2022-03-09

%\VignetteEngine{knitr} %\VignetteIndexEntry{Best Practices} --> -
-

-Best practice publications using the OHDSI PatientLevelPrediction framework

+
+

Best practice publications using the OHDSI PatientLevelPrediction +framework +

@@ -194,7 +211,8 @@

Addressing loss to follow-up (right censoring)

@@ -205,7 +223,8 @@

Investigating how to address left censoring in features construction

@@ -216,7 +235,8 @@

Impact of over/under-sampling

@@ -238,7 +258,8 @@

How much data do we need for prediction - Learning curves at scale

@@ -249,7 +270,7 @@

What impact does test/train/validation design have on model performance

@@ -260,7 +281,7 @@

What is the impact of the classifier

@@ -268,7 +289,8 @@

Model development

+ + + + + + + + + + + + + + + + + + + + + + + + + @@ -293,7 +378,8 @@

How should we present model performance? (e.g., new visualizations)

@@ -301,7 +387,8 @@

Evaluation

@@ -183,7 +199,8 @@

Comparison of cohort vs case-control design

-Journal of Big Data +Journal +of Big Data
-BMC medical informatics and decision makingk +BMC +medical informatics and decision makingk
-BMC Medical Research Methodology +BMC +Medical Research Methodology
-Study being developed + +Journal of big data
-Preprint link +International +Journal of Medical Informatics
-BMJ Open +BMJ Open
-JAMIA +JAMIA
-Can we find hyper-parameter combinations per classifier that consistently lead to good performing models when using claims/EHR data? +Can we find hyper-parameter combinations per classifier that +consistently lead to good performing models when using claims/EHR data? Study needs to be done @@ -279,10 +301,73 @@

Model development

-Can we use ensembles to combine models developed using different databases to improve models transportability? +Can we use ensembles to combine different algorithm models within a +database to improve models transportability? - Paper under review at BMC + Caring is +Sharing–Exploiting the Value in Data for Health and Innovation +
+Model development + +Can we use ensembles to combine models developed using different +databases to improve models transportability? + + +BMC Medical Informatics and Decision Making +
+Model development + +Impact of regularization method + + +JAMIA +
+Evaluation + +Why prediction is not suitable for risk factor identification + + Machine +Learning for Healthcare Conference +
+Evaluation + +Iterative pairwise external validation to put validation into context + + +Drug Safety +
+Evaluation + +A novel method to estimate external validation using aggregate +statistics + + Study under review
-JAMIA Open +JAMIA +Open
-How to interpret external validation performance (can we figure out why the performance drops or stays consistent)? +How to interpret external validation performance (can we figure out why +the performance drops or stays consistent)? Study needs to be done @@ -326,7 +413,8 @@

Is there a way to automatically simplify models?

-Study protocol under development +Study +protocol under development
@@ -344,11 +432,13 @@

-

Site built with pkgdown 1.6.1.

+

+

Site built with pkgdown 2.0.7.

@@ -357,5 +447,7 @@

+ + diff --git a/docs/articles/BestPractices_files/header-attrs-2.11/header-attrs.js b/docs/articles/BestPractices_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/BestPractices_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/BestPractices_files/header-attrs-2.7/header-attrs.js b/docs/articles/BestPractices_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/BestPractices_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/BuildingDeepLearningModels.html b/docs/articles/BuildingDeepLearningModels.html deleted file mode 100644 index 062ff4614..000000000 --- a/docs/articles/BuildingDeepLearningModels.html +++ /dev/null @@ -1,597 +0,0 @@ - - - - - - - -Building Deep Learning Models • PatientLevelPrediction - - - - - - - - - - -
-
- - - - -
-
- - - - - -
-

-Introduction

-

Electronic Health Records (EHR) data is high dimensional, heterogeneous, and sparse, which makes predictive modelling a challenge. In the early days, the machine learning community mainly focused on algorithm development, currently there is a shift to more powerful feature engineering. Deep Learning models are widely used to automatically learn high-level feature representations from the data, and have achieved remarkable results in image processing, speech recognition and computational biology. Recently, interesting results have been shown using EHRs, but more extensive research is needed to assess the power of Deep Learning in this domain.

-

This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package to build Deep Learning models. This vignette assumes you have read and are comfortable with building patient level prediction models as described in the BuildingPredictiveModels vignette. Furthermore, this vignette assumes you are familiar with Deep Learning methods.

-
-
-

-Background

-

Deep Learning models are build by stacking an often large number of neural network layers that perform feature engineering steps, e.g embedding, and are collapsed in a final softmax layer (basically a logistic regression layer). These algorithms need a lot of data to converge to a good representation, but currently the sizes of the EHR databases are growing fast which would make Deep Learning an interesting approach to test within OHDSI’s Patient-Level Prediction Framework. The current implementation allows us to perform research at scale on the value and limitations of Deep Learning using EHR data. For relatively small Target and Outcome cohorts, Deep Learning is most probably not the best choice.

-

Most current Deep Learning research is performed in python and we have developed a pipeline to interact with python. Multiple Deep Learning backends have been developed, e.g. Tensorflow, PyTorch, Keras (recently also available in R) etc. In the package we have implemented interaction with Keras in R and PyTorch in Python but we invite the community to add other backends.

-

Many network architectures have recently been proposed and we have implemented a number of them, however, this list will grow in the near future. It is important to understand that some of these architectures require a 2D data matrix, i.e. |patient|x|feature|, and others use a 3D data matrix |patient|x|feature|x|time|. The FeatureExtraction Package has been extended to enable the extraction of both data formats as will be described with examples below.

-

Note that training Deep Learning models is computationally intensive, our implementation therefore supports both GPU and CPU. It will automatically check whether there is GPU or not in your computer. A GPU is highly recommended for Deep Learning!

-
-
-

-Non-Temporal Architectures

-

We implemented the following non-temporal (2D data matrix) architectures using PyTorch:

-
1) Logistics regression (LRTorch)
-   A simple softmax layer with l2 regularization
-
-2) Feed forward network (MLPTorch) 
-   Supports multilayer perceptron (mlp_type = MLP) and 
-   Self-Normalizing Neural Networks (mlp_type = SNN)
-   Reference: https://arxiv.org/abs/1706.02515
-

For the above two methods, we implemented support for a stacked autoencoder and a variational autoencoder to reduce the feature dimension as a first step. These autoencoders learn efficient data encodings in an unsupervised manner by stacking multiple layers in a neural network. Compared to the standard implementations of LR and MLP these implementations can use the GPU power to speed up the gradient descent approach in the back propagation to optimize the weights of the classifier.

-

Table 1: Non-Temporal Deep Learning Models Hyper-Parameters

- ----- - - - - - - - - - - - - - - - - - -
NameDescriptionHyper-parameters
LRTorchLogistic Regression Modelw_decay (l2 regularization), epochs (number of epochs), class_weight (0 = inverse ratio between number of positive and negative examples, -1 = focal loss (https://arxiv.org/abs/1708.02002), or other), autoencoder (apply stacked autoencoder?, vae (apply variational autoencoder)
MLPTorchMulti-Layer Perceptron Modelmlp_type (MLP = default, SNN = self-normalizing neural network), size (number of hidden nodes), w_decay (l2 regularization), epochs (number of epochs), class_weight(0 = inverse ratio between number of positive and negative examples, -1 = focal loss, or other), autoencoder (apply stacked autoencoder), vae (apply variational autoencoder?)
-

##Example The approach for logistic regression (LRTorch) and the Multi-Layer Perceptron (MLPTorch) is identical. Here we will take LRTorch as an example.

-

You need to generate a population and plpData object as described in more detail in BuildingPredictiveModels vignette.

-

Alternatively, you can make use of the data simulator. The following code snippet creates a population of 12000 patients.

-
-set.seed(1234)
-data(plpDataSimulationProfile)
-sampleSize <- 12000
-plpData <- simulatePlpData(
-  plpDataSimulationProfile,
-  n = sampleSize
-)
-
-population <- createStudyPopulation(
-  plpData,
-  outcomeId = 2,
-  binary = TRUE,
-  firstExposureOnly = FALSE,
-  washoutPeriod = 0,
-  removeSubjectsWithPriorOutcome = FALSE,
-  priorOutcomeLookback = 99999,
-  requireTimeAtRisk = FALSE,
-  minTimeAtRisk = 0,
-  riskWindowStart = 0,
-  addExposureDaysToStart = FALSE,
-  riskWindowEnd = 365,
-  addExposureDaysToEnd = FALSE,
-  verbosity = "INFO"
-)
-

As an example we will build a LRTorch model. We could specify the stacked autoencoder or the variational autoencoder to be used for reducing the feature dimension as an initial layer, but for this example we do not.

-
-autoencoder <- FALSE
-vae <- FALSE
-

We added a class_weight for imbalanced data, the default value 0 is the inverse ratio between negatives and positives,-1 applies focal loss.

-
-class_weight <- 0
-
-# Specify the settings for Logistics regression model using Torch in Python
-model <- setLRTorch(autoencoder=autoencoder, vae=vae,  class_weight=class_weight)
-

No we define our modelling parameters.

-
-testFraction <- 0.2
-testSplit <- 'person'
-nfold <- 3
-splitSeed <- 1000
-

And we train and internally validate the model.

-
-results <- PatientLevelPrediction::runPlp(population = population, 
-                                          plpData = plpData, 
-                                          modelSettings = model,
-                                          testSplit=testSplit,
-                                          testFraction=testFraction,
-                                          nfold=nfold, 
-                                          splitSeed=splitSeed) 
-
-
-

-Temporal Architectures

-

Several architectures are implemented that can handle temporal data in PyTorch and R Keras.

-
-

-PyTorch CNN

-

We implemented the following convolutional models described in https://github.com/clinicalml/deepDiagnosis in CNNTorch:

-
    -
  1. -
    -Temporal Convolutional neural network over a backward window (type = cnn) -
    -
    - -
    -
  2. -
  3. -
    -Convolutional neural network over input and time dimension (type = mix) -
    -
    - -
    -
  4. -
  5. -
    -Multi-resolution temporal convolutional neural network (type = multi) -
    -
    - -
    -
  6. -
-

Furthermore, we added the following achitectures:

-
    -
  1. -
    -CNN with filters with three different parallel kernel sizes (3,4,5) and a fully connected layers (type = mlf) -
    -
    - -
    -
  2. -
  3. -
    -LSTM network over the backward window (type = lstm) -
    -
    - -
    -
  4. -
  5. -
    -Residual Learning Network as described in: https://arxiv.org/abs/1512.03385 (type = resnet) -
    -
    -This a very big network, see the paper for the topology. -
    -
  6. -
- ---- - - - - - - - - - - - - - - - - - - - - - - -
parameterdescription
nbfiltersThe number of convolution filters
epochsThe number of epochs
seedRandom seed
class_weightThe class weight used for imbalanced data
(0: Inverse ratio between positives and negatives, -1: Focal loss, or number)
-
-
-

-PyTorch RNN

-

The following recurrent neural network models are implemented in RNNTorch:

-
    -
  1. -
    -RNN with one LSTM layer fed into one fully connected layer (type = RNN) -
    -
    -
    - -
    - -
  2. -
  3. -
    -RNN with one bidirectional LSTM layer fed into one fully connected layer (type = BiRNN) -
    -
    -This network looks the same as above but then as a bi-directional version -
    -
  4. -
  5. -
    -One Gated Recurrent Unit layer fed into one fully connected layers (type = GRU) -
    -
    -This network looks the same as above but then implemented as GRU -
    -
  6. -
-

The following hyper-parameters can be set for these PyTorch models:

- ---- - - - - - - - - - - - - - - - - - - - - - - -
parameterdescription
hidden_sizeThe number of features in hidden state
epochsThe number of epochs
seedRandom seed
class_weightThe class weight used for imbalanced data
(0: Inverse ratio between positives and negatives, -1: Focal loss, or number)
-
-
-
-

-R Keras CNN

-

The following temporal architectures as described in https://arxiv.org/pdf/1608.00647.pdf were implemented using R Keras:

-
    -
  1. -
    -Multi-resolution CovNN model (CovNN.R) -
    -
    - -
    -
  2. -
  3. -
    -
    -Convolution across data and time according(CovNN2.R) -
    -
    - -
    -
    -
    -
  4. -
-

Furthermore, a custom build RNN is added that uses a variational autoencoder.

-
    -
  1. -
    -Clinically Informing application based on Recurrent Neural Network (CIReNN.R) -
    -
    - -
    -
  2. -
-

Table 2: Temporal Deep Learning Models

- ---- - - - - - - - - - - - - - - - - - - -
ModelHyper-parameters
CovNNbatchSize (The number of samples to used in each batch during model training), outcomeWeight (The weight assigned to the outcome), lr (The learning rate), decay (The decay of the learning rate), dropout ([currently not used] the dropout rate for regularization), epochs (The number of times data is used to train the model, e.g., epoches=1 means data only used once to train), filters (The number of columns output by each convolution), kernelSize (The number of time dimensions used for each convolution), loss (The loss function implemented), seed (The random seed)
CovNN2batchSize (The number of samples to used in each batch during model training), outcomeWeight (The weight assigned to the outcome), lr (The learning rate), decay (The decay of the learning rate), dropout ([currently not used] the dropout rate for regularization), epochs (The number of times data is used to train the model, e.g., epoches=1 means data only used once to train), filters (The number of columns output by each convolution), kernelSize (The number of time dimensions used for each convolution), loss (The loss function implemented), seed (The random seed)
CIReNNunits (The number of units of RNN layer - as a list of vectors), recurrentDropout (The reccurrent dropout rate), layerDropout (The layer dropout rate), lr (Learning rate), decay (Learning rate decay over each update), outcomeWeight (The weight of the outcome class in the loss function), batchSize (The number of data points to use per training batch), epochs (Number of times to iterate over data set), earlyStoppingMinDelta (Minimum change in the monitored quantity to qualify as an improvement for early stopping, i.e. an absolute change of less than min_delta in loss of validation data, will count as no improvement), earlyStoppingPatience (Number of epochs with no improvement after which training will be stopped), seed (Random seed used by Deep Learning model)
-
-
-

-Example

-

We will now show how to use the temporal models by using CNNTorch as an example.

-

You need to generate a population and plpData object as described in more detail in BuildingPredictiveModels vignette.

-

Note that for these algorithms you need to extracted temporal data as described in the [FeatureExtraction vignette] (https://github.com/OHDSI/FeatureExtraction/blob/master/inst/doc/UsingFeatureExtraction.pdf) as follows:

-
-settings <- createTemporalCovariateSettings(useConditionEraStart = FALSE,
-                                            useConditionEraOverlap = FALSE,
-                                            useConditionOccurrence = FALSE,
-                                            useConditionEraGroupStart = FALSE,
-                                            useConditionEraGroupOverlap = FALSE,
-                                            useDrugExposure = FALSE,
-                                            useDrugEraStart = FALSE,
-                                            useDrugEraOverlap = FALSE,
-                                            useMeasurement = FALSE,
-                                            useMeasurementValue = TRUE,
-                                            useMeasurementRangeGroup = FALSE,
-                                            useProcedureOccurrence = FALSE,
-                                            useDeviceExposure = FALSE,
-                                            useObservation = FALSE,
-                                            excludedCovariateConceptIds = c(316866),
-                                            addDescendantsToExclude = TRUE,
-                                            temporalStartDays = seq(from = -365, 
-                                                                    to = -1, by = 12), 
-                                            temporalEndDays = c(seq(from = -353, 
-                                                                    to = 0, by = 12), 0))
-
-plpData <- getPlpData(connectionDetails = connectionDetails,
-                        cdmDatabaseSchema = cdmDatabaseSchema,
-                        cohortDatabaseSchema = "results",
-                        cohortTable = "cohort",
-                        cohortId = 11,
-                        covariateSettings = settings,
-                        outcomeDatabaseSchema = resultsDatabaseSchema,
-                        outcomeTable = "cohort",
-                        outcomeIds = 25,
-                        cdmVersion = 5)
-

Each CNN/RNN has several hyper-parameters that can be set as shown in the Tables above, but for this example we take the defaults.

-
-# specify the the CNN
-model <- setCNNTorch(cnn_type='CNN')
-

Run the model training, for example with a testFraction = 0.2 and a split by person:

-
-results <- PatientLevelPrediction::runPlp(population, plpData, model,
-                                          testSplit='person',
-                                          testFraction=0.2,
-                                          nfold=3, 
-                                          splitSeed=1000) 
-
-
-
-

-Apply the trained Deep Learning model

-

Applying a Deep Learning is identical to the other models in the package:

-
-# load the trained model
-plpModel <- loadPlpModel(getwd(), "<your model>")
-
-# load the new plpData (should have the same temporal features!) and create the population
-plpData <- loadPlpData(getwd(), "<your data>")
-
-populationSettings <- plpModel$populationSettings
-populationSettings$plpData <- plpData
-population <- do.call(createStudyPopulation, populationSettings)  
-
-# apply the trained model on the new data
-validationResults <- applyModel(population, plpData, plpModel)
-
-
-

-Adding new architectures

-

It is possible to add new architectures in our framework using PyTorch or R Keras. We are happy to help you with this, please post your questions on the issue tracker of the package.

-
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-
-
- - - -
- - - -
- -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - diff --git a/docs/articles/BuildingDeepLearningModels_files/header-attrs-2.7/header-attrs.js b/docs/articles/BuildingDeepLearningModels_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/BuildingDeepLearningModels_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/BuildingEnsembleModels.html b/docs/articles/BuildingEnsembleModels.html deleted file mode 100644 index 8cbfd2eb8..000000000 --- a/docs/articles/BuildingEnsembleModels.html +++ /dev/null @@ -1,316 +0,0 @@ - - - - - - - -Building Ensemble Models • PatientLevelPrediction - - - - - - - - - - -
-
- - - - -
-
- - - - - -
-

-Introduction

-

Ensemble models combine several models to improve the overall performance. Traditionally, weak learners were combined to boost performance but recent results show that combining several strong approaches can also result in a better performance. There are many examples in literature where ensemble models outperform individual models using stacking, i.e. a final logistic regresssion layer accross the individual model outputs, but other approaches like weigthing has also shown promising results.

-

This vignette describes how you can use the Observational Health Data Sciencs and Informatics (OHDSI) PatientLevelPrediction package to build ensemble models. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the BuildingPredictiveModels vignette.

-

This will enable studying ensemble methods at scale in the OHDSI data network.

-
-

Ensemble model

-
-

In PatientLevelPrediction package, four ensemble strategies have been implemented:

-
    -
  1. average ensemble: Calculate the average probability from individual models
  2. -
  3. product ensemble: Calculate the product of probabilites from individual models.
  4. -
  5. weighted ensemble: Calculate the weighted average probability from individual models using train AUC as weights.
  6. -
  7. stacked ensemble: Train a logistics regression on outputs from individual models
  8. -
-
-
-

-Usage

-

Use the PatientLevelPrediction package to generate a population and plpData object. Alternatively, you can make use of the data simulator. The following code snippet creates a population of 12000 patients.

-
-data(plpDataSimulationProfile)
-set.seed(1234)
-sampleSize <- 2000
-plpData <- simulatePlpData(
-  plpDataSimulationProfile,
-  n = sampleSize
-)
-
-population <- createStudyPopulation(
-  plpData,
-  outcomeId = 2,
-  binary = TRUE,
-  firstExposureOnly = FALSE,
-  washoutPeriod = 0,
-  removeSubjectsWithPriorOutcome = FALSE,
-  priorOutcomeLookback = 99999,
-  requireTimeAtRisk = FALSE,
-  minTimeAtRisk = 0,
-  riskWindowStart = 0,
-  addExposureDaysToStart = FALSE,
-  riskWindowEnd = 365,
-  addExposureDaysToEnd = FALSE,
-  verbosity = "INFO"
-)
-

Specify the prediction algorithms to be combined.

-
-# Use LASSO logistic regression and Random Forest as base predictors
-model1 <- setLassoLogisticRegression()
-model2 <- setRandomForest()
-

Specify a test fraction and a sequence of training set fractions.

-
-testFraction <- 0.2
-

Specify an ensembleStrategy to combine multiple predictors. The strategy used for ensembling the outputs from different models, it can be ‘mean’, ‘product’, ‘weighted’ and ‘stacked’: ‘mean’ the average probability from differnt models ‘product’ the product rule ‘weighted’ the weighted average probability from different models using train AUC as weights. ‘stacked’ the stakced ensemble trains a logistics regression on different models.

-
-ensembleStrategy <- 'stacked'
-

Specify the test split to be used.

-
-# Use a split by person, alterantively a time split is possible
-testSplit <- 'person'
-

Run the ensemble learning to combine model1 and model2. You can also use different plpData for different models.

-
-ensembleResults <- PatientLevelPrediction::runEnsembleModel(population, 
-                                   dataList = list(plpData, plpData), 
-                                   modelList = list(model1, model2),
-                                   testSplit=testSplit,
-                                   testFraction=testFraction,
-                                   nfold=3, splitSeed=1000, 
-                                   ensembleStrategy = ensembleStrategy) 
-
-

-Saving and loading the ensemble model

-

You can save and load the model using:

-
-saveEnsemblePlpModel(ensembleResults$model, dirPath = file.path(getwd(), "model"))
-ensembleModel <- loadEnsemblePlpModel(getwd(), "model")
-
-
-
-

-Apply Ensemble model

-
-plpData <- loadPlpData("<data file>")
-populationSettings <- ensembleModel$populationSettings
-populationSettings$plpData <- plpData
-population <- do.call(createStudyPopulation, populationSettings)
-

Load the model.

-
-ensembleModel <- loadEnsemblePlpModel("<model folder>")
-

Get the predictions by applying the model:

-
-prediction <- applyEnsembleModel(population,
-                                  dataList = list(plpData, plpData),
-                                  ensembleModel = ensembleModel)$prediction
-
-
-

-Demo

-

We have added a demo of the ensemble training:

-
-# Show all demos in our package: 
- demo(package = "PatientLevelPrediction")
-
-# Run the learning curve
- demo("EnsembleModelDemo", package = "PatientLevelPrediction")
-
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-
-
- - - -
- - - -
- -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - diff --git a/docs/articles/BuildingEnsembleModels_files/header-attrs-2.11/header-attrs.js b/docs/articles/BuildingEnsembleModels_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/BuildingEnsembleModels_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/BuildingEnsembleModels_files/header-attrs-2.7/header-attrs.js b/docs/articles/BuildingEnsembleModels_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/BuildingEnsembleModels_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/BuildingMultiplePredictiveModels.html b/docs/articles/BuildingMultiplePredictiveModels.html index e8e402763..45d404d03 100644 --- a/docs/articles/BuildingMultiplePredictiveModels.html +++ b/docs/articles/BuildingMultiplePredictiveModels.html @@ -19,6 +19,8 @@ + +
+
@@ -146,20 +162,39 @@

2022-03-09

-
-

-Introduction

-

In our paper, we propose a standardised framework for patient-level prediction that utilizes the OMOP CDM and standardized vocabularies, and describe the open-source software that we developed implementing the framework’s pipeline. The framework is the first to enforce existing best practice guidelines and will enable open dissemination of models that can be extensively validated across the network of OHDSI collaborators.

-

One our best practices is that we see the selection of models and all study setting as an emperical question, i.e. we should use a data-driven approach in which we try many settings. This vignette describes how you can use the Observational Health Data Sciencs and Informatics (OHDSI) PatientLevelPrediction package to automatically build multiple patient-level predictive models, e.g. different population settings, covariate settings, and modelsetting. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the BuildingPredictiveModels vignette.

-

Note that it is also possible to generate a Study Package directly in Atlas that allows for multiple patient-level prediction analyses this is out-of-scope for this vignette.

+
+

Introduction +

+

In our paper, +we propose a standardised framework for patient-level prediction that +utilizes the OMOP CDM and standardized vocabularies, and describe the +open-source software that we developed implementing the framework’s +pipeline. The framework is the first to enforce existing best practice +guidelines and will enable open dissemination of models that can be +extensively validated across the network of OHDSI collaborators.

+

One our best practices is that we see the selection of models and all +study setting as an emperical question, i.e. we should use a data-driven +approach in which we try many settings. This vignette describes how you +can use the Observational Health Data Sciencs and Informatics (OHDSI) PatientLevelPrediction +package to automatically build multiple patient-level predictive models, +e.g. different population settings, covariate settings, and +modelsetting. This vignette assumes you have read and are comfortable +with building single patient level prediction models as described in the +BuildingPredictiveModels +vignette.

+

Note that it is also possible to generate a Study Package directly in +Atlas that allows for multiple patient-level prediction analyses this is +out-of-scope for this vignette.

-
-

-Creating a model design

-

The first step is to specify each model you wish to develop by using the createModelDesign function. This function requires the following:

+
+

Creating a model design +

+

The first step is to specify each model you wish to develop by using +the createModelDesign function. This function requires the +following:

@@ -181,275 +216,322 @@

- + - + - + - + - + - + - +
The inputs for the model design
restrictPlpDataSettingsThe settings used to restrict the target population, created with createRestrictPlpDataSettings()The settings used to restrict the target population, +created with createRestrictPlpDataSettings()
populationSettingsThe settings used to restrict the target population and create the outcome labels, created with createStudyPopulationSettings()The settings used to restrict the target population and +create the outcome labels, created with +createStudyPopulationSettings()
covariateSettingsThe settings used to define the covariates, created with FeatureExtraction::createDefaultCovariateSettings()The settings used to define the covariates, created +with FeatureExtraction::createDefaultCovariateSettings()
sampleSettingsThe settings used to define any under/over sampling, created with createSampleSettings()The settings used to define any under/over sampling, +created with createSampleSettings()
featureEngineeringSettingsThe settings used to define any feature engineering, created with createFeatureEngineeringSettings()The settings used to define any feature engineering, +created with createFeatureEngineeringSettings()
preprocessSettingsThe settings used to define any preprocessing, created with createPreprocessSettings()The settings used to define any preprocessing, created +with createPreprocessSettings()
modelSettingsThe settings used to define the model fitting settings, such as setLassoLogisticRegression()The settings used to define the model fitting settings, +such as setLassoLogisticRegression()
-
-

-Model design example 1

-

For example, if we wanted to predict the outcome (id 2) occuring for the first time within 180 days of the the target population index date (id 1). We are only interested in index dates betwrrn 2018-2020. Finally, we only want to use age, gender in 5 year buckets and conditions as features. The model can be specified by:

+
+

Model design example 1 +

+

For example, if we wanted to predict the outcome (id 2) occuring for +the first time within 180 days of the the target population index date +(id 1). We are only interested in index dates betwrrn 2018-2020. +Finally, we only want to use age, gender in 5 year buckets and +conditions as features. The model can be specified by:

-# Model 1 is only using data between 2018-2020:
-restrictPlpDataSettings <- createRestrictPlpDataSettings(
-  studyStartDate = '20180101', 
-  studyEndDate = '20191231'
-  )
-
-# predict outcome within 1 to 180 days after index
-# remove people with outcome prior and with < 365 days observation
-populationSettings <- createStudyPopulationSettings(
-  binary = T, 
-  firstExposureOnly = T, 
-  washoutPeriod = 365, 
-  removeSubjectsWithPriorOutcome = T,
-  priorOutcomeLookback = 9999,
-  requireTimeAtRisk = F, 
-  riskWindowStart = 1, 
-  riskWindowEnd = 180
-)
-
-# use age/gender in groups and condition groups as features
-covariateSettings <- FeatureExtraction::createCovariateSettings(
-  useDemographicsGender = T, 
-  useDemographicsAgeGroup = T, 
-  useConditionGroupEraAnyTimePrior = T
-)
-
-modelDesign1 <- createModelDesign(
-  targetId = 1, 
-  outcomeId = 2, 
-  restrictPlpDataSettings = restrictPlpDataSettings, 
-  populationSettings = populationSettings, 
-  covariateSettings = covariateSettings, 
-  featureEngineeringSettings = createFeatureEngineeringSettings(),
-  sampleSettings = createSampleSettings(), 
-  preprocessSettings = createPreprocessSettings(), 
-  modelSettings = setLassoLogisticRegression()
-  )
+# Model 1 is only using data between 2018-2020: +restrictPlpDataSettings <- createRestrictPlpDataSettings( + studyStartDate = '20180101', + studyEndDate = '20191231' + ) + +# predict outcome within 1 to 180 days after index +# remove people with outcome prior and with < 365 days observation +populationSettings <- createStudyPopulationSettings( + binary = T, + firstExposureOnly = T, + washoutPeriod = 365, + removeSubjectsWithPriorOutcome = T, + priorOutcomeLookback = 9999, + requireTimeAtRisk = F, + riskWindowStart = 1, + riskWindowEnd = 180 +) + +# use age/gender in groups and condition groups as features +covariateSettings <- FeatureExtraction::createCovariateSettings( + useDemographicsGender = T, + useDemographicsAgeGroup = T, + useConditionGroupEraAnyTimePrior = T +) + +modelDesign1 <- createModelDesign( + targetId = 1, + outcomeId = 2, + restrictPlpDataSettings = restrictPlpDataSettings, + populationSettings = populationSettings, + covariateSettings = covariateSettings, + featureEngineeringSettings = createFeatureEngineeringSettings(), + sampleSettings = createSampleSettings(), + splitSettings = createDefaultSplitSetting(), + preprocessSettings = createPreprocessSettings(), + modelSettings = setLassoLogisticRegression() + )
-
-

-Model design example 2

-

For the second example, we want to predict the outcome (id 2) occuring for the first time within 730 days of the the target population index date (id 1). We want to train a random forest classifier. Finally, we want to use age, gender in 5 year buckets, drug ingredients (and groups) and conditions as features. The model can be specified by:

+
+

Model design example 2 +

+

For the second example, we want to predict the outcome (id 2) +occuring for the first time within 730 days of the the target population +index date (id 1). We want to train a random forest classifier. Finally, +we want to use age, gender in 5 year buckets, drug ingredients (and +groups) and conditions as features. The model can be specified by:

-# Model 2 has no restrictions when extracting data
-restrictPlpDataSettings <- createRestrictPlpDataSettings(
-  )
-
-# predict outcome within 1 to 730 days after index
-# remove people with outcome prior and with < 365 days observation
-populationSettings <- createStudyPopulationSettings(
-  binary = T, 
-  firstExposureOnly = T, 
-  washoutPeriod = 365, 
-  removeSubjectsWithPriorOutcome = T,
-  priorOutcomeLookback = 9999,
-  requireTimeAtRisk = F, 
-  riskWindowStart = 1, 
-  riskWindowEnd = 730
-)
-
-# use age/gender in groups and condition/drug groups as features
-covariateSettings <- FeatureExtraction::createCovariateSettings(
-  useDemographicsGender = T, 
-  useDemographicsAgeGroup = T, 
-  useConditionGroupEraAnyTimePrior = T, 
-  useDrugGroupEraAnyTimePrior = T 
-)
-
-modelDesign2 <- createModelDesign(
-  targetId = 1, 
-  outcomeId = 2, 
-  restrictPlpDataSettings = restrictPlpDataSettings, 
-  populationSettings = populationSettings, 
-  covariateSettings = covariateSettings, 
-  featureEngineeringSettings = createRandomForestFeatureSelection(ntrees = 500, maxDepth = 7),
-  sampleSettings = createSampleSettings(), 
-  preprocessSettings = createPreprocessSettings(), 
-  modelSettings = setRandomForest()
-  )
+# Model 2 has no restrictions when extracting data +restrictPlpDataSettings <- createRestrictPlpDataSettings( + ) + +# predict outcome within 1 to 730 days after index +# remove people with outcome prior and with < 365 days observation +populationSettings <- createStudyPopulationSettings( + binary = T, + firstExposureOnly = T, + washoutPeriod = 365, + removeSubjectsWithPriorOutcome = T, + priorOutcomeLookback = 9999, + requireTimeAtRisk = F, + riskWindowStart = 1, + riskWindowEnd = 730 +) + +# use age/gender in groups and condition/drug groups as features +covariateSettings <- FeatureExtraction::createCovariateSettings( + useDemographicsGender = T, + useDemographicsAgeGroup = T, + useConditionGroupEraAnyTimePrior = T, + useDrugGroupEraAnyTimePrior = T +) + +modelDesign2 <- createModelDesign( + targetId = 1, + outcomeId = 2, + restrictPlpDataSettings = restrictPlpDataSettings, + populationSettings = populationSettings, + covariateSettings = covariateSettings, + featureEngineeringSettings = createRandomForestFeatureSelection(ntrees = 500, maxDepth = 7), + sampleSettings = createSampleSettings(), + splitSettings = createDefaultSplitSetting(), + preprocessSettings = createPreprocessSettings(), + modelSettings = setRandomForest() + )
-
-

-Model design example 3

-

For the third example, we want to predict the outcome (id 5) occuring during the cohort exposure of the the target population (id 1). We want to train a gradient boosting machine. Finally, we want to use age, gender in 5 year buckets and indications of measurements taken as features. The model can be specified by:

+
+

Model design example 3 +

+

For the third example, we want to predict the outcome (id 5) occuring +during the cohort exposure of the the target population (id 1). We want +to train a gradient boosting machine. Finally, we want to use age, +gender in 5 year buckets and indications of measurements taken as +features. The model can be specified by:

-# Model 3 has no restrictions when extracting data
-restrictPlpDataSettings <- createRestrictPlpDataSettings(
-  )
-
-# predict outcome during target cohort start/end 
-# remove people with  < 365 days observation
-populationSettings <- createStudyPopulationSettings(
-  binary = T, 
-  firstExposureOnly = T, 
-  washoutPeriod = 365, 
-  removeSubjectsWithPriorOutcome = F,
-  requireTimeAtRisk = F, 
-  riskWindowStart = 0,
-  startAnchor =  'cohort start',
-  riskWindowEnd = 0, 
-  endAnchor = 'cohort end'
-)
-
-# use age/gender in groups and measurement indicators as features
-covariateSettings <- FeatureExtraction::createCovariateSettings(
-  useDemographicsGender = T, 
-  useDemographicsAgeGroup = T, 
-  useMeasurementAnyTimePrior = T,
-  endDays = -1
-)
-
-modelDesign3 <- createModelDesign(
-  targetId = 1, 
-  outcomeId = 5, 
-  restrictPlpDataSettings = restrictPlpDataSettings, 
-  populationSettings = populationSettings, 
-  covariateSettings = covariateSettings, 
-  featureEngineeringSettings = createFeatureEngineeringSettings(),
-  sampleSettings = createSampleSettings(), 
-  preprocessSettings = createPreprocessSettings(), 
-  modelSettings = setGradientBoostingMachine()
-  )
+# Model 3 has no restrictions when extracting data +restrictPlpDataSettings <- createRestrictPlpDataSettings( + ) + +# predict outcome during target cohort start/end +# remove people with < 365 days observation +populationSettings <- createStudyPopulationSettings( + binary = T, + firstExposureOnly = T, + washoutPeriod = 365, + removeSubjectsWithPriorOutcome = F, + requireTimeAtRisk = F, + riskWindowStart = 0, + startAnchor = 'cohort start', + riskWindowEnd = 0, + endAnchor = 'cohort end' +) + +# use age/gender in groups and measurement indicators as features +covariateSettings <- FeatureExtraction::createCovariateSettings( + useDemographicsGender = T, + useDemographicsAgeGroup = T, + useMeasurementAnyTimePrior = T, + endDays = -1 +) + +modelDesign3 <- createModelDesign( + targetId = 1, + outcomeId = 5, + restrictPlpDataSettings = restrictPlpDataSettings, + populationSettings = populationSettings, + covariateSettings = covariateSettings, + featureEngineeringSettings = createFeatureEngineeringSettings(), + sampleSettings = createSampleSettings(), + splitSettings = createDefaultSplitSetting(), + preprocessSettings = createPreprocessSettings(), + modelSettings = setGradientBoostingMachine() + )
-
-

-Running multiple models

-

As we will be downloading loads of data in the multiple plp analysis it is useful to set the Andromeda temp folder to a directory with write access and plenty of space. options(andromedaTempFolder = "c:/andromedaTemp")

+
+

Running multiple models +

+

As we will be downloading loads of data in the multiple plp analysis +it is useful to set the Andromeda temp folder to a directory with write +access and plenty of space. +options(andromedaTempFolder = "c:/andromedaTemp")

To run the study requires setting up a connectionDetails object

-dbms <- "your dbms"
-user <- "your username"
-pw <- "your password"
-server <- "your server"
-port <- "your port"
-
-connectionDetails <- DatabaseConnector::createConnectionDetails(dbms = dbms,
-                                                                server = server,
-                                                                user = user,
-                                                                password = pw,
-                                                                port = port)
-

Next you need to specify the cdmDatabaseSchema where your cdm database is found and workDatabaseSchema where your target population and outcome cohorts are and you need to specify a label for the database name: a string with a shareable name of the database (this will be shown to OHDSI researchers if the results get transported).

-
cdmDatabaseSchema <- "your cdmDatabaseSchema"
-workDatabaseSchema <- "your workDatabaseSchema"
-cdmDatabaseName <- "your cdmDatabaseName"
-cohortTable <- "your cohort table",
-
-databaseDetails <- createDatabaseDetails(
-  connectionDetails = connectionDetails, 
-  cdmDatabaseSchema = cdmDatabaseSchema, 
-  cdmDatabaseName = cdmDatabaseName , 
-  cohortDatabaseSchema = workDatabaseSchema, 
-  cohortTable = cohortTable, 
-  outcomeDatabaseSchema = workDatabaseSchema, 
-  outcomeTable = cohortTable 
-  cdmVersion = 5
-    )
+dbms <- "your dbms" +user <- "your username" +pw <- "your password" +server <- "your server" +port <- "your port" + +connectionDetails <- DatabaseConnector::createConnectionDetails(dbms = dbms, + server = server, + user = user, + password = pw, + port = port)
+

Next you need to specify the cdmDatabaseSchema where your cdm +database is found and workDatabaseSchema where your target population +and outcome cohorts are and you need to specify a label for the database +name: a string with a shareable name of the database (this will be shown +to OHDSI researchers if the results get transported).

+
cdmDatabaseSchema <- "your cdmDatabaseSchema"
+workDatabaseSchema <- "your workDatabaseSchema"
+cdmDatabaseName <- "your cdmDatabaseName"
+cohortTable <- "your cohort table",
+
+databaseDetails <- createDatabaseDetails(
+  connectionDetails = connectionDetails, 
+  cdmDatabaseSchema = cdmDatabaseSchema, 
+  cdmDatabaseName = cdmDatabaseName , 
+  cohortDatabaseSchema = workDatabaseSchema, 
+  cohortTable = cohortTable, 
+  outcomeDatabaseSchema = workDatabaseSchema, 
+  outcomeTable = cohortTable, 
+  cdmVersion = 5
+    )

Now you can run the multiple patient-level prediction analysis:

-results <- runMultiplePlp(
-  databaseDetails = databaseDetails, 
-  modelDesignList = list(
-    modelDesign1, 
-    modelDesign2, 
-    modelDesign3
-    ), 
-  onlyFetchData = F, 
-  splitSettings = createDefaultSplitSetting(), 
-  logSettings = createLogSettings(), 
-  saveDirectory =  "./PlpMultiOutput"
-  )
-

This will then save all the plpData objects from the study into “./PlpMultiOutput/plpData_T1_L” and the results into “./PlpMultiOutput/Analysis_”. The csv file named settings.csv found in “./PlpMultiOutput” has a row for each prediction model developed and points to the plpData and settings used for the model development, it also has descriptions of the cohorts if these are input by the user.

-

Note that if for some reason the run is interrupted, e.g. because of an error, a new call to runMultiplePlp will continue and not restart until you remove the output folder.

+results <- runMultiplePlp( + databaseDetails = databaseDetails, + modelDesignList = list( + modelDesign1, + modelDesign2, + modelDesign3 + ), + onlyFetchData = F, + logSettings = createLogSettings(), + saveDirectory = "./PlpMultiOutput" + )
+

This will then save all the plpData objects from the study into +“./PlpMultiOutput/plpData_T1_L” and the results into +“./PlpMultiOutput/Analysis_”. The csv file named settings.csv found +in “./PlpMultiOutput” has a row for each prediction model developed and +points to the plpData and settings used for the model development, it +also has descriptions of the cohorts if these are input by the user.

+

Note that if for some reason the run is interrupted, e.g. because of +an error, a new call to runMultiplePlp will continue and +not restart until you remove the output folder.

-
-

-Validating multiple models

-

If you have access to multiple databases on the same server in different schemas you could evaluate accross these using this call:

+
+

Validating multiple models +

+

If you have access to multiple databases on the same server in +different schemas you could evaluate across these using this call:

-validationDatabaseDetails <- createDatabaseDetails(
-  connectionDetails = connectionDetails, 
-  cdmDatabaseSchema = 'new cdm schema', 
-  cdmDatabaseName = 'validation database', 
-  cohortDatabaseSchema = workDatabaseSchema, 
-  cohortTable = cohortTable, 
-  outcomeDatabaseSchema = workDatabaseSchema, 
-  outcomeTable = cohortTable, 
-  cdmVersion = 5
-  )
-
-val <- validateMultiplePlp(
-  analysesLocation = "./PlpMultiOutput",
-  valdiationDatabaseDetails = validationDatabaseDetails,
-  validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
-  recalibrate = NULL,
-  saveDirectory = "./PlpMultiOutput/validation"
-  )
-

This then saves the external validation results in the validation folder of the main study (the outputLocation you used in runPlpAnalyses).

+validationDatabaseDetails <- createDatabaseDetails( + connectionDetails = connectionDetails, + cdmDatabaseSchema = 'new cdm schema', + cdmDatabaseName = 'validation database', + cohortDatabaseSchema = workDatabaseSchema, + cohortTable = cohortTable, + outcomeDatabaseSchema = workDatabaseSchema, + outcomeTable = cohortTable, + cdmVersion = 5 + ) + +val <- validateMultiplePlp( + analysesLocation = "./PlpMultiOutput", + valdiationDatabaseDetails = validationDatabaseDetails, + validationRestrictPlpDataSettings = createRestrictPlpDataSettings(), + recalibrate = NULL, + saveDirectory = "./PlpMultiOutput/Validation" + )
+

This then saves the external validation results in the +Validation folder of the main study (the outputLocation you +used in runPlpAnalyses).

-
-

-Viewing the results

+
+

Viewing the results +

To view the results for the multiple prediction analysis:

-viewMultiplePlp(analysesLocation="./PlpMultiOutput")
-

If the validation directory in “./PlpMultiOutput” has results, the external validation will also be displayed.

+viewMultiplePlp(analysesLocation="./PlpMultiOutput")
+

If the validation directory in “./PlpMultiOutput” has a sqlite +results database, the external validation will also be displayed.

-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

+
+

Acknowledgments +

+

Considerable work has been dedicated to provide the +PatientLevelPrediction package.

-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

+citation("PatientLevelPrediction")
+
## 
+## To cite PatientLevelPrediction in publications use:
+## 
+##   Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
+##   and implementation of a standardized framework to generate and
+##   evaluate patient-level prediction models using observational
+##   healthcare data." _Journal of the American Medical Informatics
+##   Association_, *25*(8), 969-975.
+##   <https://doi.org/10.1093/jamia/ocy032>.
+## 
+## A BibTeX entry for LaTeX users is
+## 
+##   @Article{,
+##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
+##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
+##     journal = {Journal of the American Medical Informatics Association},
+##     volume = {25},
+##     number = {8},
+##     pages = {969-975},
+##     year = {2018},
+##     url = {https://doi.org/10.1093/jamia/ocy032},
+##   }
+

Please reference this paper if you use the PLP Package in +your work:

+

Reps JM, Schuemie +MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a +standardized framework to generate and evaluate patient-level prediction +models using observational healthcare data. J Am Med Inform Assoc. +2018;25(8):969-975.

@@ -464,11 +546,13 @@

-

Site built with pkgdown 1.6.1.

+

+

Site built with pkgdown 2.0.7.

@@ -477,5 +561,7 @@

+ + diff --git a/docs/articles/BuildingMultiplePredictiveModels_files/header-attrs-2.11/header-attrs.js b/docs/articles/BuildingMultiplePredictiveModels_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/BuildingMultiplePredictiveModels_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/BuildingMultiplePredictiveModels_files/header-attrs-2.7/header-attrs.js b/docs/articles/BuildingMultiplePredictiveModels_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/BuildingMultiplePredictiveModels_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/BuildingPredictiveModels.html b/docs/articles/BuildingPredictiveModels.html index 1cc607322..4e13c99f2 100644 --- a/docs/articles/BuildingPredictiveModels.html +++ b/docs/articles/BuildingPredictiveModels.html @@ -19,6 +19,8 @@ + +
+
@@ -146,65 +161,224 @@

2022-03-09

-
-

-Introduction

-

Observational healthcare data, such as administrative claims and electronic health records, are increasingly used for clinical characterization of disease progression, quality improvement, and population-level effect estimation for medical product safety surveillance and comparative effectiveness. Advances in machine learning for large dataset analysis have led to increased interest in applying patient-level prediction on this type of data. Patient-level prediction offers the potential for medical practice to move beyond average treatment effects and to consider personalized risks as part of clinical decision-making. However, many published efforts in patient-level-prediction do not follow the model development guidelines, fail to perform extensive external validation, or provide insufficient model details that limits the ability of independent researchers to reproduce the models and perform external validation. This makes it hard to fairly evaluate the predictive performance of the models and reduces the likelihood of the model being used appropriately in clinical practice. To improve standards, several papers have been written detailing guidelines for best practices in developing and reporting prediction models.

-

The Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis (TRIPOD) statement provides clear recommendations for reporting prediction model development and validation and addresses some of the concerns related to transparency. However, data structure heterogeneity and inconsistent terminologies still make collaboration and model sharing difficult as different researchers are often required to write new code to extract the data from their databases and may define variables differently.

-

In our paper, we propose a standardised framework for patient-level prediction that utilizes the OMOP Common Data Model (CDM) and standardized vocabularies, and describe the open-source software that we developed implementing the framework’s pipeline. The framework is the first to support existing best practice guidelines and will enable open dissemination of models that can be extensively validated across the network of OHDSI collaborators.

-

Figure 1, illustrates the prediction problem we address. Among a population at risk, we aim to predict which patients at a defined moment in time (t = 0) will experience some outcome during a time-at-risk. Prediction is done using only information about the patients in an observation window prior to that moment in time.

-
-

The prediction problem

-
-

As shown in Figure 2, to define a prediction problem we have to define t=0 by a Target Cohort (T), the outcome we like to predict by an outcome cohort (O), and the time-at-risk (TAR). Furthermore, we have to make design choices for the model we like to develop, and determine the observational datasets to perform internal and external validation. This conceptual framework works for all type of prediction problems, for example those presented in Figure 3.

-
-

Design choices

-
-
-

Examples of prediction problems

-
-

This vignette describes how you can use the PatientLevelPrediction package to build patient-level predictive models. The package enables data extraction, model building, and model evaluation using data from databases that are translated into the OMOP CDM. In this vignette we assume you have installed the package correctly using the InstallationGuide.

-
-
-

-Study specification

-

We have to clearly specify our study upfront to be able to implement it. This means we need to define the prediction problem we like to address, in which population we will build the model, which model we will build and how we will evaluate its performance. To guide you through this process we will use a “Disease onset and progression” prediction type as an example.

-
-

-Problem definition 1: Stroke in afibrilation patients

-

Atrial fibrillation is a disease characterized by an irregular heart rate that can cause poor blood flow. Patients with atrial fibrillation are at increased risk of ischemic stroke. Anticoagulation is a recommended prophylaxis treatment strategy for patients at high risk of stroke, though the underuse of anticoagulants and persistent severity of ischemic stroke represents a substantial unmet medical need. Various strategies have been developed to predict risk of ischemic stroke in patients with atrial fibrillation. CHADS2 (Gage JAMA 2001) was developed as a risk score based on history of congestive heart failure, hypertension, age>=75, diabetes and stroke. CHADS2 was initially derived using Medicare claims data, where it achieved good discrimination (AUC=0.82). However, subsequent external validation studies revealed the CHADS2 had substantially lower predictive accuracy (Keogh Thromb Haemost 2011). Subsequent stroke risk calculators have been developed and evaluated, including the extension of CHADS2Vasc. The management of atrial fibrillation has evolved substantially over the last decade, for various reasons that include the introduction of novel oral anticoagulants. With these innovations has come a renewed interest in greater precision medicine for stroke prevention.

-

We will apply the PatientLevelPrediction package to observational healthcare data to address the following patient-level prediction question:

-

Amongst patients who are newly diagnosed with Atrial Fibrillation, which patients will go on to have Ischemic Stroke within 1 year?

-

We will define ‘patients who are newly diagnosed with Atrial Fibrillation’ as the first condition record of cardiac arrhythmia, which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias. We will define ‘Ischemic stroke events’ as ischemic stroke condition records during an inpatient or ER visit; successive records with > 180 day gap are considered independent episodes.

-
-
-

-Problem definition 2: Angioedema in ACE inhibitor users

-

Angiotensin converting enzyme inhibitors (ACE inhibitors) are medications used by patients with hypertension that widen the blood vessles and therefore increse the amount of blood pumped by the heart and decreases blood pressure. Ace inhibitors reduce a patients risk of cardiovasular disease but can lead to drug-induced angioedema.

-

We will apply the PatientLevelPrediction package to observational healthcare data to address the following patient-level prediction question:

-

Amongst patients who are newly dispensed an ACE inhibitor, which patients will go on to have angioedema within 1 year?

-

We will define ‘patients who are newly dispensed an ACE inhibitor’ as the first drug record of sny ACE inhibitor, […]which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias. We will define ‘angioedema’ as an angioedema condition record.

-
-
-

-Study population definition

-

The final study population in which we will develop our model is often a subset of the Target population, because we will e.g. apply criteria that are dependent on T and O or we want to do sensitivity analyses with subpopulations of T. For this we have to answer the following questions:

+
+

Introduction +

+

Observational healthcare data, such as administrative claims and +electronic health records, are increasingly used for clinical +characterization of disease progression, quality improvement, and +population-level effect estimation for medical product safety +surveillance and comparative effectiveness. Advances in machine learning +for large dataset analysis have led to increased interest in applying +patient-level prediction on this type of data. Patient-level prediction +offers the potential for medical practice to move beyond average +treatment effects and to consider personalized risks as part of clinical +decision-making. However, many published efforts in +patient-level-prediction do not follow the model development guidelines, +fail to perform extensive external validation, or provide insufficient +model details that limits the ability of independent researchers to +reproduce the models and perform external validation. This makes it hard +to fairly evaluate the predictive performance of the models and reduces +the likelihood of the model being used appropriately in clinical +practice. To improve standards, several papers have been written +detailing guidelines for best practices in developing and reporting +prediction models.

+

The Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis (TRIPOD) statement +provides clear recommendations for reporting prediction model +development and validation and addresses some of the concerns related to +transparency. However, data structure heterogeneity and inconsistent +terminologies still make collaboration and model sharing difficult as +different researchers are often required to write new code to extract +the data from their databases and may define variables differently.

+

In our paper, +we propose a standardised framework for patient-level prediction that +utilizes the OMOP Common Data Model (CDM) and standardized vocabularies, +and describe the open-source software that we developed implementing the +framework’s pipeline. The framework is the first to support existing +best practice guidelines and will enable open dissemination of models +that can be extensively validated across the network of OHDSI +collaborators.

+

Figure 1, illustrates the prediction problem we address. Among a +population at risk, we aim to predict which patients at a defined moment +in time (t = 0) will experience some outcome during a time-at-risk. +Prediction is done using only information about the patients in an +observation window prior to that moment in time.

+
+The prediction problem
The prediction problem
+
+

As shown in Figure 2, to define a prediction problem we have to +define t=0 by a Target Cohort (T), the outcome we like to predict by an +outcome cohort (O), and the time-at-risk (TAR). Furthermore, we have to +make design choices for the model we like to develop, and determine the +observational datasets to perform internal and external validation. This +conceptual framework works for all type of prediction problems, for +example those presented in Figure 3.

+
+Design choices
Design choices
+
+
+Examples of prediction problems
Examples of prediction problems
+
+

This vignette describes how you can use the +PatientLevelPrediction package to build patient-level +predictive models. The package enables data extraction, model building, +and model evaluation using data from databases that are translated into +the OMOP CDM. In this vignette we assume you have installed the package +correctly using the InstallationGuide.

+
+
+

Study specification +

+

We have to clearly specify our study upfront to be able to implement +it. This means we need to define the prediction problem we like to +address, in which population we will build the model, which model we +will build and how we will evaluate its performance. To guide you +through this process we will use a “Disease onset and progression” +prediction type as an example.

+
+

Problem definition 1: Stroke in afibrilation patients +

+

Atrial fibrillation is a disease characterized by an irregular heart +rate that can cause poor blood flow. Patients with atrial fibrillation +are at increased risk of ischemic stroke. Anticoagulation is a +recommended prophylaxis treatment strategy for patients at high risk of +stroke, though the underuse of anticoagulants and persistent severity of +ischemic stroke represents a substantial unmet medical need. Various +strategies have been developed to predict risk of ischemic stroke in +patients with atrial fibrillation. CHADS2 (Gage JAMA 2001) was developed +as a risk score based on history of congestive heart failure, +hypertension, age>=75, diabetes and stroke. CHADS2 was initially +derived using Medicare claims data, where it achieved good +discrimination (AUC=0.82). However, subsequent external validation +studies revealed the CHADS2 had substantially lower predictive accuracy +(Keogh Thromb Haemost 2011). Subsequent stroke risk calculators have +been developed and evaluated, including the extension of CHADS2Vasc. The +management of atrial fibrillation has evolved substantially over the +last decade, for various reasons that include the introduction of novel +oral anticoagulants. With these innovations has come a renewed interest +in greater precision medicine for stroke prevention.

+

We will apply the PatientLevelPrediction package to observational +healthcare data to address the following patient-level prediction +question:

+

Amongst patients who are newly diagnosed with Atrial Fibrillation, +which patients will go on to have Ischemic Stroke within 1 year?

+

We will define ‘patients who are newly diagnosed with Atrial +Fibrillation’ as the first condition record of cardiac arrhythmia, which +is followed by another cardiac arrhythmia condition record, at least two +drug records for a drug used to treat arrhythmias, or a procedure to +treat arrhythmias. We will define ‘Ischemic stroke events’ as ischemic +stroke condition records during an inpatient or ER visit; successive +records with > 180 day gap are considered independent episodes.

+
+
+

Problem definition 2: Angioedema in ACE inhibitor users +

+

Angiotensin converting enzyme inhibitors (ACE inhibitors) are +medications used by patients with hypertension that widen the blood +vessels and therefore increase the amount of blood pumped by the heart +and decrease blood pressure. ACE inhibitors reduce a patient’s risk of +cardiovascular disease but can lead to drug-induced angioedema.

+

We will apply the PatientLevelPrediction package to observational +healthcare data to address the following patient-level prediction +question:

+

Amongst patients who are newly dispensed an ACE inhibitor, which +patients will go on to have angioedema within 1 year?

+

We will define ‘patients who are newly dispensed an ACE inhibitor’ as +the first drug record of any ACE inhibitor, […]which is followed by +another cardiac arrhythmia condition record, at least two drug records +for a drug used to treat arrhythmias, or a procedure to treat +arrhythmias. We will define ‘angioedema’ as an angioedema condition +record.

+
+
+

Study population definition +

+

The final study population in which we will develop our model is +often a subset of the Target population, because we will e.g. apply +criteria that are dependent on T and O or we want to do sensitivity +analyses with subpopulations of T. For this we have to answer the +following questions:

    -
  • What is the minimum amount of observation time we require before the start of the target cohort? This choice could depend on the available patient time in your training data, but also on the time you expect to be available in the data sources you want to apply the model on in the future. The longer the minimum observation time, the more baseline history time is available for each person to use for feature extraction, but the fewer patients will qualify for analysis. Moreover, there could be clinical reasons to choose a short or longer lookback period. For our example, we will use a prior history as lookback period (washout period).

  • -
  • Can patients enter the target cohort multiple times? In the target cohort definition, a person may qualify for the cohort multiple times during different spans of time, for example if they had different episodes of a disease or separate periods of exposure to a medical product. The cohort definition does not necessarily apply a restriction to only let the patients enter once, but in the context of a particular patient-level prediction problem, a user may want to restrict the cohort to the first qualifying episode. In our example, a person could only enter the target cohort once since our criteria was based on first occurrence of atrial fibrillation.

  • -
  • Do we allow persons to enter the cohort if they experienced the outcome before? Do we allow persons to enter the target cohort if they experienced the outcome before qualifying for the target cohort? Depending on the particular patient-level prediction problem, there may be a desire to predict ‘incident’ first occurrence of an outcome, in which case patients who have previously experienced the outcome are not ‘at-risk’ for having a first occurrence and therefore should be excluded from the target cohort. In other circumstances, there may be a desire to predict ‘prevalent’ episodes, whereby patients with prior outcomes can be included in the analysis and the prior outcome itself can be a predictor of future outcomes. For our prediction example, the answer to this question is ‘Yes, allow persons with prior outcomes’ because we know from the CHADS2 score that prior strokes are very predictive of future strokes. If this answer would have been ‘No’ we also have to decide how long we would look back for previous occurrences of the outcome.

  • -
  • How do we define the period in which we will predict our outcome relative to the target cohort start? We actually have to make two decisions to answer that question. First, does the time-at-risk window start at the date of the start of the target cohort or later? Arguments to make it start later could be that you want to avoid outcomes that were entered late in the record that actually occurred before the start of the target cohort or you want to leave a gap where interventions to prevent the outcome could theoretically be implemented. Second, you need to define the time-at-risk by setting the risk window end, as some specification of days offset relative to the target cohort start or end dates. For our problem we will predict in a ‘time-at-risk’ window starting 1 day after the start of the target cohort up to 365 days later (to look for 1-year risk following atrial fibrillation diagnosis).

  • -
  • Do we require a minimum amount of time-at-risk? We have to decide if we want to include patients that did not experience the outcome but did leave the database earlier than the end of our time-at-risk period. These patients may experience the outcome when we do not observe them. For our prediction problem we decide to answer this question with ‘Yes, require a mimimum time-at-risk’ for that reason. Furthermore, we have to decide if this constraint also applies to persons who experienced the outcome or we will include all persons with the outcome irrespective of their total time at risk. For example, if the outcome is death, then persons with the outcome are likely censored before the full time-at-risk period is complete.

  • +
  • What is the minimum amount of observation time we require +before the start of the target cohort? This choice could depend on +the available patient time in your training data, but also on the time +you expect to be available in the data sources you want to apply the +model on in the future. The longer the minimum observation time, the +more baseline history time is available for each person to use for +feature extraction, but the fewer patients will qualify for analysis. +Moreover, there could be clinical reasons to choose a short or longer +lookback period. For our example, we will use a prior history as +lookback period (washout period).

  • +
  • Can patients enter the target cohort multiple times? In +the target cohort definition, a person may qualify for the cohort +multiple times during different spans of time, for example if they had +different episodes of a disease or separate periods of exposure to a +medical product. The cohort definition does not necessarily apply a +restriction to only let the patients enter once, but in the context of a +particular patient-level prediction problem, a user may want to restrict +the cohort to the first qualifying episode. In our example, a person +could only enter the target cohort once since our criteria was based on +first occurrence of atrial fibrillation.

  • +
  • Do we allow persons to enter the cohort if they experienced +the outcome before? Do we allow persons to enter the target cohort +if they experienced the outcome before qualifying for the target cohort? +Depending on the particular patient-level prediction problem, there may +be a desire to predict ‘incident’ first occurrence of an outcome, in +which case patients who have previously experienced the outcome are not +‘at-risk’ for having a first occurrence and therefore should be excluded +from the target cohort. In other circumstances, there may be a desire to +predict ‘prevalent’ episodes, whereby patients with prior outcomes can +be included in the analysis and the prior outcome itself can be a +predictor of future outcomes. For our prediction example, the answer to +this question is ‘Yes, allow persons with prior outcomes’ because we +know from the CHADS2 score that prior strokes are very predictive of +future strokes. If this answer would have been ‘No’ we also have to +decide how long we would look back for previous occurrences of the +outcome.

  • +
  • How do we define the period in which we will predict our +outcome relative to the target cohort start? We actually have to +make two decisions to answer that question. First, does the time-at-risk +window start at the date of the start of the target cohort or later? +Arguments to make it start later could be that you want to avoid +outcomes that were entered late in the record that actually occurred +before the start of the target cohort or you want to leave a gap where +interventions to prevent the outcome could theoretically be implemented. +Second, you need to define the time-at-risk by setting the risk window +end, as some specification of days offset relative to the target cohort +start or end dates. For our problem we will predict in a ‘time-at-risk’ +window starting 1 day after the start of the target cohort up to 365 +days later (to look for 1-year risk following atrial fibrillation +diagnosis).

  • +
  • Do we require a minimum amount of time-at-risk? We have +to decide if we want to include patients that did not experience the +outcome but did leave the database earlier than the end of our +time-at-risk period. These patients may experience the outcome when we +do not observe them. For our prediction problem we decide to answer this +question with ‘Yes, require a minimum time-at-risk’ for that reason. +Furthermore, we have to decide if this constraint also applies to +persons who experienced the outcome or we will include all persons with +the outcome irrespective of their total time at risk. For example, if +the outcome is death, then persons with the outcome are likely censored +before the full time-at-risk period is complete.

-
-

-Model development settings

-

To develop the model we have to decide which algorithm(s) we like to train. We see the selection of the best algorithm for a certain prediction problem as an empirical question, i.e. you need to let the data speak for itself and try different approaches to find the best one. There is no algorithm that will work best for all problems (no free lunch). In our package we therefore aim to implement many algorithms. Furthermore, we made the system modular so you can add your own custom algorithms as described in more detail in the AddingCustomModels vignette.

-

Our package currently contains the following algorithms to choose from:

+
+

Model development settings +

+

To develop the model we have to decide which algorithm(s) we like to +train. We see the selection of the best algorithm for a certain +prediction problem as an empirical question, i.e. you need to let the +data speak for itself and try different approaches to find the best one. +There is no algorithm that will work best for all problems (no free +lunch). In our package we therefore aim to implement many algorithms. +Furthermore, we made the system modular so you can add your own custom +algorithms as described in more detail in the AddingCustomModels +vignette.

+

Our package currently contains the following algorithms to choose +from:

@@ -219,77 +393,168 @@

- - - + + - - + + - - + - - + + - - + + - - + + - - + +
Regularized Logistic RegressionLasso logistic regression belongs to the family of generalized linear models, where a linear combination of the variables is learned and finally a logistic function maps the linear combination to a value between 0 and 1. The lasso regularization adds a cost based on model complexity to the objective function when training the model. This cost is the sum of the absolute values of the linear combination of the coefficients. The model automatically performs feature selection by minimizing this cost. We use the Cyclic coordinate descent for logistic, Poisson and survival analysis (Cyclops) package to perform large-scale regularized logistic regression: https://github.com/OHDSI/Cyclops +Lasso logistic regression belongs to the family of generalized +linear models, where a linear combination of the variables is learned +and finally a logistic function maps the linear combination to a value +between 0 and 1. The lasso regularization adds a cost based on model +complexity to the objective function when training the model. This cost +is the sum of the absolute values of the linear combination of the +coefficients. The model automatically performs feature selection by +minimizing this cost. We use the Cyclic coordinate descent for logistic, +Poisson and survival analysis (Cyclops) package to perform large-scale +regularized logistic regression: https://github.com/OHDSI/Cyclops var (starting variance), seed
Gradient boosting machinesGradient boosting machines is a boosting ensemble technique and in our framework it combines multiple decision trees. Boosting works by iteratively adding decision trees but adds more weight to the data-points that are misclassified by prior decision trees in the cost function when training the next tree. We use Extreme Gradient Boosting, which is an efficient implementation of the gradient boosting framework implemented in the xgboost R package available from CRAN.ntree (number of trees), max depth (max levels in tree), min rows (minimum data points in in node), learning rate, balance (balance class labels), seedGradient boosting machines is a boosting ensemble technique and in +our framework it combines multiple decision trees. Boosting works by +iteratively adding decision trees but adds more weight to the +data-points that are misclassified by prior decision trees in the cost +function when training the next tree. We use Extreme Gradient Boosting, +which is an efficient implementation of the gradient boosting framework +implemented in the xgboost R package available from CRAN.ntree (number of trees), max depth (max levels in tree), min rows +(minimum data points in in node), learning rate, balance (balance class +labels), seed
Random forestRandom forest is a bagging ensemble technique that combines multiple decision trees. The idea behind bagging is to reduce the likelihood of overfitting, by using weak classifiers, but combining multiple diverse weak classifiers into a strong classifier. Random forest accomplishes this by training multiple decision trees but only using a subset of the variables in each tree and the subset of variables differ between trees. Our packages uses the sklearn learn implementation of Random Forest in python.mtry (number of features in each tree),ntree (number of trees), maxDepth (max levels in tree), minRows (minimum data points in in node),balance (balance class labels), seedRandom forest is a bagging ensemble technique that combines multiple +decision trees. The idea behind bagging is to reduce the likelihood of +overfitting, by using weak classifiers, but combining multiple diverse +weak classifiers into a strong classifier. Random forest accomplishes +this by training multiple decision trees but only using a subset of the +variables in each tree and the subset of variables differ between trees. +Our packages uses the sklearn learn implementation of Random Forest in +python.mtry (number of features in each tree),ntree (number of trees), +maxDepth (max levels in tree), minRows (minimum data points in in +node),balance (balance class labels), seed
K-nearest neighborsK-nearest neighbors (KNN) is an algorithm that uses some metric to find the K closest labelled data-points, given the specified metric, to a new unlabelled data-point. The prediction of the new data-points is then the most prevalent class of the K-nearest labelled data-points. There is a sharing limitation of KNN, as the model requires labelled data to perform the prediction on new data, and it is often not possible to share this data across data sites.We included the BigKnn classifier developed in OHDSI which is a large scale k-nearest neighbor classifier using the Lucene search engine: https://github.com/OHDSI/BigKnn +K-nearest neighbors (KNN) is an algorithm that uses some metric to +find the K closest labelled data-points, given the specified metric, to +a new unlabelled data-point. The prediction of the new data-points is +then the most prevalent class of the K-nearest labelled data-points. +There is a sharing limitation of KNN, as the model requires labelled +data to perform the prediction on new data, and it is often not possible +to share this data across data sites.We included the BigKnn classifier +developed in OHDSI which is a large scale k-nearest neighbor classifier +using the Lucene search engine: https://github.com/OHDSI/BigKnn k (number of neighbours),weighted (weight by inverse frequency)
Naive BayesThe Naive Bayes algorithm applies the Bayes theorem with the ‘naive’ assumption of conditional independence between every pair of features given the value of the class variable. Based on the likelihood the data belongs to a class and the prior distribution of the class, a posterior distribution is obtained.The Naive Bayes algorithm applies the Bayes theorem with the ‘naive’ +assumption of conditional independence between every pair of features +given the value of the class variable. Based on the likelihood the data +belongs to a class and the prior distribution of the class, a posterior +distribution is obtained. none
AdaBoostAdaBoost is a boosting ensemble technique. Boosting works by iteratively adding classifiers but adds more weight to the data-points that are misclassified by prior classifiers in the cost function when training the next classifier. We use the sklearn ‘AdaboostClassifier’ implementation in Python.nEstimators (the maximum number of estimators at which boosting is terminated), learningRate (learning rate shrinks the contribution of each classifier by learning_rate. There is a trade-off between learningRate and nEstimators)AdaBoost is a boosting ensemble technique. Boosting works by +iteratively adding classifiers but adds more weight to the data-points +that are misclassified by prior classifiers in the cost function when +training the next classifier. We use the sklearn ‘AdaboostClassifier’ +implementation in Python.nEstimators (the maximum number of estimators at which boosting is +terminated), learningRate (learning rate shrinks the contribution of +each classifier by learning_rate. There is a trade-off between +learningRate and nEstimators)
Decision TreeA decision tree is a classifier that partitions the variable space using individual tests selected using a greedy approach. It aims to find partitions that have the highest information gain to separate the classes. The decision tree can easily overfit by enabling a large number of partitions (tree depth) and often needs some regularization (e.g., pruning or specifying hyper-parameters that limit the complexity of the model). We use the sklearn ‘DecisionTreeClassifier’ implementation in Python.maxDepth (the maximum depth of the tree), minSamplesSplit,minSamplesLeaf, minImpuritySplit (threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf.), seed,classWeight (‘Balance’ or ‘None’)A decision tree is a classifier that partitions the variable space +using individual tests selected using a greedy approach. It aims to find +partitions that have the highest information gain to separate the +classes. The decision tree can easily overfit by enabling a large number +of partitions (tree depth) and often needs some regularization (e.g., +pruning or specifying hyper-parameters that limit the complexity of the +model). We use the sklearn ‘DecisionTreeClassifier’ implementation in +Python.maxDepth (the maximum depth of the tree), +minSamplesSplit,minSamplesLeaf, minImpuritySplit (threshold for early +stopping in tree growth. A node will split if its impurity is above the +threshold, otherwise it is a leaf.), seed,classWeight (‘Balance’ or +‘None’)
Multilayer PerceptionNeural networks contain multiple layers that weight their inputs using a non-linear function. The first layer is the input layer, the last layer is the output layer the between are the hidden layers. Neural networks are generally trained using feed forward back-propagation. This is when you go through the network with a data-point and calculate the error between the true label and predicted label, then go backwards through the network and update the linear function weights based on the error. This can also be performed as a batch, where multiple data-points are feesize (the number of hidden nodes), alpha (the l2 regularisation), seedNeural networks contain multiple layers that weight their inputs +using a non-linear function. The first layer is the input layer, the +last layer is the output layer the between are the hidden layers. Neural +networks are generally trained using feed forward back-propagation. This +is when you go through the network with a data-point and calculate the +error between the true label and predicted label, then go backwards +through the network and update the linear function weights based on the +error. This can also be performed as a batch, where multiple data-points +are feesize (the number of hidden nodes), alpha (the l2 regularisation), +seed
Deep Learning (now in seperate DeepPatientLevelPrediction R package)Deep learning such as deep nets, convolutional neural networks or recurrent neural networks are similar to a neural network but have multiple hidden layers that aim to learn latent representations useful for prediction. In the seperate BuildingDeepLearningModels vignette we describe these models and hyper-parameters in more detailDeep Learning (now in seperate DeepPatientLevelPrediction R +package)Deep learning such as deep nets, convolutional neural networks or +recurrent neural networks are similar to a neural network but have +multiple hidden layers that aim to learn latent representations useful +for prediction. In the seperate BuildingDeepLearningModels vignette we +describe these models and hyper-parameters in more detail see OHDSI/DeepPatientLevelPrediction
-

Furthermore, we have to decide on the covariates that we will use to train our model. This choice can be driven by domain knowledge of available computational resources. In our example, we like to add the Gender, Age, Conditions, Drugs Groups, and Visit Count. We also have to specify in which time windows we will look and we decide to look in year before and any time prior.

-

Finally, we have to define how we will train and test our model on our data, i.e. how we perform internal validation. For this we have to decide how we divide our dataset in a training and testing dataset and how we randomly assign patients to these two sets. Dependent on the size of the training set we can decide how much data we like to use for training, typically this is a 75%, 25% split. If you have very large datasets you can use more data for training. To randomly assign patients to the training and testing set, there are two commonly used approaches:

+

Furthermore, we have to decide on the covariates +that we will use to train our model. This choice can be driven by domain +knowledge of available computational resources. In our example, we like +to add the Gender, Age, Conditions, Drugs Groups, and Visit Count. We +also have to specify in which time windows we will look and we decide to +look in year before and any time prior.

+

Finally, we have to define how we will train and test our model on +our data, i.e. how we perform internal validation. For +this we have to decide how we divide our dataset in a training and +testing dataset and how we randomly assign patients to these two sets. +Dependent on the size of the training set we can decide how much data we +like to use for training, typically this is a 75%, 25% split. If you +have very large datasets you can use more data for training. To randomly +assign patients to the training and testing set, there are two commonly +used approaches:

    -
  1. split by person. In this case a random seed is used to assign the patient to either sets.
  2. -
  3. split by time. In this case a time point is used to split the persons, e.g. 75% of the data is before and 25% is after this date. The advantage of this is that you take into consideration that the health care system has changed over time.
  4. +
  5. split by person. In this case a random seed is used to assign the +patient to either sets.
  6. +
  7. split by time. In this case a time point is used to split the +persons, e.g. 75% of the data is before and 25% is after this date. The +advantage of this is that you take into consideration that the health +care system has changed over time.

We now completely defined our studies and implement them:

-
-

-Example 1: Stroke in afibrilation patients

-
-

-Study Specification

-

For our first prediction model we decide to start with a Regularized Logistic Regression and will use the default parameters. We will do a 75%-25% split by person.

+
+

Example 1: Stroke in afibrilation patients +

+
+

Study Specification +

+

For our first prediction model we decide to start with a Regularized +Logistic Regression and will use the default parameters. We will do a +75%-25% split by person.

--++ @@ -302,11 +567,17 @@

- + - + @@ -362,7 +633,8 @@

- + @@ -370,358 +642,535 @@

Definition
Target Cohort (T)‘Patients who are newly diagnosed with Atrial Fibrillation’ defined as the first condition record of cardiac arrhythmia, which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias.‘Patients who are newly diagnosed with Atrial Fibrillation’ defined +as the first condition record of cardiac arrhythmia, which is followed +by another cardiac arrhythmia condition record, at least two drug +records for a drug used to treat arrhythmias, or a procedure to treat +arrhythmias.
Outcome Cohort (O)‘Ischemic stroke events’ defined as ischemic stroke condition records during an inpatient or ER visit; successive records with > 180 day gap are considered independent episodes.‘Ischemic stroke events’ defined as ischemic stroke condition +records during an inpatient or ER visit; successive records with > +180 day gap are considered independent episodes.
Time-at-risk (TAR)
CovariatesGender, Age, Conditions (ever before, <365), Drugs Groups (ever before, <365), and Visit CountGender, Age, Conditions (ever before, <365), Drugs Groups (ever +before, <365), and Visit Count
Data split
-

According to the best practices we need to make a protocol that completely specifies how we plan to execute our study. This protocol will be assessed by the governance boards of the participating data sources in your network study. For this a template could be used but we prefer to automate this process as much as possible by adding functionality to automatically generate study protocol from a study specification. We will discuss this in more detail later.

-
-
-

-Study implementation

-

Now we have completely design our study we have to implement the study. We have to generate the target and outcome cohorts and we need to develop the R code to run against our CDM that will execute the full study.

-
-

-Cohort instantiation

-

For our study we need to know when a person enters the target and outcome cohorts. This is stored in a table on the server that contains the cohort start date and cohort end date for all subjects for a specific cohort definition. This cohort table has a very simple structure as shown below:

+

According to the best practices we need to make a protocol that +completely specifies how we plan to execute our study. This protocol +will be assessed by the governance boards of the participating data +sources in your network study. For this a template could be used but we +prefer to automate this process as much as possible by adding +functionality to automatically generate study protocol from a study +specification. We will discuss this in more detail later.

+
+
+

Study implementation +

+

Now we have completely designed our study we have to implement the +study. We have to generate the target and outcome cohorts and we need to +develop the R code to run against our CDM that will execute the full +study.

+
+

Cohort instantiation +

+

For our study we need to know when a person enters the target and +outcome cohorts. This is stored in a table on the server that contains +the cohort start date and cohort end date for all subjects for a +specific cohort definition. This cohort table has a very simple +structure as shown below:

  • -cohort_definition_id, a unique identifier for distinguishing between different types of cohorts, e.g. cohorts of interest and outcome cohorts.
  • +cohort_definition_id, a unique identifier for +distinguishing between different types of cohorts, e.g. cohorts of +interest and outcome cohorts.
  • -subject_id, a unique identifier corresponding to the person_id in the CDM.
  • +subject_id, a unique identifier corresponding to the +person_id in the CDM.
  • -cohort_start_date, the date the subject enters the cohort.
  • +cohort_start_date, the date the subject enters the +cohort.
  • -cohort_end_date, the date the subject leaves the cohort.
  • +cohort_end_date, the date the subject leaves the +cohort.
-

How do we fill this table according to our cohort definitions? There are two options for this:

+

How do we fill this table according to our cohort definitions? There +are two options for this:

    -
  1. use the interactive cohort builder tool in ATLAS which can be used to create cohorts based on inclusion criteria and will automatically populate this cohort table.

  2. -
  3. write your own custom SQL statements to fill the cohort table.

  4. +
  5. use the interactive cohort builder tool in ATLAS which can be used to create +cohorts based on inclusion criteria and will automatically populate this +cohort table.

  6. +
  7. write your own custom SQL statements to fill the cohort +table.

-

Both methods are described below for our example prediction problem.

+

Both methods are described below for our example prediction +problem.

-
-

-ATLAS cohort builder

-
-

Target Cohort Atrial Fibrillation

+
+

ATLAS cohort builder +

+
+Target Cohort Atrial Fibrillation
Target Cohort Atrial Fibrillation
-

ATLAS allows you to define cohorts interactively by specifying cohort entry and cohort exit criteria. Cohort entry criteria involve selecting one or more initial events, which determine the start date for cohort entry, and optionally specifying additional inclusion criteria which filter to the qualifying events. Cohort exit criteria are applied to each cohort entry record to determine the end date when the person’s episode no longer qualifies for the cohort. For the outcome cohort the end date is less relevant. As an example, Figure 4 shows how we created the Atrial Fibrillation cohort and Figure 5 shows how we created the stroke cohort in ATLAS.

-
-

Outcome Cohort Stroke

+

ATLAS allows you to define cohorts interactively by specifying cohort +entry and cohort exit criteria. Cohort entry criteria involve selecting +one or more initial events, which determine the start date for cohort +entry, and optionally specifying additional inclusion criteria which +filter to the qualifying events. Cohort exit criteria are applied to +each cohort entry record to determine the end date when the person’s +episode no longer qualifies for the cohort. For the outcome cohort the +end date is less relevant. As an example, Figure 4 shows how we created +the Atrial Fibrillation cohort and Figure 5 shows how we created the +stroke cohort in ATLAS.

+
+Outcome Cohort Stroke
Outcome Cohort Stroke

The T and O cohorts can be found here:

-

In depth explanation of cohort creation in ATLAS is out of scope of this vignette but can be found on the OHDSI wiki pages (link).

-

Note that when a cohort is created in ATLAS the cohortid is needed to extract the data in R. The cohortid can be found at the top of the ATLAS screen, e.g. 1769447 in Figure 4.

-
-
-

-Custom cohorts

-

It is also possible to create cohorts without the use of ATLAS. Using custom cohort code (SQL) you can make more advanced cohorts if needed.

-

For our example study, we need to create at table to hold the cohort data and we need to create SQL code to instantiate this table for both the AF and Stroke cohorts. Therefore, we create a file called AfStrokeCohorts.sql with the following contents:

-
/***********************************
-File AfStrokeCohorts.sql 
-***********************************/
-/*
-Create a table to store the persons in the T and C cohort
-*/
-
-IF OBJECT_ID('@resultsDatabaseSchema.PLPAFibStrokeCohort', 'U') IS NOT NULL 
-DROP TABLE @resultsDatabaseSchema.PLPAFibStrokeCohort;
-
-CREATE TABLE @resultsDatabaseSchema.PLPAFibStrokeCohort 
-( 
-cohort_definition_id INT, 
-subject_id BIGINT,
-cohort_start_date DATE, 
-cohort_end_date DATE
-);
-
-
-/*
-T cohort:  [PatientLevelPrediction vignette]:  T : patients who are newly 
-diagnosed with Atrial fibrillation
-- persons with a condition occurrence record of 'Atrial fibrillation' or 
-any descendants, indexed at the first diagnosis
-- who have >1095 days of prior observation before their first diagnosis
-- and have no warfarin exposure any time prior to first AFib diagnosis
-*/
-INSERT INTO @resultsDatabaseSchema.AFibStrokeCohort (cohort_definition_id, 
-subject_id, 
-cohort_start_date, 
-cohort_end_date)
-SELECT 1 AS cohort_definition_id,
-AFib.person_id AS subject_id,
-AFib.condition_start_date AS cohort_start_date,
-observation_period.observation_period_end_date AS cohort_end_date
-FROM
-(
-  SELECT person_id, min(condition_start_date) as condition_start_date
-  FROM @cdmDatabaseSchema.condition_occurrence
-  WHERE condition_concept_id IN (SELECT descendant_concept_id FROM 
-  @cdmDatabaseSchema.concept_ancestor WHERE ancestor_concept_id IN 
-  (313217 /*atrial fibrillation*/))
-  GROUP BY person_id
-) AFib
-  INNER JOIN @cdmDatabaseSchema.observation_period
-  ON AFib.person_id = observation_period.person_id
-  AND AFib.condition_start_date >= dateadd(dd,1095, 
-  observation_period.observation_period_start_date)
-  AND AFib.condition_start_date <= observation_period.observation_period_end_date
-  LEFT JOIN
-  (
-  SELECT person_id, min(drug_exposure_start_date) as drug_exposure_start_date
-  FROM @cdmDatabaseSchema.drug_exposure
-  WHERE drug_concept_id IN (SELECT descendant_concept_id FROM 
-  @cdmDatabaseSchema.concept_ancestor WHERE ancestor_concept_id IN 
-  (1310149 /*warfarin*/))
-  GROUP BY person_id
-  ) warfarin
-  ON Afib.person_id = warfarin.person_id
-  AND Afib.condition_start_date > warfarin.drug_exposure_start_date
-  WHERE warfarin.person_id IS NULL
-  ;
-  
-  /*
-  C cohort:  [PatientLevelPrediction vignette]:  O: Ischemic stroke events
-  - inpatient visits that include a condition occurrence record for 
-  'cerebral infarction' and descendants, 'cerebral thrombosis', 
-  'cerebral embolism', 'cerebral artery occlusion' 
-  */
-  INSERT INTO @resultsDatabaseSchema.AFibStrokeCohort (cohort_definition_id, 
-  subject_id, 
-  cohort_start_date, 
-  cohort_end_date)
-  SELECT 2 AS cohort_definition_id,
-  visit_occurrence.person_id AS subject_id,
-  visit_occurrence.visit_start_date AS cohort_start_date,
-  visit_occurrence.visit_end_date AS cohort_end_date
-  FROM  
-  (
-  SELECT person_id, condition_start_date
-  FROM @cdmDatabaseSchema.condition_occurrence
-  WHERE condition_concept_id IN (SELECT DISTINCT descendant_concept_id FROM 
-  @cdmDatabaseSchema.concept_ancestor WHERE ancestor_concept_id IN 
-  (443454 /*cerebral infarction*/) OR descendant_concept_id IN 
-  (441874 /*cerebral thrombosis*/, 375557 /*cerebral embolism*/, 
-  372924 /*cerebral artery occlusion*/))
-  ) stroke
-  INNER JOIN @cdmDatabaseSchema.visit_occurrence
-  ON stroke.person_id = visit_occurrence.person_id
-  AND stroke.condition_start_date >= visit_occurrence.visit_start_date
-  AND stroke.condition_start_date <= visit_occurrence.visit_end_date
-  AND visit_occurrence.visit_concept_id IN (9201, 262 /*'Inpatient Visit'  or 
-  'Emergency Room and Inpatient Visit'*/)
-  GROUP BY visit_occurrence.person_id, visit_occurrence.visit_start_date, 
-  visit_occurrence.visit_end_date
-  ;
-  
-

This is parameterized SQL which can be used by the SqlRender package. We use parameterized SQL so we do not have to pre-specify the names of the CDM and result schemas. That way, if we want to run the SQL on a different schema, we only need to change the parameter values; we do not have to change the SQL code. By also making use of translation functionality in SqlRender, we can make sure the SQL code can be run in many different environments.

-

To execute this sql against our CDM we first need to tell R how to connect to the server. PatientLevelPrediction uses the DatabaseConnector package, which provides a function called createConnectionDetails. Type ?createConnectionDetails for the specific settings required for the various database management systems (DBMS). For example, one might connect to a PostgreSQL database using this code:

+

In depth explanation of cohort creation in ATLAS is out of scope of +this vignette but can be found on the OHDSI wiki pages (link).

+

Note that when a cohort is created in ATLAS the cohortid is needed to +extract the data in R. The cohortid can be found at the top of the ATLAS +screen, e.g. 1769447 in Figure 4.

+
+
+

Custom cohorts +

+

It is also possible to create cohorts without the use of ATLAS. Using +custom cohort code (SQL) you can make more advanced cohorts if +needed.

+

For our example study, we need to create a table to hold the cohort +data and we need to create SQL code to instantiate this table for both +the AF and Stroke cohorts. Therefore, we create a file called +AfStrokeCohorts.sql with the following contents:

+
/***********************************
+File AfStrokeCohorts.sql 
+***********************************/
+/*
+Create a table to store the persons in the T and C cohort
+*/
+
+IF OBJECT_ID('@resultsDatabaseSchema.PLPAFibStrokeCohort', 'U') IS NOT NULL 
+DROP TABLE @resultsDatabaseSchema.PLPAFibStrokeCohort;
+
+CREATE TABLE @resultsDatabaseSchema.PLPAFibStrokeCohort 
+( 
+cohort_definition_id INT, 
+subject_id BIGINT,
+cohort_start_date DATE, 
+cohort_end_date DATE
+);
+
+
+/*
+T cohort:  [PatientLevelPrediction vignette]:  T : patients who are newly 
+diagnosed with Atrial fibrillation
+- persons with a condition occurrence record of 'Atrial fibrillation' or 
+any descendants, indexed at the first diagnosis
+- who have >1095 days of prior observation before their first diagnosis
+- and have no warfarin exposure any time prior to first AFib diagnosis
+*/
+INSERT INTO @resultsDatabaseSchema.AFibStrokeCohort (cohort_definition_id, 
+subject_id, 
+cohort_start_date, 
+cohort_end_date)
+SELECT 1 AS cohort_definition_id,
+AFib.person_id AS subject_id,
+AFib.condition_start_date AS cohort_start_date,
+observation_period.observation_period_end_date AS cohort_end_date
+FROM
+(
+  SELECT person_id, min(condition_start_date) as condition_start_date
+  FROM @cdmDatabaseSchema.condition_occurrence
+  WHERE condition_concept_id IN (SELECT descendant_concept_id FROM 
+  @cdmDatabaseSchema.concept_ancestor WHERE ancestor_concept_id IN 
+  (313217 /*atrial fibrillation*/))
+  GROUP BY person_id
+) AFib
+  INNER JOIN @cdmDatabaseSchema.observation_period
+  ON AFib.person_id = observation_period.person_id
+  AND AFib.condition_start_date >= dateadd(dd,1095, 
+  observation_period.observation_period_start_date)
+  AND AFib.condition_start_date <= observation_period.observation_period_end_date
+  LEFT JOIN
+  (
+  SELECT person_id, min(drug_exposure_start_date) as drug_exposure_start_date
+  FROM @cdmDatabaseSchema.drug_exposure
+  WHERE drug_concept_id IN (SELECT descendant_concept_id FROM 
+  @cdmDatabaseSchema.concept_ancestor WHERE ancestor_concept_id IN 
+  (1310149 /*warfarin*/))
+  GROUP BY person_id
+  ) warfarin
+  ON Afib.person_id = warfarin.person_id
+  AND Afib.condition_start_date > warfarin.drug_exposure_start_date
+  WHERE warfarin.person_id IS NULL
+  ;
+  
+  /*
+  C cohort:  [PatientLevelPrediction vignette]:  O: Ischemic stroke events
+  - inpatient visits that include a condition occurrence record for 
+  'cerebral infarction' and descendants, 'cerebral thrombosis', 
+  'cerebral embolism', 'cerebral artery occlusion' 
+  */
+  INSERT INTO @resultsDatabaseSchema.AFibStrokeCohort (cohort_definition_id, 
+  subject_id, 
+  cohort_start_date, 
+  cohort_end_date)
+  SELECT 2 AS cohort_definition_id,
+  visit_occurrence.person_id AS subject_id,
+  visit_occurrence.visit_start_date AS cohort_start_date,
+  visit_occurrence.visit_end_date AS cohort_end_date
+  FROM  
+  (
+  SELECT person_id, condition_start_date
+  FROM @cdmDatabaseSchema.condition_occurrence
+  WHERE condition_concept_id IN (SELECT DISTINCT descendant_concept_id FROM 
+  @cdmDatabaseSchema.concept_ancestor WHERE ancestor_concept_id IN 
+  (443454 /*cerebral infarction*/) OR descendant_concept_id IN 
+  (441874 /*cerebral thrombosis*/, 375557 /*cerebral embolism*/, 
+  372924 /*cerebral artery occlusion*/))
+  ) stroke
+  INNER JOIN @cdmDatabaseSchema.visit_occurrence
+  ON stroke.person_id = visit_occurrence.person_id
+  AND stroke.condition_start_date >= visit_occurrence.visit_start_date
+  AND stroke.condition_start_date <= visit_occurrence.visit_end_date
+  AND visit_occurrence.visit_concept_id IN (9201, 262 /*'Inpatient Visit'  or 
+  'Emergency Room and Inpatient Visit'*/)
+  GROUP BY visit_occurrence.person_id, visit_occurrence.visit_start_date, 
+  visit_occurrence.visit_end_date
+  ;
+  
+

This is parameterized SQL which can be used by the SqlRender +package. We use parameterized SQL so we do not have to pre-specify the +names of the CDM and result schemas. That way, if we want to run the SQL +on a different schema, we only need to change the parameter values; we +do not have to change the SQL code. By also making use of translation +functionality in SqlRender, we can make sure the SQL code +can be run in many different environments.

+

To execute this sql against our CDM we first need to tell R how to +connect to the server. PatientLevelPrediction uses the DatabaseConnector +package, which provides a function called +createConnectionDetails. Type +?createConnectionDetails for the specific settings required +for the various database management systems (DBMS). For example, one +might connect to a PostgreSQL database using this code:

-  connectionDetails <- createConnectionDetails(dbms = "postgresql", 
-  server = "localhost/ohdsi", 
-  user = "joe", 
-  password = "supersecret")
-  
-  cdmDatabaseSchema <- "my_cdm_data"
-  cohortsDatabaseSchema <- "my_results"
-  cdmVersion <- "5"
-

The last three lines define the cdmDatabaseSchema and cohortsDatabaseSchema variables, as well as the CDM version. We will use these later to tell R where the data in CDM format live, where we want to create the cohorts of interest, and what version CDM is used. Note that for Microsoft SQL Server, databaseschemas need to specify both the database and the schema, so for example cdmDatabaseSchema <- "my_cdm_data.dbo".

+ connectionDetails <- createConnectionDetails(dbms = "postgresql", + server = "localhost/ohdsi", + user = "joe", + password = "supersecret") + + cdmDatabaseSchema <- "my_cdm_data" + cohortsDatabaseSchema <- "my_results" + cdmVersion <- "5"
+

The last three lines define the cdmDatabaseSchema and +cohortsDatabaseSchema variables, as well as the CDM +version. We will use these later to tell R where the data in CDM format +live, where we want to create the cohorts of interest, and what version +CDM is used. Note that for Microsoft SQL Server, databaseschemas need to +specify both the database and the schema, so for example +cdmDatabaseSchema <- "my_cdm_data.dbo".

-  library(SqlRender)
-  sql <- readSql("AfStrokeCohorts.sql")
-  sql <- renderSql(sql,
-  cdmDatabaseSchema = cdmDatabaseSchema,
-  cohortsDatabaseSchema = cohortsDatabaseSchema,
-  post_time = 30,
-  pre_time = 365)$sql
-  sql <- translateSql(sql, targetDialect = connectionDetails$dbms)$sql
-  
-  connection <- connect(connectionDetails)
-  executeSql(connection, sql)
-

In this code, we first read the SQL from the file into memory. In the next line, we replace four parameter names with the actual values. We then translate the SQL into the dialect appropriate for the DBMS we already specified in the connectionDetails. Next, we connect to the server, and submit the rendered and translated SQL.

-

If all went well, we now have a table with the events of interest. We can see how many events per type:

+ library(SqlRender) + sql <- readSql("AfStrokeCohorts.sql") + sql <- renderSql(sql, + cdmDatabaseSchema = cdmDatabaseSchema, + cohortsDatabaseSchema = cohortsDatabaseSchema, + post_time = 30, + pre_time = 365)$sql + sql <- translateSql(sql, targetDialect = connectionDetails$dbms)$sql + + connection <- connect(connectionDetails) + executeSql(connection, sql)
+

In this code, we first read the SQL from the file into memory. In the +next line, we replace four parameter names with the actual values. We +then translate the SQL into the dialect appropriate for the DBMS we +already specified in the connectionDetails. Next, we +connect to the server, and submit the rendered and translated SQL.

+

If all went well, we now have a table with the events of interest. We +can see how many events per type:

-  sql <- paste("SELECT cohort_definition_id, COUNT(*) AS count",
-  "FROM @cohortsDatabaseSchema.AFibStrokeCohort",
-  "GROUP BY cohort_definition_id")
-  sql <- renderSql(sql, cohortsDatabaseSchema = cohortsDatabaseSchema)$sql
-  sql <- translateSql(sql, targetDialect = connectionDetails$dbms)$sql
-  
-  querySql(connection, sql)
-
##   cohort_definition_id  count
-## 1                    1 527616
-## 2                    2 221555
-
-
-

-Study script creation

-

In this section we assume that our cohorts have been created either by using ATLAS or a custom SQL script. We will first explain how to create an R script yourself that will execute our study as we have defined earlier.

-
-
-

-Data extraction

-

Now we can tell PatientLevelPrediction to extract all necessary data for our analysis. This is done using the FeatureExtractionPackage. In short the FeatureExtractionPackage allows you to specify which features (covariates) need to be extracted, e.g. all conditions and drug exposures. It also supports the creation of custom covariates. For more detailed information on the FeatureExtraction package see its vignettes. For our example study we decided to use these settings:

+ sql <- paste("SELECT cohort_definition_id, COUNT(*) AS count", + "FROM @cohortsDatabaseSchema.AFibStrokeCohort", + "GROUP BY cohort_definition_id") + sql <- renderSql(sql, cohortsDatabaseSchema = cohortsDatabaseSchema)$sql + sql <- translateSql(sql, targetDialect = connectionDetails$dbms)$sql + + querySql(connection, sql)
+
##   cohort_definition_id  count
+## 1                    1 527616
+## 2                    2 221555
+
+
+

Study script creation +

+

In this section we assume that our cohorts have been created either +by using ATLAS or a custom SQL script. We will first explain how to +create an R script yourself that will execute our study as we have +defined earlier.

+
+
+

Data extraction +

+

Now we can tell PatientLevelPrediction to extract all +necessary data for our analysis. This is done using the FeatureExtractionPackage. +In short the FeatureExtractionPackage allows you to specify which +features (covariates) need to be extracted, e.g. all conditions and drug +exposures. It also supports the creation of custom covariates. For more +detailed information on the FeatureExtraction package see its vignettes. For our +example study we decided to use these settings:

-  covariateSettings <- createCovariateSettings(useDemographicsGender = TRUE,
-  useDemographicsAge = TRUE,
-  useConditionGroupEraLongTerm = TRUE,
-  useConditionGroupEraAnyTimePrior = TRUE,
-  useDrugGroupEraLongTerm = TRUE,
-  useDrugGroupEraAnyTimePrior = TRUE,
-  useVisitConceptCountLongTerm = TRUE,
-  longTermStartDays = -365,
-  endDays = -1)
-

The final step for extracting the data is to run the getPlpData function and input the connection details, the database schema where the cohorts are stored, the cohort definition ids for the cohort and outcome, and the washoutPeriod which is the minimum number of days prior to cohort index date that the person must have been observed to be included into the data, and finally input the previously constructed covariate settings.

+ covariateSettings <- createCovariateSettings(useDemographicsGender = TRUE, + useDemographicsAge = TRUE, + useConditionGroupEraLongTerm = TRUE, + useConditionGroupEraAnyTimePrior = TRUE, + useDrugGroupEraLongTerm = TRUE, + useDrugGroupEraAnyTimePrior = TRUE, + useVisitConceptCountLongTerm = TRUE, + longTermStartDays = -365, + endDays = -1)
+

The final step for extracting the data is to run the +getPlpData function and input the connection details, the +database schema where the cohorts are stored, the cohort definition ids +for the cohort and outcome, and the washoutPeriod which is the minimum +number of days prior to cohort index date that the person must have been +observed to be included into the data, and finally input the previously +constructed covariate settings.

-databaseDetails <- createDatabaseDetails(
-  connectionDetails = connectionDetails,
-  cdmDatabaseSchema = cdmDatabaseSchema,
-  cdmDatabaseName = '',
-  cohortDatabaseSchema = resultsDatabaseSchema,
-  cohortTable = 'AFibStrokeCohort',
-  cohortId = 1,
-  outcomeDatabaseSchema = resultsDatabaseSchema,
-  outcomeTable = 'AFibStrokeCohort',
-  outcomeIds = 2,
-  cdmVersion = 5
-  )
-
-# here you can define whether you want to sample the target cohort and add any
-# restrictions based on minimum prior observation, index date restrictions
-# or restricting to first index date (if people can be in target cohort multiple times)
-restrictPlpDataSettings <- createRestrictPlpDataSettings(sampleSize = 10000)
-
-  plpData <- getPlpData(
-    databaseDetails = databaseDetails, 
-    covariateSettings = covariateSettings,
-    restrictPlpDataSettings = restrictPlpDataSettings
-  )
-

Note that if the cohorts are created in ATLAS its corresponding cohort database schema needs to be selected. There are many additional parameters for the createRestrictPlpDataSettings function which are all documented in the PatientLevelPrediction manual. The resulting plpData object uses the package Andromeda (which uses SQLite) to store information in a way that ensures R does not run out of memory, even when the data are large.

-

Creating the plpData object can take considerable computing time, and it is probably a good idea to save it for future sessions. Because plpData uses Andromeda, we cannot use R’s regular save function. Instead, we’ll have to use the savePlpData() function:

+databaseDetails <- createDatabaseDetails( + connectionDetails = connectionDetails, + cdmDatabaseSchema = cdmDatabaseSchema, + cdmDatabaseName = '', + cohortDatabaseSchema = resultsDatabaseSchema, + cohortTable = 'AFibStrokeCohort', + cohortId = 1, + outcomeDatabaseSchema = resultsDatabaseSchema, + outcomeTable = 'AFibStrokeCohort', + outcomeIds = 2, + cdmVersion = 5 + ) + +# here you can define whether you want to sample the target cohort and add any +# restrictions based on minimum prior observation, index date restrictions +# or restricting to first index date (if people can be in target cohort multiple times) +restrictPlpDataSettings <- createRestrictPlpDataSettings(sampleSize = 10000) + + plpData <- getPlpData( + databaseDetails = databaseDetails, + covariateSettings = covariateSettings, + restrictPlpDataSettings = restrictPlpDataSettings + )
+

Note that if the cohorts are created in ATLAS its corresponding +cohort database schema needs to be selected. There are many additional +parameters for the createRestrictPlpDataSettings function +which are all documented in the PatientLevelPrediction +manual. The resulting plpData object uses the package +Andromeda (which uses SQLite) to store +information in a way that ensures R does not run out of memory, even +when the data are large.

+

Creating the plpData object can take considerable +computing time, and it is probably a good idea to save it for future +sessions. Because plpData uses Andromeda, we +cannot use R’s regular save function. Instead, we’ll have to use the +savePlpData() function:

-savePlpData(plpData, "stroke_in_af_data")
-

We can use the loadPlpData() function to load the data in a future session.

-
-
-

-Additional inclusion criteria

-

To completely define the prediction problem the final study population is obtained by applying additional constraints on the two earlier defined cohorts, e.g., a minumim time at risk can be enforced (requireTimeAtRisk, minTimeAtRisk) and we can specify if this also applies to patients with the outcome (includeAllOutcomes). Here we also specify the start and end of the risk window relative to target cohort start. For example, if we like the risk window to start 30 days after the at-risk cohort start and end a year later we can set riskWindowStart = 30 and riskWindowEnd = 365. In some cases the risk window needs to start at the cohort end date. This can be achieved by setting addExposureToStart = TRUE which adds the cohort (exposure) time to the start date.

-

In Appendix 1, we demonstrate the effect of these settings on the subset of the persons in the target cohort that end up in the final study population.

-

In the example below all the settings we defined for our study are imposed:

+savePlpData(plpData, "stroke_in_af_data")
+

We can use the loadPlpData() function to load the data +in a future session.

+
+
+

Additional inclusion criteria +

+

To completely define the prediction problem the final study +population is obtained by applying additional constraints on the two +earlier defined cohorts, e.g., a minumim time at risk can be enforced +(requireTimeAtRisk, minTimeAtRisk) and we can specify if +this also applies to patients with the outcome +(includeAllOutcomes). Here we also specify the start and +end of the risk window relative to target cohort start. For example, if +we like the risk window to start 30 days after the at-risk cohort start +and end a year later we can set riskWindowStart = 30 and +riskWindowEnd = 365. In some cases the risk window needs to +start at the cohort end date. This can be achieved by setting +addExposureToStart = TRUE which adds the cohort (exposure) +time to the start date.

+

In Appendix 1, we demonstrate the effect of these settings on the +subset of the persons in the target cohort that end up in the final +study population.

+

In the example below all the settings we defined for our study are +imposed:

-  populationSettings <- createStudyPopulationSettings(
-  washoutPeriod = 1095,
-  firstExposureOnly = FALSE,
-  removeSubjectsWithPriorOutcome = FALSE,
-  priorOutcomeLookback = 1,
-  riskWindowStart = 1,
-  riskWindowEnd = 365,
-  startAnchor =  'cohort start',
-  endAnchor =  'cohort start',
-  minTimeAtRisk = 364,
-  requireTimeAtRisk = TRUE,
-  includeAllOutcomes = TRUE
-  )
-
-
-

-Spliting the data into training/validation/testing datasets

-

When developing a prediction model using supervised learning (when you have features paired with labels for a set of patients), the first step is to design the development/internal validation process. This requires specifying how to select the model hyper-parameters, how to learn the model parameters and how to fairly evaluate the model. In general, the validation set is used to pick hyper-parameters, the training set is used to learn the model parameters and the test set is used to perform fair internal validation. However, cross-validation can be implemented to pick the hyper-parameters on the training data (so a validation data set is not required). Cross validation can also be used to estimate internal validation (so a testing data set is not required).

-

In small data the best approach for internal validation has been shown to be boostrapping. However, in big data (many patients and many features) bootstrapping is generally not feasible. In big data our research has shown that it is just important to have some form of fair evaluation (use a test set or cross validation). For full details see our BMJ open paper.

-

In the PatientLevelPrediction package, the splitSettings define how the plpData are partitioned into training/validation/testing data. Cross validation is always done, but using a test set is optional (when the data are small, it may be optimal to not use a test set). For the splitSettings we can use the type (stratified/time/subject) and testFraction parameters to split the data in a 75%-25% split and run the patient-level prediction pipeline:

+ populationSettings <- createStudyPopulationSettings( + washoutPeriod = 1095, + firstExposureOnly = FALSE, + removeSubjectsWithPriorOutcome = FALSE, + priorOutcomeLookback = 1, + riskWindowStart = 1, + riskWindowEnd = 365, + startAnchor = 'cohort start', + endAnchor = 'cohort start', + minTimeAtRisk = 364, + requireTimeAtRisk = TRUE, + includeAllOutcomes = TRUE + )
+
+
+

Spliting the data into training/validation/testing datasets +

+

When developing a prediction model using supervised learning (when +you have features paired with labels for a set of patients), the first +step is to design the development/internal validation process. This +requires specifying how to select the model hyper-parameters, how to +learn the model parameters and how to fairly evaluate the model. In +general, the validation set is used to pick hyper-parameters, the +training set is used to learn the model parameters and the test set is +used to perform fair internal validation. However, cross-validation can +be implemented to pick the hyper-parameters on the training data (so a +validation data set is not required). Cross validation can also be used +to estimate internal validation (so a testing data set is not +required).

+

In small data the best approach for internal validation has been +shown to be boostrapping. However, in big data (many patients and many +features) bootstrapping is generally not feasible. In big data our +research has shown that it is just important to have some form of fair +evaluation (use a test set or cross validation). For full details see our BMJ open paper.

+

In the PatientLevelPrediction package, the splitSettings define how +the plpData are partitioned into training/validation/testing data. Cross +validation is always done, but using a test set is optional (when the +data are small, it may be optimal to not use a test set). For the +splitSettings we can use the type (stratified/time/subject) and +testFraction parameters to split the data in a 75%-25% split and run the +patient-level prediction pipeline:

-  splitSettings <- createDefaultSplitSetting(
-    trainFraction = 0.75,
-    testFraction = 0.25,
-    type = 'stratified',
-    nfold = 2, 
-    splitSeed = 1234
-    )
-

Note: it is possible to add a custom method to specify how the plpData are partitioned into training/validation/testing data, see vignette for custom splitting.

-
-
-

-Preprocessing the training data

-

There a numerous data processing settings that a user must specify when developing a prediction model. These are: * Whether to under-sample or over-sample the training data (this may be useful when there is class imballance (e.g., the outcome is very rare or very common)) * Whether to perform feature engineering or feature selection (e.g., create latent variables that are not observed in the data or reduce the dimensionality of the data) * Whether to remove redundant features and normalize the data (this is required for some models)

-

The default sample settings does nothing, it simply returns the trainData as input, see below:

+ splitSettings <- createDefaultSplitSetting( + trainFraction = 0.75, + testFraction = 0.25, + type = 'stratified', + nfold = 2, + splitSeed = 1234 + )
+

Note: it is possible to add a custom method to specify how the +plpData are partitioned into training/validation/testing data, see vignette +for custom splitting.

+
+
+

Preprocessing the training data +

+

There a numerous data processing settings that a user must specify +when developing a prediction model. These are: * Whether to under-sample +or over-sample the training data (this may be useful when there is class +imballance (e.g., the outcome is very rare or very common)) * Whether to +perform feature engineering or feature selection (e.g., create latent +variables that are not observed in the data or reduce the dimensionality +of the data) * Whether to remove redundant features and normalize the +data (this is required for some models)

+

The default sample settings does nothing, it simply returns the +trainData as input, see below:

-  sampleSettings <- createSampleSettings()
-

However, the current package contains methods of under-sampling the non-outcome patients. To perform undersampling, the type input should be ‘underSample’ and numberOutcomestoNonOutcomes must be specified (an integer specifying the number of non-outcomes per outcome). It is possible to add any custom function for over/under sampling, see vignette for custom sampling.

-

It is possible to specify a combination of feature engineering functions that take as input the trainData and output a new trainData with different features. The default feature engineering setting does nothing:

+ sampleSettings <- createSampleSettings()
+

However, the current package contains methods of under-sampling the +non-outcome patients. To perform undersampling, the type +input should be ‘underSample’ and +numberOutcomestoNonOutcomes must be specified (an integer +specifying the number of non-outcomes per outcome). It is possible to +add any custom function for over/under sampling, see vignette +for custom sampling.

+

It is possible to specify a combination of feature engineering +functions that take as input the trainData and output a new trainData +with different features. The default feature engineering setting does +nothing:

-  featureEngineeringSettings <- createFeatureEngineeringSettings()
-

However, it is possible to add custom feature engineering functions into the pipeline, see vignette for custom feature engineering.

-

Finally, the preprocessing setting is required. For this setting the user can define minFraction, this removes any features that is observed in the training data for less than 0.01 fraction of the patients. So, if minFraction = 0.01 then any feature that is seen in less than 1 percent of the target population is removed. The input normalize specifies whether the features are scaled between 0 and 1, this is required for certain models (e.g., LASSO logistic regression). The input removeRedundancy specifies whether features that are observed in all of the target population are removed.

+ featureEngineeringSettings <- createFeatureEngineeringSettings()
+

However, it is possible to add custom feature engineering functions +into the pipeline, see vignette +for custom feature engineering.

+

Finally, the preprocessing setting is required. For this setting the +user can define minFraction, this removes any features that +is observed in the training data for less than 0.01 fraction of the +patients. So, if minFraction = 0.01 then any feature that +is seen in less than 1 percent of the target population is removed. The +input normalize specifies whether the features are scaled +between 0 and 1, this is required for certain models (e.g., LASSO +logistic regression). The input removeRedundancy specifies +whether features that are observed in all of the target population are +removed.

-  preprocessSettingsSettings <- createPreprocessSettings(
-    minFraction = 0.01, 
-    normalize = T, 
-    removeRedundancy = T
-      )
-
-
-

-Model Development

-

In the set function of an algorithm the user can specify a list of eligible values for each hyper-parameter. All possible combinations of the hyper-parameters are included in a so-called grid search using cross-validation on the training set. If a user does not specify any value then the default value is used instead.

-

For example, if we use the following settings for the gradientBoostingMachine: ntrees=c(100,200), maxDepth=4 the grid search will apply the gradient boosting machine algorithm with ntrees=100 and maxDepth=4 plus the default settings for other hyper-parameters and ntrees=200 and maxDepth=4 plus the default settings for other hyper-parameters. The hyper-parameters that lead to the bestcross-validation performance will then be chosen for the final model. For our problem we choose to build a logistic regression model with the default hyper-parameters

+ preprocessSettingsSettings <- createPreprocessSettings( + minFraction = 0.01, + normalize = T, + removeRedundancy = T + )
+
+
+

Model Development +

+

In the set function of an algorithm the user can specify a list of +eligible values for each hyper-parameter. All possible combinations of +the hyper-parameters are included in a so-called grid search using +cross-validation on the training set. If a user does not specify any +value then the default value is used instead.

+

For example, if we use the following settings for the +gradientBoostingMachine: ntrees=c(100,200), maxDepth=4 the grid search +will apply the gradient boosting machine algorithm with ntrees=100 and +maxDepth=4 plus the default settings for other hyper-parameters and +ntrees=200 and maxDepth=4 plus the default settings for other +hyper-parameters. The hyper-parameters that lead to the +bestcross-validation performance will then be chosen for the final +model. For our problem we choose to build a logistic regression model +with the default hyper-parameters

-

The runPlP function requires the plpData, the outcomeId specifying the outcome being predicted and the settings: populationSettings, splitSettings, sampleSettings, featureEngineeringSettings, preprocessSettings and modelSettings to train and evaluate the model.

+lrModel <- setLassoLogisticRegression()
+

The runPlP function requires the plpData, +the outcomeId specifying the outcome being predicted and +the settings: populationSettings, +splitSettings, sampleSettings, +featureEngineeringSettings, preprocessSettings +and modelSettings to train and evaluate the model.

-  lrResults <- runPlp(
-    plpData = plpData,
-    outcomeId = 2, 
-    analysisId = 'singleDemo',
-    analysisName = 'Demonstration of runPlp for training single PLP models',
-    populationSettings = populationSettings, 
-    splitSettings = splitSettings,
-    sampleSettings = sampleSettings, 
-    featureEngineeringSettings = featureEngineeringSettings, 
-    preprocessSettings = preprocessSettings,
-    modelSettings = lrModel,
-    logSettings = createLogSettings(), 
-    executeSettings = createExecuteSettings(
-      runSplitData = T, 
-      runSampleData = T, 
-      runfeatureEngineering = T, 
-      runPreprocessData = T, 
-      runModelDevelopment = T, 
-      runCovariateSummary = T
-    ), 
-    saveDirectory = file.path(getwd(), 'singlePlp')
-    )
-

Under the hood the package will now use the Cyclops package to fit a large-scale regularized regression using 75% of the data and will evaluate the model on the remaining 25%. A results data structure is returned containing information about the model, its performance etc.

+ lrResults <- runPlp( + plpData = plpData, + outcomeId = 2, + analysisId = 'singleDemo', + analysisName = 'Demonstration of runPlp for training single PLP models', + populationSettings = populationSettings, + splitSettings = splitSettings, + sampleSettings = sampleSettings, + featureEngineeringSettings = featureEngineeringSettings, + preprocessSettings = preprocessSettings, + modelSettings = lrModel, + logSettings = createLogSettings(), + executeSettings = createExecuteSettings( + runSplitData = T, + runSampleData = T, + runfeatureEngineering = T, + runPreprocessData = T, + runModelDevelopment = T, + runCovariateSummary = T + ), + saveDirectory = file.path(getwd(), 'singlePlp') + )
+

Under the hood the package will now use the Cyclops package to +fit a large-scale regularized regression using 75% of the data and will +evaluate the model on the remaining 25%. A results data structure is +returned containing information about the model, its performance +etc.

You can save the model using:

-savePlpModel(lrResults$model, dirPath = file.path(getwd(), "model"))
+savePlpModel(lrResults$model, dirPath = file.path(getwd(), "model"))

You can load the model using:

-plpModel <- loadPlpModel(file.path(getwd(), "model"))
+plpModel <- loadPlpModel(file.path(getwd(), "model"))

You can also save the full results structure using:

-savePlpResult(lrResults, location = file.path(getwd(), "lr"))
+savePlpResult(lrResults, location = file.path(getwd(), "lr"))

To load the full results structure use:

-lrResults <- loadPlpResult(file.path(getwd(), "lr"))
+lrResults <- loadPlpResult(file.path(getwd(), "lr"))

-
-

-Example 2: Angioedema in ACE inhibitor users

-
-

-Study Specification

+
+

Example 2: Angioedema in ACE inhibitor users +

+
+

Study Specification +

--++ @@ -734,11 +1183,13 @@

- + - + @@ -790,11 +1241,13 @@

- + - + @@ -802,504 +1255,812 @@

Definition
Target Cohort (T)‘Patients who are newly dispensed an ACE inhibitor’ defined as the first drug record of any ACE inhibitor‘Patients who are newly dispensed an ACE inhibitor’ defined as the +first drug record of any ACE inhibitor
Outcome Cohort (O)‘Angioedema’ defined as an angioedema condition record during an inpatient or ER visit‘Angioedema’ defined as an angioedema condition record during an +inpatient or ER visit
Time-at-risk (TAR)
Hyper-parametersntree:5000, max depth:4 or 7 or 10 and learning rate: 0.001 or 0.01 or 0.1 or 0.9ntree:5000, max depth:4 or 7 or 10 and learning rate: 0.001 or 0.01 +or 0.1 or 0.9
CovariatesGender, Age, Conditions (ever before, <365), Drugs Groups (ever before, <365), and Visit CountGender, Age, Conditions (ever before, <365), Drugs Groups (ever +before, <365), and Visit Count
Data split
-

According to the best practices we need to make a protocol that completely specifies how we plan to execute our study. This protocol will be assessed by the governance boards of the participating data sources in your network study. For this a template could be used but we prefer to automate this process as much as possible by adding functionality to automatically generate study protocol from a study specification. We will discuss this in more detail later.

-
-
-

-Study implementation

-

Now we have completely design our study we have to implement the study. We have to generate the target and outcome cohorts and we need to develop the R code to run against our CDM that will execute the full study.

-
-

-Cohort instantiation

-

For our study we need to know when a person enters the target and outcome cohorts. This is stored in a table on the server that contains the cohort start date and cohort end date for all subjects for a specific cohort definition. This cohort table has a very simple structure as shown below:

+

According to the best practices we need to make a protocol that +completely specifies how we plan to execute our study. This protocol +will be assessed by the governance boards of the participating data +sources in your network study. For this a template could be used but we +prefer to automate this process as much as possible by adding +functionality to automatically generate study protocol from a study +specification. We will discuss this in more detail later.

+
+
+

Study implementation +

+

Now we have completely design our study we have to implement the +study. We have to generate the target and outcome cohorts and we need to +develop the R code to run against our CDM that will execute the full +study.

+
+

Cohort instantiation +

+

For our study we need to know when a person enters the target and +outcome cohorts. This is stored in a table on the server that contains +the cohort start date and cohort end date for all subjects for a +specific cohort definition. This cohort table has a very simple +structure as shown below:

  • -cohort_definition_id, a unique identifier for distinguishing between different types of cohorts, e.g. cohorts of interest and outcome cohorts.
  • +cohort_definition_id, a unique identifier for +distinguishing between different types of cohorts, e.g. cohorts of +interest and outcome cohorts.
  • -subject_id, a unique identifier corresponding to the person_id in the CDM.
  • +subject_id, a unique identifier corresponding to the +person_id in the CDM.
  • -cohort_start_date, the date the subject enters the cohort.
  • +cohort_start_date, the date the subject enters the +cohort.
  • -cohort_end_date, the date the subject leaves the cohort.
  • +cohort_end_date, the date the subject leaves the +cohort.
-

How do we fill this table according to our cohort definitions? There are two options for this:

+

How do we fill this table according to our cohort definitions? There +are two options for this:

    -
  1. use the interactive cohort builder tool in ATLAS which can be used to create cohorts based on inclusion criteria and will automatically populate this cohort table.

  2. -
  3. write your own custom SQL statements to fill the cohort table.

  4. +
  5. use the interactive cohort builder tool in ATLAS which can be used to create +cohorts based on inclusion criteria and will automatically populate this +cohort table.

  6. +
  7. write your own custom SQL statements to fill the cohort +table.

-

Both methods are described below for our example prediction problem.

+

Both methods are described below for our example prediction +problem.

-
-

-ATLAS cohort builder

-
-

Target Cohort ACE inhibitors

+
+

ATLAS cohort builder +

+
+Target Cohort ACE inhibitors
Target Cohort ACE inhibitors
-

ATLAS allows you to define cohorts interactively by specifying cohort entry and cohort exit criteria. Cohort entry criteria involve selecting one or more initial events, which determine the start date for cohort entry, and optionally specifying additional inclusion criteria which filter to the qualifying events. Cohort exit criteria are applied to each cohort entry record to determine the end date when the person’s episode no longer qualifies for the cohort. For the outcome cohort the end date is less relevant. As an example, Figure 6 shows how we created the ACE inhibitors cohort and Figure 7 shows how we created the angioedema cohort in ATLAS.

-
-

Outcome Cohort Angioedema

+

ATLAS allows you to define cohorts interactively by specifying cohort +entry and cohort exit criteria. Cohort entry criteria involve selecting +one or more initial events, which determine the start date for cohort +entry, and optionally specifying additional inclusion criteria which +filter to the qualifying events. Cohort exit criteria are applied to +each cohort entry record to determine the end date when the person’s +episode no longer qualifies for the cohort. For the outcome cohort the +end date is less relevant. As an example, Figure 6 shows how we created +the ACE inhibitors cohort and Figure 7 shows how we created the +angioedema cohort in ATLAS.

+
+Outcome Cohort Angioedema
Outcome Cohort Angioedema

The T and O cohorts can be found here:

-

In depth explanation of cohort creation in ATLAS is out of scope of this vignette but can be found on the OHDSI wiki pages (link).

-

Note that when a cohort is created in ATLAS the cohortid is needed to extract the data in R. The cohortid can be found at the top of the ATLAS screen, e.g. 1770617 in Figure 6.

-
-
-

-Custom cohorts

-

It is also possible to create cohorts without the use of ATLAS. Using custom cohort code (SQL) you can make more advanced cohorts if needed.

-

For our example study, we need to create a table to hold the cohort data and we need to create SQL code to instantiate this table for both the ACE inhibitor and angioedema cohorts. Therefore, we create a file called AceAngioCohorts.sql with the following contents:

-
  /***********************************
-    File AceAngioCohorts.sql 
-  ***********************************/
-    /*
-    Create a table to store the persons in the T and C cohort
-  */
-    
-    IF OBJECT_ID('@resultsDatabaseSchema.PLPAceAngioCohort', 'U') IS NOT NULL 
-  DROP TABLE @resultsDatabaseSchema.PLPAceAngioCohort;
-  
-  CREATE TABLE @resultsDatabaseSchema.PLPAceAngioCohort 
-  ( 
-    cohort_definition_id INT, 
-    subject_id BIGINT,
-    cohort_start_date DATE, 
-    cohort_end_date DATE
-  );
-  
-  
-  /*
-    T cohort:  [PatientLevelPrediction vignette]:  T : patients who are newly 
-  dispensed an ACE inhibitor
-  - persons with a drug exposure record of any 'ACE inhibitor' or 
-  any descendants, indexed at the first diagnosis
-  - who have >364 days of prior observation before their first dispensing
-  */
-    INSERT INTO @resultsDatabaseSchema.AceAngioCohort (cohort_definition_id, 
-                                                       subject_id, 
-                                                       cohort_start_date, 
-                                                       cohort_end_date)
-  SELECT 1 AS cohort_definition_id,
-  Ace.person_id AS subject_id,
-  Ace.drug_start_date AS cohort_start_date,
-  observation_period.observation_period_end_date AS cohort_end_date
-  FROM
-  (
-    SELECT person_id, min(drug_exposure_date) as drug_start_date
-    FROM @cdmDatabaseSchema.drug_exposure
-    WHERE drug_concept_id IN (SELECT descendant_concept_id FROM 
-                              @cdmDatabaseSchema.concept_ancestor WHERE ancestor_concept_id IN 
-                              (1342439,1334456, 1331235, 1373225, 1310756, 1308216, 1363749, 1341927, 1340128, 1335471 /*ace inhibitors*/))
-    GROUP BY person_id
-  ) Ace
-  INNER JOIN @cdmDatabaseSchema.observation_period
-  ON Ace.person_id = observation_period.person_id
-  AND Ace.drug_start_date >= dateadd(dd,364, 
-                                     observation_period.observation_period_start_date)
-  AND Ace.drug_start_date <= observation_period.observation_period_end_date
-  ;
-  
-  /*
-    C cohort:  [PatientLevelPrediction vignette]:  O: Angioedema
-  */
-    INSERT INTO @resultsDatabaseSchema.AceAngioCohort (cohort_definition_id, 
-                                                       subject_id, 
-                                                       cohort_start_date, 
-                                                       cohort_end_date)
-  SELECT 2 AS cohort_definition_id,
-  angioedema.person_id AS subject_id,
-  angioedema.condition_start_date AS cohort_start_date,
-  angioedema.condition_start_date AS cohort_end_date
-  FROM  
-  (
-    SELECT person_id, condition_start_date
-    FROM @cdmDatabaseSchema.condition_occurrence
-    WHERE condition_concept_id IN (SELECT DISTINCT descendant_concept_id FROM 
-                                   @cdmDatabaseSchema.concept_ancestor WHERE ancestor_concept_id IN 
-                                   (432791 /*angioedema*/) OR descendant_concept_id IN 
-                                   (432791 /*angioedema*/)
-    ) angioedema
-    
-    ;
-    
-

This is parameterized SQL which can be used by the SqlRender package. We use parameterized SQL so we do not have to pre-specify the names of the CDM and result schemas. That way, if we want to run the SQL on a different schema, we only need to change the parameter values; we do not have to change the SQL code. By also making use of translation functionality in SqlRender, we can make sure the SQL code can be run in many different environments.

-

To execute this sql against our CDM we first need to tell R how to connect to the server. PatientLevelPrediction uses the DatabaseConnector package, which provides a function called createConnectionDetails. Type ?createConnectionDetails for the specific settings required for the various database management systems (DBMS). For example, one might connect to a PostgreSQL database using this code:

+

In depth explanation of cohort creation in ATLAS is out of scope of +this vignette but can be found on the OHDSI wiki pages (link).

+

Note that when a cohort is created in ATLAS the cohortid is needed to +extract the data in R. The cohortid can be found at the top of the ATLAS +screen, e.g. 1770617 in Figure 6.

+
+
+

Custom cohorts +

+

It is also possible to create cohorts without the use of ATLAS. Using +custom cohort code (SQL) you can make more advanced cohorts if +needed.

+

For our example study, we need to create a table to hold the cohort +data and we need to create SQL code to instantiate this table for both +the ACE inhibitor and angioedema cohorts. Therefore, we create a file called +AceAngioCohorts.sql with the following contents:

+
  /***********************************
+    File AceAngioCohorts.sql 
+  ***********************************/
+    /*
+    Create a table to store the persons in the T and C cohort
+  */
+    
+    IF OBJECT_ID('@resultsDatabaseSchema.PLPAceAngioCohort', 'U') IS NOT NULL 
+  DROP TABLE @resultsDatabaseSchema.PLPAceAngioCohort;
+  
+  CREATE TABLE @resultsDatabaseSchema.PLPAceAngioCohort 
+  ( 
+    cohort_definition_id INT, 
+    subject_id BIGINT,
+    cohort_start_date DATE, 
+    cohort_end_date DATE
+  );
+  
+  
+  /*
+    T cohort:  [PatientLevelPrediction vignette]:  T : patients who are newly 
+  dispensed an ACE inhibitor
+  - persons with a drug exposure record of any 'ACE inhibitor' or 
+  any descendants, indexed at the first diagnosis
+  - who have >364 days of prior observation before their first dispensing
+  */
+    INSERT INTO @resultsDatabaseSchema.AceAngioCohort (cohort_definition_id, 
+                                                       subject_id, 
+                                                       cohort_start_date, 
+                                                       cohort_end_date)
+  SELECT 1 AS cohort_definition_id,
+  Ace.person_id AS subject_id,
+  Ace.drug_start_date AS cohort_start_date,
+  observation_period.observation_period_end_date AS cohort_end_date
+  FROM
+  (
+    SELECT person_id, min(drug_exposure_date) as drug_start_date
+    FROM @cdmDatabaseSchema.drug_exposure
+    WHERE drug_concept_id IN (SELECT descendant_concept_id FROM 
+                              @cdmDatabaseSchema.concept_ancestor WHERE ancestor_concept_id IN 
+                              (1342439,1334456, 1331235, 1373225, 1310756, 1308216, 1363749, 1341927, 1340128, 1335471 /*ace inhibitors*/))
+    GROUP BY person_id
+  ) Ace
+  INNER JOIN @cdmDatabaseSchema.observation_period
+  ON Ace.person_id = observation_period.person_id
+  AND Ace.drug_start_date >= dateadd(dd,364, 
+                                     observation_period.observation_period_start_date)
+  AND Ace.drug_start_date <= observation_period.observation_period_end_date
+  ;
+  
+  /*
+    C cohort:  [PatientLevelPrediction vignette]:  O: Angioedema
+  */
+    INSERT INTO @resultsDatabaseSchema.AceAngioCohort (cohort_definition_id, 
+                                                       subject_id, 
+                                                       cohort_start_date, 
+                                                       cohort_end_date)
+  SELECT 2 AS cohort_definition_id,
+  angioedema.person_id AS subject_id,
+  angioedema.condition_start_date AS cohort_start_date,
+  angioedema.condition_start_date AS cohort_end_date
+  FROM  
+  (
+    SELECT person_id, condition_start_date
+    FROM @cdmDatabaseSchema.condition_occurrence
+    WHERE condition_concept_id IN (SELECT DISTINCT descendant_concept_id FROM 
+                                   @cdmDatabaseSchema.concept_ancestor WHERE ancestor_concept_id IN 
+                                   (432791 /*angioedema*/) OR descendant_concept_id IN 
+                                   (432791 /*angioedema*/)
+    ) angioedema
+    
+    ;
+    
+

This is parameterized SQL which can be used by the SqlRender +package. We use parameterized SQL so we do not have to pre-specify the +names of the CDM and result schemas. That way, if we want to run the SQL +on a different schema, we only need to change the parameter values; we +do not have to change the SQL code. By also making use of translation +functionality in SqlRender, we can make sure the SQL code +can be run in many different environments.

+

To execute this sql against our CDM we first need to tell R how to +connect to the server. PatientLevelPrediction uses the DatabaseConnector +package, which provides a function called +createConnectionDetails. Type +?createConnectionDetails for the specific settings required +for the various database management systems (DBMS). For example, one +might connect to a PostgreSQL database using this code:

-    connectionDetails <- createConnectionDetails(dbms = "postgresql", 
-                                                 server = "localhost/ohdsi", 
-                                                 user = "joe", 
-                                                 password = "supersecret")
-    
-    cdmDatabaseSchema <- "my_cdm_data"
-    cohortsDatabaseSchema <- "my_results"
-    cdmVersion <- "5"
-

The last three lines define the cdmDatabaseSchema and cohortsDatabaseSchema variables, as well as the CDM version. We will use these later to tell R where the data in CDM format live, where we want to create the cohorts of interest, and what version CDM is used. Note that for Microsoft SQL Server, databaseschemas need to specify both the database and the schema, so for example cdmDatabaseSchema <- "my_cdm_data.dbo".

+ connectionDetails <- createConnectionDetails(dbms = "postgresql", + server = "localhost/ohdsi", + user = "joe", + password = "supersecret") + + cdmDatabaseSchema <- "my_cdm_data" + cohortsDatabaseSchema <- "my_results" + cdmVersion <- "5"
+

The last three lines define the cdmDatabaseSchema and +cohortsDatabaseSchema variables, as well as the CDM +version. We will use these later to tell R where the data in CDM format +live, where we want to create the cohorts of interest, and what version +CDM is used. Note that for Microsoft SQL Server, databaseschemas need to +specify both the database and the schema, so for example +cdmDatabaseSchema <- "my_cdm_data.dbo".

-    library(SqlRender)
-    sql <- readSql("AceAngioCohorts.sql")
-    sql <- render(sql,
-                  cdmDatabaseSchema = cdmDatabaseSchema,
-                  cohortsDatabaseSchema = cohortsDatabaseSchema)
-    sql <- translate(sql, targetDialect = connectionDetails$dbms)
-    
-    connection <- connect(connectionDetails)
-    executeSql(connection, sql)
-

In this code, we first read the SQL from the file into memory. In the next line, we replace four parameter names with the actual values. We then translate the SQL into the dialect appropriate for the DBMS we already specified in the connectionDetails. Next, we connect to the server, and submit the rendered and translated SQL.

-

If all went well, we now have a table with the events of interest. We can see how many events per type:

+ library(SqlRender) + sql <- readSql("AceAngioCohorts.sql") + sql <- render(sql, + cdmDatabaseSchema = cdmDatabaseSchema, + cohortsDatabaseSchema = cohortsDatabaseSchema) + sql <- translate(sql, targetDialect = connectionDetails$dbms) + + connection <- connect(connectionDetails) + executeSql(connection, sql)
+

In this code, we first read the SQL from the file into memory. In the +next line, we replace four parameter names with the actual values. We +then translate the SQL into the dialect appropriate for the DBMS we +already specified in the connectionDetails. Next, we +connect to the server, and submit the rendered and translated SQL.

+

If all went well, we now have a table with the events of interest. We +can see how many events per type:

-    sql <- paste("SELECT cohort_definition_id, COUNT(*) AS count",
-                 "FROM @cohortsDatabaseSchema.AceAngioCohort",
-                 "GROUP BY cohort_definition_id")
-    sql <- render(sql, cohortsDatabaseSchema = cohortsDatabaseSchema)
-    sql <- translate(sql, targetDialect = connectionDetails$dbms)
-    
-    querySql(connection, sql)
-
##   cohort_definition_id count
-## 1                    1     0
-## 2                    2     0
-
-
-

-Study script creation

-

In this section we assume that our cohorts have been created either by using ATLAS or a custom SQL script. We will first explain how to create an R script yourself that will execute our study as we have defined earlier.

-
-
-

-Data extraction

-

Now we can tell PatientLevelPrediction to extract all necessary data for our analysis. This is done using the FeatureExtractionPackage. In short the FeatureExtractionPackage allows you to specify which features (covariates) need to be extracted, e.g. all conditions and drug exposures. It also supports the creation of custom covariates. For more detailed information on the FeatureExtraction package see its vignettes. For our example study we decided to use these settings:

+ sql <- paste("SELECT cohort_definition_id, COUNT(*) AS count", + "FROM @cohortsDatabaseSchema.AceAngioCohort", + "GROUP BY cohort_definition_id") + sql <- render(sql, cohortsDatabaseSchema = cohortsDatabaseSchema) + sql <- translate(sql, targetDialect = connectionDetails$dbms) + + querySql(connection, sql)
+
##   cohort_definition_id count
+## 1                    1     0
+## 2                    2     0
+
+
+

Study script creation +

+

In this section we assume that our cohorts have been created either +by using ATLAS or a custom SQL script. We will first explain how to +create an R script yourself that will execute our study as we have +defined earlier.

+
+
+

Data extraction +

+

Now we can tell PatientLevelPrediction to extract all +necessary data for our analysis. This is done using the FeatureExtractionPackage. +In short the FeatureExtractionPackage allows you to specify which +features (covariates) need to be extracted, e.g. all conditions and drug +exposures. It also supports the creation of custom covariates. For more +detailed information on the FeatureExtraction package see its vignettes. For our +example study we decided to use these settings:

-    covariateSettings <- createCovariateSettings(useDemographicsGender = TRUE,
-                                                 useDemographicsAge = TRUE,
-                                                 useConditionGroupEraLongTerm = TRUE,
-                                                 useConditionGroupEraAnyTimePrior = TRUE,
-                                                 useDrugGroupEraLongTerm = TRUE,
-                                                 useDrugGroupEraAnyTimePrior = TRUE,
-                                                 useVisitConceptCountLongTerm = TRUE,
-                                                 longTermStartDays = -365,
-                                                 endDays = -1)
-

The final step for extracting the data is to run the getPlpData function and input the connection details, the database schema where the cohorts are stored, the cohort definition ids for the cohort and outcome, and the washoutPeriod which is the minimum number of days prior to cohort index date that the person must have been observed to be included into the data, and finally input the previously constructed covariate settings.

+ covariateSettings <- createCovariateSettings(useDemographicsGender = TRUE, + useDemographicsAge = TRUE, + useConditionGroupEraLongTerm = TRUE, + useConditionGroupEraAnyTimePrior = TRUE, + useDrugGroupEraLongTerm = TRUE, + useDrugGroupEraAnyTimePrior = TRUE, + useVisitConceptCountLongTerm = TRUE, + longTermStartDays = -365, + endDays = -1)
+

The final step for extracting the data is to run the +getPlpData function and input the connection details, the +database schema where the cohorts are stored, the cohort definition ids +for the cohort and outcome, and the washoutPeriod which is the minimum +number of days prior to cohort index date that the person must have been +observed to be included into the data, and finally input the previously +constructed covariate settings.

-databaseDetails <- createDatabaseDetails(
-  connectionDetails = connectionDetails,
-  cdmDatabaseSchema = cdmDatabaseSchema,
-  cohortDatabaseSchema = resultsDatabaseSchema,
-  cohortTable = 'AceAngioCohort',
-  cohortId = 1,
-  outcomeDatabaseSchema = resultsDatabaseSchema,
-  outcomeTable = 'AceAngioCohort',
-  outcomeIds = 2
-  )
-
-restrictPlpDataSettings <- createRestrictPlpDataSettings(
-  sampleSize = 10000
-  )
-
-plpData <- getPlpData(
-  databaseDetails = databaseDetails, 
-  covariateSettings = covariateSettings, 
-  restrictPlpDataSettings = restrictPlpDataSettings
-  )
-

Note that if the cohorts are created in ATLAS its corresponding cohort database schema needs to be selected. There are many additional parameters for the getPlpData function which are all documented in the PatientLevelPrediction manual. The resulting plpData object uses the package ff to store information in a way that ensures R does not run out of memory, even when the data are large.

-

Creating the plpData object can take considerable computing time, and it is probably a good idea to save it for future sessions. Because plpData uses ff, we cannot use R’s regular save function. Instead, we’ll have to use the savePlpData() function:

+databaseDetails <- createDatabaseDetails( + connectionDetails = connectionDetails, + cdmDatabaseSchema = cdmDatabaseSchema, + cohortDatabaseSchema = resultsDatabaseSchema, + cohortTable = 'AceAngioCohort', + cohortId = 1, + outcomeDatabaseSchema = resultsDatabaseSchema, + outcomeTable = 'AceAngioCohort', + outcomeIds = 2 + ) + +restrictPlpDataSettings <- createRestrictPlpDataSettings( + sampleSize = 10000 + ) + +plpData <- getPlpData( + databaseDetails = databaseDetails, + covariateSettings = covariateSettings, + restrictPlpDataSettings = restrictPlpDataSettings + )
+

Note that if the cohorts are created in ATLAS its corresponding +cohort database schema needs to be selected. There are many additional +parameters for the getPlpData function which are all +documented in the PatientLevelPrediction manual. The +resulting plpData object uses the package ff +to store information in a way that ensures R does not run out of memory, +even when the data are large.

+

Creating the plpData object can take considerable +computing time, and it is probably a good idea to save it for future +sessions. Because plpData uses ff, we cannot +use R’s regular save function. Instead, we’ll have to use the +savePlpData() function:

-savePlpData(plpData, "angio_in_ace_data")
-

We can use the loadPlpData() function to load the data in a future session.

-
-
-

-Additional inclusion criteria

-

To completely define the prediction problem the final study population is obtained by applying additional constraints on the two earlier defined cohorts, e.g., a minimum time at risk can be enforced (requireTimeAtRisk, minTimeAtRisk) and we can specify if this also applies to patients with the outcome (includeAllOutcomes). Here we also specify the start and end of the risk window relative to target cohort start. For example, if we like the risk window to start 30 days after the at-risk cohort start and end a year later we can set riskWindowStart = 30 and riskWindowEnd = 365. In some cases the risk window needs to start at the cohort end date. This can be achieved by setting addExposureToStart = TRUE which adds the cohort (exposure) time to the start date.

-

In Appendix 1, we demonstrate the effect of these settings on the subset of the persons in the target cohort that end up in the final study population.

-

In the example below all the settings we defined for our study are imposed:

+savePlpData(plpData, "angio_in_ace_data")
+

We can use the loadPlpData() function to load the data +in a future session.

+
+
+

Additional inclusion criteria +

+

To completely define the prediction problem the final study +population is obtained by applying additional constraints on the two +earlier defined cohorts, e.g., a minimum time at risk can be enforced +(requireTimeAtRisk, minTimeAtRisk) and we can specify if +this also applies to patients with the outcome +(includeAllOutcomes). Here we also specify the start and +end of the risk window relative to target cohort start. For example, if +we like the risk window to start 30 days after the at-risk cohort start +and end a year later we can set riskWindowStart = 30 and +riskWindowEnd = 365. In some cases the risk window needs to +start at the cohort end date. This can be achieved by setting +addExposureToStart = TRUE which adds the cohort (exposure) +time to the start date.

+

In Appendix 1, we demonstrate the effect of these settings on the +subset of the persons in the target cohort that end up in the final +study population.

+

In the example below all the settings we defined for our study are +imposed:

-    populationSettings <- createStudyPopulationSettings(
-      washoutPeriod = 364,
-      firstExposureOnly = FALSE,
-      removeSubjectsWithPriorOutcome = TRUE,
-      priorOutcomeLookback = 9999,
-      riskWindowStart = 1,
-      riskWindowEnd = 365, 
-      minTimeAtRisk = 364,
-      startAnchor = 'cohort start',
-      endAnchor = 'cohort start',
-      requireTimeAtRisk = TRUE,
-      includeAllOutcomes = TRUE
-    )
-
-
-

-Splitting the data into training/validation/testing datasets

-

When developing a prediction model using supervised learning (when you have features paired with labels for a set of patients), the first step is to design the development/internal validation process. This requires specifying how to select the model hyper-parameters, how to learn the model parameters and how to fairly evaluate the model. In general, the validation set is used to pick hyper-parameters, the training set is used to learn the model parameters and the test set is used to perform fair internal validation. However, cross-validation can be implemented to pick the hyper-parameters on the training data (so a validation data set is not required). Cross validation can also be used to estimate internal validation (so a testing data set is not required).

-

In small data the best approach for internal validation has been shown to be bootstrapping. However, in big data (many patients and many features) bootstrapping is generally not feasible. In big data our research has shown that it is just important to have some form of fair evaluation (use a test set or cross validation). For full details see our BMJ open paper.

-

In the PatientLevelPrediction package, the splitSettings define how the plpData are partitioned into training/validation/testing data. Cross validation is always done, but using a test set is optional (when the data are small, it may be optimal to not use a test set). For the splitSettings we can use the type (stratified/time/subject) and testFraction parameters to split the data in a 75%-25% split and run the patient-level prediction pipeline:

+ populationSettings <- createStudyPopulationSettings( + washoutPeriod = 364, + firstExposureOnly = FALSE, + removeSubjectsWithPriorOutcome = TRUE, + priorOutcomeLookback = 9999, + riskWindowStart = 1, + riskWindowEnd = 365, + minTimeAtRisk = 364, + startAnchor = 'cohort start', + endAnchor = 'cohort start', + requireTimeAtRisk = TRUE, + includeAllOutcomes = TRUE + )
+
+
+

Splitting the data into training/validation/testing datasets +

+

When developing a prediction model using supervised learning (when +you have features paired with labels for a set of patients), the first +step is to design the development/internal validation process. This +requires specifying how to select the model hyper-parameters, how to +learn the model parameters and how to fairly evaluate the model. In +general, the validation set is used to pick hyper-parameters, the +training set is used to learn the model parameters and the test set is +used to perform fair internal validation. However, cross-validation can +be implemented to pick the hyper-parameters on the training data (so a +validation data set is not required). Cross validation can also be used +to estimate internal validation (so a testing data set is not +required).

+

In small data the best approach for internal validation has been +shown to be bootstrapping. However, in big data (many patients and many +features) bootstrapping is generally not feasible. In big data our +research has shown that it is just important to have some form of fair +evaluation (use a test set or cross validation). For full details see our BMJ open paper.

+

In the PatientLevelPrediction package, the splitSettings define how +the plpData are partitioned into training/validation/testing data. Cross +validation is always done, but using a test set is optional (when the +data are small, it may be optimal to not use a test set). For the +splitSettings we can use the type (stratified/time/subject) and +testFraction parameters to split the data in a 75%-25% split and run the +patient-level prediction pipeline:

-  splitSettings <- createDefaultSplitSetting(
-    trainFraction = 0.75,
-    testFraction = 0.25,
-    type = 'stratified',
-    nfold = 2, 
-    splitSeed = 1234
-    )
-

Note: it is possible to add a custom method to specify how the plpData are partitioned into training/validation/testing data, see vignette for custom splitting.

-
-
-

-Preprocessing the training data

-

There are numerous data processing settings that a user must specify when developing a prediction model. These are: * Whether to under-sample or over-sample the training data (this may be useful when there is class imbalance (e.g., the outcome is very rare or very common)) * Whether to perform feature engineering or feature selection (e.g., create latent variables that are not observed in the data or reduce the dimensionality of the data) * Whether to remove redundant features and normalize the data (this is required for some models)

-

The default sample settings does nothing, it simply returns the trainData as input, see below:

+ splitSettings <- createDefaultSplitSetting( + trainFraction = 0.75, + testFraction = 0.25, + type = 'stratified', + nfold = 2, + splitSeed = 1234 + )
+

Note: it is possible to add a custom method to specify how the +plpData are partitioned into training/validation/testing data, see vignette +for custom splitting.

+
+
+

Preprocessing the training data +

+

There are numerous data processing settings that a user must specify +when developing a prediction model. These are: * Whether to under-sample +or over-sample the training data (this may be useful when there is class +imbalance (e.g., the outcome is very rare or very common)) * Whether to +perform feature engineering or feature selection (e.g., create latent +variables that are not observed in the data or reduce the dimensionality +of the data) * Whether to remove redundant features and normalize the +data (this is required for some models)

+

The default sample settings does nothing, it simply returns the +trainData as input, see below:

-  sampleSettings <- createSampleSettings()
-

However, the current package contains methods of under-sampling the non-outcome patients. To perform undersampling, the type input should be ‘underSample’ and numberOutcomestoNonOutcomes must be specified (an integer specifying the number of non-outcomes per outcome). It is possible to add any custom function for over/under sampling, see vignette for custom sampling.

-

It is possible to specify a combination of feature engineering functions that take as input the trainData and output a new trainData with different features. The default feature engineering setting does nothing:

+ sampleSettings <- createSampleSettings()
+

However, the current package contains methods of under-sampling the +non-outcome patients. To perform undersampling, the type +input should be ‘underSample’ and +numberOutcomestoNonOutcomes must be specified (an integer +specifying the number of non-outcomes per outcome). It is possible to +add any custom function for over/under sampling, see vignette +for custom sampling.

+

It is possible to specify a combination of feature engineering +functions that take as input the trainData and output a new trainData +with different features. The default feature engineering setting does +nothing:

-  featureEngineeringSettings <- createFeatureEngineeringSettings()
-

However, it is possible to add custom feature engineering functions into the pipeline, see vignette for custom feature engineering.

-

Finally, the preprocessing setting is required. For this setting the user can define minFraction, this removes any features that is observed in the training data for less than 0.01 fraction of the patients. So, if minFraction = 0.01 then any feature that is seen in less than 1 percent of the target population is removed. The input normalize specifies whether the features are scaled between 0 and 1, this is required for certain models (e.g., LASSO logistic regression). The input removeRedundancy specifies whether features that are observed in all of the target population are removed.

+ featureEngineeringSettings <- createFeatureEngineeringSettings()

+

However, it is possible to add custom feature engineering functions +into the pipeline, see vignette +for custom feature engineering.

+

Finally, the preprocessing setting is required. For this setting the +user can define minFraction, this removes any features that +is observed in the training data for less than 0.01 fraction of the +patients. So, if minFraction = 0.01 then any feature that +is seen in less than 1 percent of the target population is removed. The +input normalize specifies whether the features are scaled +between 0 and 1, this is required for certain models (e.g., LASSO +logistic regression). The input removeRedundancy specifies +whether features that are observed in all of the target population are +removed.

-  preprocessSettingsSettings <- createPreprocessSettings(
-    minFraction = 0.01, 
-    normalize = T, 
-    removeRedundancy = T
-      )
-
-
-

-Model Development

-

In the set function of an algorithm the user can specify a list of eligible values for each hyper-parameter. All possible combinations of the hyper-parameters are included in a so-called grid search using cross-validation on the training set. If a user does not specify any value then the default value is used instead.

-

For example, if we use the following settings for the gradientBoostingMachine: ntrees=c(100,200), maxDepth=4 the grid search will apply the gradient boosting machine algorithm with ntrees=100 and maxDepth=4 plus the default settings for other hyper-parameters and ntrees=200 and maxDepth=4 plus the default settings for other hyper-parameters. The hyper-parameters that lead to the best cross-validation performance will then be chosen for the final model. For our problem we choose to build a logistic regression model with the default hyper-parameters.

+ preprocessSettingsSettings <- createPreprocessSettings( + minFraction = 0.01, + normalize = T, + removeRedundancy = T + )
+
+
+

Model Development +

+

In the set function of an algorithm the user can specify a list of +eligible values for each hyper-parameter. All possible combinations of +the hyper-parameters are included in a so-called grid search using +cross-validation on the training set. If a user does not specify any +value then the default value is used instead.

+

For example, if we use the following settings for the +gradientBoostingMachine: ntrees=c(100,200), maxDepth=4 the grid search +will apply the gradient boosting machine algorithm with ntrees=100 and +maxDepth=4 plus the default settings for other hyper-parameters and +ntrees=200 and maxDepth=4 plus the default settings for other +hyper-parameters. The hyper-parameters that lead to the +bestcross-validation performance will then be chosen for the final +model. For our problem we choose to build a logistic regression model +with the default hyper-parameters

-gbmModel <- setGradientBoostingMachine(ntrees = 5000, maxDepth = c(4, 7, 10), learnRate = c(0.001, 
-    0.01, 0.1, 0.9))
-

The runPlP function requires the plpData, the outcomeId specifying the outcome being predicted and the settings: populationSettings, splitSettings, sampleSettings, featureEngineeringSettings, preprocessSettings and modelSettings to train and evaluate the model.

+gbmModel <- setGradientBoostingMachine(ntrees = 5000, maxDepth = c(4, 7, 10), learnRate = c(0.001, + 0.01, 0.1, 0.9))
+

The runPlP function requires the plpData, +the outcomeId specifying the outcome being predicted and +the settings: populationSettings, +splitSettings, sampleSettings, +featureEngineeringSettings, preprocessSettings +and modelSettings to train and evaluate the model.

-  gbmResults <- runPlp(
-    plpData = plpData,
-    outcomeId = 2, 
-    analysisId = 'singleDemo2',
-    analysisName = 'Demonstration of runPlp for training single PLP models',
-    populationSettings = populationSettings, 
-    splitSettings = splitSettings,
-    sampleSettings = sampleSettings, 
-    featureEngineeringSettings = featureEngineeringSettings, 
-    preprocessSettings = preprocessSettings,
-    modelSettings = gbmModel,
-    logSettings = createLogSettings(), 
-    executeSettings = createExecuteSettings(
-      runSplitData = T, 
-      runSampleData = T, 
-      runfeatureEngineering = T, 
-      runPreprocessData = T, 
-      runModelDevelopment = T, 
-      runCovariateSummary = T
-    ), 
-    saveDirectory = file.path(getwd(), 'singlePlpExample2')
-    )
-

Under the hood the package will now use the R xgboost package to fit a a gradient boosting machine model using 75% of the data and will evaluate the model on the remaining 25%. A results data structure is returned containing information about the model, its performance etc.

+ gbmResults <- runPlp( + plpData = plpData, + outcomeId = 2, + analysisId = 'singleDemo2', + analysisName = 'Demonstration of runPlp for training single PLP models', + populationSettings = populationSettings, + splitSettings = splitSettings, + sampleSettings = sampleSettings, + featureEngineeringSettings = featureEngineeringSettings, + preprocessSettings = preprocessSettings, + modelSettings = gbmModel, + logSettings = createLogSettings(), + executeSettings = createExecuteSettings( + runSplitData = T, + runSampleData = T, + runfeatureEngineering = T, + runPreprocessData = T, + runModelDevelopment = T, + runCovariateSummary = T + ), + saveDirectory = file.path(getwd(), 'singlePlpExample2') + )
+

Under the hood the package will now use the R xgboost package to fit +a a gradient boosting machine model using 75% of the data and will +evaluate the model on the remaining 25%. A results data structure is +returned containing information about the model, its performance +etc.

You can save the model using:

-savePlpModel(gbmResults$model, dirPath = file.path(getwd(), "model"))
+savePlpModel(gbmResults$model, dirPath = file.path(getwd(), "model"))

You can load the model using:

-plpModel <- loadPlpModel(file.path(getwd(), "model"))
+plpModel <- loadPlpModel(file.path(getwd(), "model"))

You can also save the full results structure using:

-savePlpResult(gbmResults, location = file.path(getwd(), "gbm"))
+savePlpResult(gbmResults, location = file.path(getwd(), "gbm"))

To load the full results structure use:

-gbmResults <- loadPlpResult(file.path(getwd(), "gbm"))
+gbmResults <- loadPlpResult(file.path(getwd(), "gbm"))
-
-

-Study package creation

-

The script we created manually above can also be automatically created using a powerful feature in ATLAS. By creating a new prediction study (left menu) you can select the Target and Outcome as created in ATLAS, set all the study parameters, and then you can download a R package that you can use to execute your study. What is really powerful is that you can add multiple Ts, Os, covariate settings etc. The package will then run all the combinations of automatically as separate analyses. The screenshots below explain this process.

+
+

Study package creation +

+

The script we created manually above can also be automatically +created using a powerful feature in ATLAS. By creating a new prediction +study (left menu) you can select the Target and Outcome as created in +ATLAS, set all the study parameters, and then you can download a R +package that you can use to execute your study. What is really powerful +is that you can add multiple Ts, Os, covariate settings etc. The package +will then run all the combinations of automatically as separate +analyses. The screenshots below explain this process.

  1. -Create a new prediction study and select your target and outcome cohorts. +Create a new prediction study and select your target and outcome +cohorts.
    - +
  2. -
  3. +
  4. +
    Specify one or more analysis settings.
    - +
    -
  5. +
    +
    +
  6. Specify the trainings settigns
    - +
  7. -
  8. +
  9. +
    Specify the execution settings
    - +
    -
  10. +
    +]
-
-

ATLAS can build a R package for you that will execute the full study against you CDM. Below the steps are explained how to do this in ATLAS.

+

ATLAS can build a R package for you that will execute the full study +against you CDM. Below the steps are explained how to do this in +ATLAS.

  1. -

    Under utilities you can find download. Click on the button to review the full study specification

    +

    Under utilities you can find download. Click on the button to review +the full study specification

    -
    -

    R package download functionality in ATLAS

    +
    +R package download functionality in ATLAS
    R package download functionality in ATLAS
  2. -

    You now have to review that you indeed want to run all these analyses (cartesian product of all the settings for each T and O combination.

    +

    You now have to review that you indeed want to run all these analyses +(cartesian product of all the settings for each T and O combination.

    -
    -

    R package download functionality in ATLAS

    +
    +R package download functionality in ATLAS
    R package download functionality in ATLAS
  3. -
  4. If you agree, you give the package a name, and download the package as a zipfile.

  5. -
  6. By opening the R package in R studio and building the package you can run the study using the execute function. Theres is also an example CodeToRun.R script available in the extras folder of the package with extra instructions.

  7. +
  8. If you agree, you give the package a name, and download the +package as a zipfile.

  9. +
  10. By opening the R package in R studio and building the package you +can run the study using the execute function. Theres is +also an example CodeToRun.R script available in the extras folder of the +package with extra instructions.

-
-

-Internal validation

-

Once we execute the study, the runPlp() function returns the trained model and the evaluation of the model on the train/test sets.

-

You can interactively view the results by running: viewPlp(runPlp=lrResults). This will generate a Shiny App in your browser in which you can view all performance measures created by the framework as shown in the figure below.

+
+

Internal validation +

+

Once we execute the study, the runPlp() function returns the trained +model and the evaluation of the model on the train/test sets.

+

You can interactively view the results by running: +viewPlp(runPlp=lrResults). This will generate a Shiny App +in your browser in which you can view all performance measures created +by the framework as shown in the figure below.

-Summary of all the performance measures of the analyses +
+Summary of all the performance measures of the analyses
Summary of all the performance measures of the +analyses
+
-Furthermore, many interactive plots are available in the Shiny App, for example the ROC curve in which you can move over the plot to see the threshold and the corresponding sensitivity and specificity values. +

Furthermore, many interactive plots are available in the Shiny App, +for example the ROC curve in which you can move over the plot to see the +threshold and the corresponding sensitivity and specificity values.

-Example of the interactive ROC curve +
+Example of the interactive ROC curve
Example of the interactive ROC curve
+
-

To generate and save all the evaluation plots to a folder run the following code:

+

To generate and save all the evaluation plots to a folder run the +following code:

-plotPlp(lrResults, dirPath = getwd())
+plotPlp(lrResults, dirPath = getwd())

The plots are described in more detail in the next sections.

-
-

-Discrimination

-

The Receiver Operating Characteristics (ROC) plot shows the sensitivity against 1-specificity on the test set. The plot illustrates how well the model is able to discriminate between the people with the outcome and those without. The dashed diagonal line is the performance of a model that randomly assigns predictions. The higher the area under the ROC plot the better the discrimination of the model. The plot is created by changing the probability threshold to assign the positive class.

+
+

Discrimination +

+

The Receiver Operating Characteristics (ROC) plot shows the +sensitivity against 1-specificity on the test set. The plot illustrates +how well the model is able to discriminate between the people with the +outcome and those without. The dashed diagonal line is the performance +of a model that randomly assigns predictions. The higher the area under +the ROC plot the better the discrimination of the model. The plot is +created by changing the probability threshold to assign the positive +class.

-Receiver Operating Characteristic Plot +
+Receiver Operating Characteristic Plot
Receiver Operating Characteristic Plot
+
-

## Calibration

-

The calibration plot shows how close the predicted risk is to the observed risk. The diagonal dashed line thus indicates a perfectly calibrated model. The ten (or fewer) dots represent the mean predicted values for each quantile plotted against the observed fraction of people in that quantile who had the outcome (observed fraction). The straight black line is the linear regression using these 10 plotted quantile mean predicted vs observed fraction points. The straight vertical lines represented the 95% lower and upper confidence intervals of the slope of the fitted line.

+
+

## Calibration

+

The calibration plot shows how close the predicted risk is to the +observed risk. The diagonal dashed line thus indicates a perfectly +calibrated model. The ten (or fewer) dots represent the mean predicted +values for each quantile plotted against the observed fraction of people +in that quantile who had the outcome (observed fraction). The straight +black line is the linear regression using these 10 plotted quantile mean +predicted vs observed fraction points. The straight vertical lines +represented the 95% lower and upper confidence intervals of the slope of +the fitted line.

-Calibration Plot +
+Calibration Plot
Calibration Plot
+
-
-

-Smooth Calibration

-

Similar to the traditional calibration shown above the Smooth Calibration plot shows the relationship between predicted and observed risk. the major difference is that the smooth fit allows for a more fine grained examination of this. Whereas the traditional plot will be heavily influenced by the areas with the highest density of data the smooth plot will provide the same information for this region as well as a more accurate interpretation of areas with lower density. the plot also contains information on the distribution of the outcomes relative to predicted risk.

-

However, the increased information gain comes at a computational cost. It is recommended to use the traditional plot for examination and then to produce the smooth plot for final versions. To create the smooth calibarion plot you have to run the follow command:

+
+

Smooth Calibration +

+

Similar to the traditional calibration shown above the Smooth +Calibration plot shows the relationship between predicted and observed +risk. the major difference is that the smooth fit allows for a more fine +grained examination of this. Whereas the traditional plot will be +heavily influenced by the areas with the highest density of data the +smooth plot will provide the same information for this region as well as +a more accurate interpretation of areas with lower density. the plot +also contains information on the distribution of the outcomes relative +to predicted risk.

+

However, the increased information gain comes at a computational +cost. It is recommended to use the traditional plot for examination and +then to produce the smooth plot for final versions. To create the smooth +calibarion plot you have to run the follow command:

-

See the help function for more information, on how to set the smoothing method etc.

-

The example below is from another study that better demonstrates the impact of using a smooth calibration plot. The default line fit would not highlight the miss-calibration at the lower predicted probability levels that well.

+plotSmoothCalibration(lrResults)
+

See the help function for more information, on how to set the +smoothing method etc.

+

The example below is from another study that better demonstrates the +impact of using a smooth calibration plot. The default line fit would +not highlight the miss-calibration at the lower predicted probability +levels that well.

-Smooth Calibration plot +
+Smooth Calibration plot
Smooth Calibration plot
+
-

## Preference distribution

-

The preference distribution plots are the preference score distributions corresponding to i) people in the test set with the outcome (red) and ii) people in the test set without the outcome (blue).

+
+

## Preference distribution

+

The preference distribution plots are the preference score +distributions corresponding to i) people in the test set with the +outcome (red) and ii) people in the test set without the outcome +(blue).

-Preference Plot +
+Preference Plot
Preference Plot
+
-

## Predicted probability distribution

-

The prediction distribution box plots are for the predicted risks of the people in the test set with the outcome (class 1: blue) and without the outcome (class 0: red).

-

The box plots in the Figure show that the predicted probability of the outcome is indeed higher for those with the outcome but there is also overlap between the two distribution which lead to an imperfect discrimination.

+
+

## Predicted probability distribution

+

The prediction distribution box plots are for the predicted risks of +the people in the test set with the outcome (class 1: blue) and without +the outcome (class 0: red).

+

The box plots in the Figure show that the predicted probability of +the outcome is indeed higher for those with the outcome but there is +also overlap between the two distribution which lead to an imperfect +discrimination.

-Prediction Distribution Box Plot +
+Prediction Distribution Box Plot
Prediction Distribution Box Plot
+
-

## Test-Train similarity

-

The test-train similarity is assessed by plotting the mean covariate values in the train set against those in the test set for people with and without the outcome.

-

The results for our example of look very promising since the mean values of the covariates are on the diagonal.

+
+

## Test-Train similarity

+

The test-train similarity is assessed by plotting the mean covariate +values in the train set against those in the test set for people with +and without the outcome.

+

The results for our example of look very promising since the mean +values of the covariates are on the diagonal.

-Similarity plots of train and test set +
+Similarity plots of train and test set
Similarity plots of train and test set
+
-

## Variable scatter plot

-

The variable scatter plot shows the mean covariate value for the people with the outcome against the mean covariate value for the people without the outcome. The color of the dots corresponds to the inclusion (green) or exclusion in the model (blue), respectively. It is highly recommended to use the Shiny App since this allows you to hoover over a covariate to show more details (name, value etc).

-

The plot shows that the mean of most of the covariates is higher for subjects with the outcome compared to those without.

+
+

## Variable scatter plot

+

The variable scatter plot shows the mean covariate value for the +people with the outcome against the mean covariate value for the people +without the outcome. The color of the dots corresponds to the inclusion +(green) or exclusion in the model (blue), respectively. It is highly +recommended to use the Shiny App since this allows you to hoover over a +covariate to show more details (name, value etc).

+

The plot shows that the mean of most of the covariates is higher for +subjects with the outcome compared to those without.

-Variabel scatter Plot +
+Variabel scatter Plot
Variabel scatter Plot
+
-

## Precision recall

-

Precision (P) is defined as the number of true positives (Tp) over the number of true positives plus the number of false positives (Fp).

+
+

## Precision recall

+

Precision (P) is defined as the number of true positives (Tp) over +the number of true positives plus the number of false positives +(Fp).

-P <- Tp/(Tp + Fp)
-

Recall (R) is defined as the number of true positives (Tp) over the number of true positives plus the number of false negatives (Fn).

+P <- Tp/(Tp + Fp)
+

Recall (R) is defined as the number of true positives (Tp) over the +number of true positives plus the number of false negatives (Fn).

-R <- Tp/(Tp + Fn)
-

These quantities are also related to the (F1) score, which is defined as the harmonic mean of precision and recall.

+R <- Tp/(Tp + Fn)
+

These quantities are also related to the (F1) score, which is defined +as the harmonic mean of precision and recall.

-F1 <- 2 * P * R/(P + R)
-

Note that the precision can either decrease or increase if the threshold is lowered. Lowering the threshold of a classifier may increase the denominator, by increasing the number of results returned. If the threshold was previously set too high, the new results may all be true positives, which will increase precision. If the previous threshold was about right or too low, further lowering the threshold will introduce false positives, decreasing precision.

-

For Recall the denominator does not depend on the classifier threshold (Tp+Fn is a constant). This means that lowering the classifier threshold may increase recall, by increasing the number of true positive results. It is also possible that lowering the threshold may leave recall unchanged, while the precision fluctuates.

+F1 <- 2 * P * R/(P + R)
+

Note that the precision can either decrease or increase if the +threshold is lowered. Lowering the threshold of a classifier may +increase the denominator, by increasing the number of results returned. +If the threshold was previously set too high, the new results may all be +true positives, which will increase precision. If the previous threshold +was about right or too low, further lowering the threshold will +introduce false positives, decreasing precision.

+

For Recall the denominator does not depend on the classifier +threshold (Tp+Fn is a constant). This means that lowering the classifier +threshold may increase recall, by increasing the number of true positive +results. It is also possible that lowering the threshold may leave +recall unchanged, while the precision fluctuates.

-Precision Recall Plot +
+Precision Recall Plot
Precision Recall Plot
+
-

## Demographic summary

-

This plot shows for females and males the expected and observed risk in different age groups together with a confidence area.

-

The results show that our model is well calibrated across gender and age groups.

+
+

## Demographic summary

+

This plot shows for females and males the expected and observed risk +in different age groups together with a confidence area.

+

The results show that our model is well calibrated across gender and +age groups.

-Demographic Summary Plot +
+Demographic Summary Plot
Demographic Summary Plot
+
-

# External validation

-

We recommend to always perform external validation, i.e. apply the final model on as much new datasets as feasible and evaluate its performance.

+
+

# External validation

+

We recommend to always perform external validation, i.e. apply the +final model on as much new datasets as feasible and evaluate its +performance.

-# load the trained model
-plpModel <- loadPlpModel(getwd(),'model')
-
-# add details of new database
-validationDatabaseDetails <- createDatabaseDetails()
-
-# to externally validate the model and perform recalibration run:
-externalValidateDbPlp(
-  plpModel = plpModel,
-  validationDatabaseDetails = validationDatabaseDetails,
-  validationRestrictPlpDataSettings = plpModel$settings$plpDataSettings,
-  settings = createValidationSettings(
-    recalibrate = 'weakRecalibration'
-    ),
-  outputFolder = getwd()
-)
-

This will extract the new plpData from the specified schemas and cohort tables. It will then apply the same population settings and the trained plp model. Finally, it will evaluate the performance and return the standard output as validation$performanceEvaluation and it will also return the prediction on the population as validation$prediction. They can be inserted into the shiny app for viewing the model and validation by running: viewPlp(runPlp=plpResult, validatePlp=validation ).

+# load the trained model +plpModel <- loadPlpModel(getwd(),'model') + +# add details of new database +validationDatabaseDetails <- createDatabaseDetails() + +# to externally validate the model and perform recalibration run: +externalValidateDbPlp( + plpModel = plpModel, + validationDatabaseDetails = validationDatabaseDetails, + validationRestrictPlpDataSettings = plpModel$settings$plpDataSettings, + settings = createValidationSettings( + recalibrate = 'weakRecalibration' + ), + outputFolder = getwd() +)
+

This will extract the new plpData from the specified schemas and +cohort tables. It will then apply the same population settings and the +trained plp model. Finally, it will evaluate the performance and return +the standard output as validation$performanceEvaluation and +it will also return the prediction on the population as +validation$prediction. They can be inserted into the shiny +app for viewing the model and validation by running: +viewPlp(runPlp=plpResult, validatePlp=validation ).

-
-

-Other functionality

-

The package has much more functionality than described in this vignette and contributions have been made my many persons in the OHDSI community. The table below provides an overview:

+
+

Other functionality +

+

The package has much more functionality than described in this +vignette and contributions have been made my many persons in the OHDSI +community. The table below provides an overview:

---+++ @@ -1309,156 +2070,185 @@

- - + + - - + + - - + + - - + + - - + + - - + + - - + +
Functionality
Builing Multiple ModelsThis vignette describes how you can run multiple models automaticallyVignetteThis vignette describes how you can run multiple models +automaticallyVignette
Custom ModelsThis vignette describes how you can add your own custom algorithms in the frameworkVignetteThis vignette describes how you can add your own custom algorithms +in the frameworkVignette
Custom Splitting FunctionsThis vignette describes how you can add your own custom training/validation/testing splitting functions in the frameworkVignetteThis vignette describes how you can add your own custom +training/validation/testing splitting functions in the frameworkVignette
Custom Sampling FunctionsThis vignette describes how you can add your own custom sampling functions in the frameworkVignetteThis vignette describes how you can add your own custom sampling +functions in the frameworkVignette
Custom Feature Engineering/SelectionThis vignette describes how you can add your own custom feature engineering and selection functions in the frameworkVignetteThis vignette describes how you can add your own custom feature +engineering and selection functions in the frameworkVignette
Ensemble modelsThis vignette describes how you can use the framework to build ensemble models, i.e combine multiple models in a super learnerVignetteThis vignette describes how you can use the framework to build +ensemble models, i.e combine multiple models in a super learnerVignette
Learning curvesLearning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below.VignetteLearning curves assess the effect of training set size on model +performance by training a sequence of prediction models on successively +larger subsets of the training set. A learning curve plot can also help +in diagnosing a bias or variance problem as explained below.Vignette
-
-

-Demos

-

We have added several demos in the package that run on simulated data:

+
+

Demos +

+

We have added several demos in the package that run on simulated +data:

-# Show all demos in our package: 
-demo(package = "PatientLevelPrediction")
-
-# For example, to run the SingleModelDemo that runs Lasso and shows you how to run the Shiny App use this call
-demo("SingleModelDemo", package = "PatientLevelPrediction")
+# Show all demos in our package: +demo(package = "PatientLevelPrediction") + +# For example, to run the SingleModelDemo that runs Lasso and shows you how to run the Shiny App use this call +demo("SingleModelDemo", package = "PatientLevelPrediction")
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

+
+

Acknowledgments +

+

Considerable work has been dedicated to provide the +PatientLevelPrediction package.

-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Further, PatientLevelPrediction makes extensive use of the Cyclops package.

+citation("PatientLevelPrediction")
+
## 
+## To cite PatientLevelPrediction in publications use:
+## 
+##   Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
+##   and implementation of a standardized framework to generate and
+##   evaluate patient-level prediction models using observational
+##   healthcare data." _Journal of the American Medical Informatics
+##   Association_, *25*(8), 969-975.
+##   <https://doi.org/10.1093/jamia/ocy032>.
+## 
+## A BibTeX entry for LaTeX users is
+## 
+##   @Article{,
+##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
+##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
+##     journal = {Journal of the American Medical Informatics Association},
+##     volume = {25},
+##     number = {8},
+##     pages = {969-975},
+##     year = {2018},
+##     url = {https://doi.org/10.1093/jamia/ocy032},
+##   }
+

Further, PatientLevelPrediction makes extensive use of +the Cyclops package.

-citation("Cyclops")
-
## 
-## To cite Cyclops in publications use:
-## 
-## Suchard MA, Simpson SE, Zorych I, Ryan P, Madigan D (2013). "Massive
-## parallelization of serial inference algorithms for complex generalized
-## linear models." _ACM Transactions on Modeling and Computer Simulation_,
-## *23*, 10. <URL: https://dl.acm.org/doi/10.1145/2414416.2414791>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {M. A. Suchard and S. E. Simpson and I. Zorych and P. Ryan and D. Madigan},
-##     title = {Massive parallelization of serial inference algorithms for complex generalized linear models},
-##     journal = {ACM Transactions on Modeling and Computer Simulation},
-##     volume = {23},
-##     pages = {10},
-##     year = {2013},
-##     url = {https://dl.acm.org/doi/10.1145/2414416.2414791},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

+citation("Cyclops")
+
## 
+## To cite Cyclops in publications use:
+## 
+##   Suchard MA, Simpson SE, Zorych I, Ryan P, Madigan D (2013). "Massive
+##   parallelization of serial inference algorithms for complex
+##   generalized linear models." _ACM Transactions on Modeling and
+##   Computer Simulation_, *23*, 10.
+##   <https://dl.acm.org/doi/10.1145/2414416.2414791>.
+## 
+## A BibTeX entry for LaTeX users is
+## 
+##   @Article{,
+##     author = {M. A. Suchard and S. E. Simpson and I. Zorych and P. Ryan and D. Madigan},
+##     title = {Massive parallelization of serial inference algorithms for complex generalized linear models},
+##     journal = {ACM Transactions on Modeling and Computer Simulation},
+##     volume = {23},
+##     pages = {10},
+##     year = {2013},
+##     url = {https://dl.acm.org/doi/10.1145/2414416.2414791},
+##   }
+

Please reference this paper if you use the PLP Package in +your work:

+

Reps JM, Schuemie +MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a +standardized framework to generate and evaluate patient-level prediction +models using observational healthcare data. J Am Med Inform Assoc. +2018;25(8):969-975.

+

This work is supported in part through the National Science +Foundation grant IIS 1251151.

-
-

-Appendix 1: Study population settings details

-

In the figures below the effect is shown of the removeSubjectsWithPriorOutcome, requireTimAtRisk, and includeAllOutcomes booleans on the final study population. We start with a Target Cohort with firstExposureOnly = false and we require a washout period = 1095. We then subset the target cohort based on additional constraints. The final study population in the Venn diagrams below are colored green.

+
+

Appendix 1: Study population settings +details +

+

In the figures below the effect is shown of the +removeSubjectsWithPriorOutcome, requireTimAtRisk, and includeAllOutcomes +booleans on the final study population. We start with a Target Cohort +with firstExposureOnly = false and we require a washout period = 1095. +We then subset the target cohort based on additional constraints. The +final study population in the Venn diagrams below are colored green.

  1. Require minimum time-at-risk for all person in the target cohort
    - +
  2. -Require minumum time-at-risk for target cohort, except for persons with outcomes during time-at-risk. +Require minumum time-at-risk for target cohort, except for persons with +outcomes during time-at-risk.
    - +
-) +

)

-Include all persons in the target cohort exclude persons with prior outcomes +Include all persons in the target cohort exclude persons with prior +outcomes
- +
  1. -Require minimum time-at-risk for target cohort, except for persons with outcomes during time-at-risk, exclude persons with prior outcomes +Require minimum time-at-risk for target cohort, except for persons with +outcomes during time-at-risk, exclude persons with prior outcomes
    - +
-) +

)

Include all persons in target cohort exclude persons with prior outcomes
- +
    @@ -1467,7 +2257,7 @@

    Include all persons in target cohort
    - +

@@ -1485,11 +2275,13 @@

-

Site built with pkgdown 1.6.1.

+

+

Site built with pkgdown 2.0.7.

@@ -1498,5 +2290,7 @@

+ + diff --git a/docs/articles/BuildingPredictiveModels_files/header-attrs-2.11/header-attrs.js b/docs/articles/BuildingPredictiveModels_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/BuildingPredictiveModels_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/BuildingPredictiveModels_files/header-attrs-2.7/header-attrs.js b/docs/articles/BuildingPredictiveModels_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/BuildingPredictiveModels_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/ClinicalModels.html b/docs/articles/ClinicalModels.html new file mode 100644 index 000000000..d3cd57891 --- /dev/null +++ b/docs/articles/ClinicalModels.html @@ -0,0 +1,273 @@ + + + + + + + +Clinical Models • PatientLevelPrediction + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + + +
+

Clinical models developed using the OHDSI PatientLevelPrediction +framework +

+ ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TitleLink
Using Machine Learning Applied to Real-World Healthcare Data for +Predictive Analytics: An Applied Example in Bariatric SurgeryValue +in Health
Development and validation of a prognostic model predicting +symptomatic hemorrhagic transformation in acute ischemic stroke at scale +in the OHDSI networkPLoS +One
Wisdom of the CROUD: development and validation of a patient-level +prediction model for opioid use disorder using population-level claims +dataPLoS +One
Developing predictive models to determine Patients in End-of-life +Care in Administrative datasetsDrug +Safety
Predictors of diagnostic transition from major depressive disorder +to bipolar disorder: a retrospective observational network studyTranslational +psychiatry
Seek COVER: using a disease proxy to rapidly develop and validate a +personalized risk calculator for COVID-19 outcomes in an international +networkBMC +Medical Research Methodology
90-Day all-cause mortality can be predicted following a total knee +replacement: an international, network study to develop and validate a +prediction modelKnee +Surgery, Sports Traumatology, Arthroscopy
Machine learning and real-world data to predict lung cancer risk in +routine careCancer +Epidemiology, Biomarkers & Prevention
Development and validation of a patient-level model to predict +dementia across a network of observational databasesBMC +medicine
+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.7.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/ConstrainedPredictors.html b/docs/articles/ConstrainedPredictors.html new file mode 100644 index 000000000..24fa22693 --- /dev/null +++ b/docs/articles/ConstrainedPredictors.html @@ -0,0 +1,561 @@ + + + + + + + +Constrained predictors • PatientLevelPrediction + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + + +
+

Constrained Predictors +

+
+

How to use the PhenotypeLibrary R package +

+

Here we provide a set of phenotypes that can be used as predictors in +prediction models or best practice research.

+

These phenotypes can be extracted from the PhenotypeLibrary R +package. To install the R package run:

+
+remotes::install_github('ohdsi/PhenotypeLibrary')
+

To extract the cohort definition for Alcoholism with an id of 1165, +just run:

+
+PhenotypeLibrary::getPlCohortDefinitionSet(1165)
+

in general you can extract all the cohorts by running:

+
+phenotypeDefinitions <- PhenotypeLibrary::getPlCohortDefinitionSet(1152:1215)
+
+
+

The full set of predictor phenotypes +

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Phenotype NameDisorder classificationOHDSI Phenotype library ID
AlcoholismBehavioral1165
SmokingBehavioral1166
AnemiaBlood1188
OsteoarthritisBone1184
OsteoporosisBone1185
CancerCancer1215
Atrial fibrillationCardiovascular1160
Congestive heart failureCardiovascular1154
Coronary artery diseaseCardiovascular1162
Heart valve disorderCardiovascular1172
HyperlipidemiaCardiovascular1170
HypertensionCardiovascular1198
AnginaCardiovascular1159
Skin UlcerDebility1168
Diabetes type 1Endocrine1193
Diabetes type 2Endocrine1194
HypothyroidismEndocrine1171
ObesityEndocrine1179
Gastroesophageal reflux disease (GERD)GI1178
Gastrointestinal (GI) bleedGI1197
Inflammatory bowel disorder (IBD)GI/Rheumatology1180
Hormonal contraceptivesGynecologic1190
Antibiotics AminoglycosidesInfection1201
Antibiotics CarbapenemsInfection1202
Antibiotics CephalosporinsInfection1203
Antibiotics FluoroquinolonesInfection1204
Antibiotics Glycopeptides and lipoglycopeptidesInfection1205
Antibiotics MacrolidesInfection1206
Antibiotics MonobactamsInfection1207
Antibiotics OxazolidinonesInfection1208
Antibiotics PenicillinsInfection1209
Antibiotics PolypeptidesInfection1210
Antibiotics RifamycinsInfection1211
Antibiotics SulfonamidesInfection1212
Antibiotics StreptograminsInfection1213
Antibiotics TetracyclinesInfection1214
PneumoniaInfection/Respiratory1199
SepsisInfection1176
Urinary tract infection (UTI)Infection1186
HepatitisLiver1169
AnxietyMood1189
Depression (MDD)Mood1161
Psychotic disorderMood1175
Antiepileptics (pain)Neurology/Pain1183
SeizureNeurology1153
Hemorrhagic strokeNeurology/Vascular1156
Non-hemorrhagic strokeNeurology/Vascular1155
Acetaminophen prescriptionPain/Infection1187
Low back painPain1173
NeuropathyPain/Neurology1174
OpioidsPain1182
Acute kidney injuryKidney1163
Chronic kidney diseaseKidney1191
AsthmaRespiratory1164
Chronic obstructive pulmonary disorder (COPD)Respiratory1192
DyspneaRespiratory1195
Respiratory failureRespiratory1177
Sleep apneaRespiratory1167
Rheumatoid arthritisRheumatology1200
SteroidsRheumatology/Pain/Pulmonary1181
Peripheral vascular diseaseVascular1157
AspirinVascular1158
Deep vein thrombosis (DVT)Vascular1152
EdemaVascular1196
Inpatient visitNANA
+
+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.7.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/CreatingLearningCurves.html b/docs/articles/CreatingLearningCurves.html index 739749046..2f44ca17f 100644 --- a/docs/articles/CreatingLearningCurves.html +++ b/docs/articles/CreatingLearningCurves.html @@ -19,6 +19,8 @@ + +
+
@@ -146,162 +161,243 @@

2022-03-09

-
-

-Introduction

-

This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package to create learning curves. This vignette assumes you have read and are comfortable with building patient level prediction models as described in the BuildingPredictiveModels vignette.

-

Prediction models will show overly-optimistic performance when predicting on the same data as used for training. Therefore, best-practice is to partition our data into a training set and testing set. We then train our prediction model on the training set portion and asses its ability to generalize to unseen data by measuring its performance on the testing set.

-

Learning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below.

-
-

Learning curve example.

+
+

Introduction +

+

This vignette describes how you can use the Observational Health Data +Sciences and Informatics (OHDSI) PatientLevelPrediction +package to create learning curves. This vignette assumes you have read +and are comfortable with building patient level prediction models as +described in the BuildingPredictiveModels +vignette.

+

Prediction models will show overly-optimistic performance when +predicting on the same data as used for training. Therefore, +best-practice is to partition our data into a training set and testing +set. We then train our prediction model on the training set portion and +asses its ability to generalize to unseen data by measuring its +performance on the testing set.

+

Learning curves assess the effect of training set size on model +performance by training a sequence of prediction models on successively +larger subsets of the training set. A learning curve plot can also help +in diagnosing a bias or variance problem as explained below.

+
+Learning curve example.
Learning curve example.
-

Figure 1, shows an example of learning curve plot in which the vertical axis represents the model performance and the horizontal axis the training set size. If training set size is small, the performance on the training set is high, because a model can often be fitted well to a limited number of training examples. At the same time, the performance on the testing set will be poor, because the model trained on such a limited number of training examples will not generalize well to unseen data in the testing set. As the training set size increases, the performance of the model on the training set will decrease. It becomes more difficult for the model to find a good fit through all the training examples. Also, the model will be trained on a more representative portion of training examples, making it generalize better to unseen data. This can be observed by the increasin testing set performance.

-

The learning curve can help us in diagnosing bias and variance problems with our classifier which will provide guidance on how to further improve our model. We can observe high variance (overfitting) in a prediction model if it performs well on the training set, but poorly on the testing set (Figure 2). Adding additional data is a common approach to counteract high variance. From the learning curve it becomes apparent, that adding additional data may improve performance on the testing set a little further, as the learning curve has not yet plateaued and, thus, the model is not saturated yet. Therefore, adding more data will decrease the gap between training set and testing set, which is the main indicator for a high variance problem.

-
-

Prediction model suffering from high variance.

+

Figure 1, shows an example of learning curve plot in which the +vertical axis represents the model performance and the horizontal axis +the training set size. If training set size is small, the performance on +the training set is high, because a model can often be fitted well to a +limited number of training examples. At the same time, the performance +on the testing set will be poor, because the model trained on such a +limited number of training examples will not generalize well to unseen +data in the testing set. As the training set size increases, the +performance of the model on the training set will decrease. It becomes +more difficult for the model to find a good fit through all the training +examples. Also, the model will be trained on a more representative +portion of training examples, making it generalize better to unseen +data. This can be observed by the increasin testing set performance.

+

The learning curve can help us in diagnosing bias and variance +problems with our classifier which will provide guidance on how to +further improve our model. We can observe high variance (overfitting) in +a prediction model if it performs well on the training set, but poorly +on the testing set (Figure 2). Adding additional data is a common +approach to counteract high variance. From the learning curve it becomes +apparent, that adding additional data may improve performance on the +testing set a little further, as the learning curve has not yet +plateaued and, thus, the model is not saturated yet. Therefore, adding +more data will decrease the gap between training set and testing set, +which is the main indicator for a high variance problem.

+
+Prediction model suffering from high variance.
Prediction model suffering from high +variance.
-

Furthermore, we can observe high bias (underfitting) if a prediction model performs poorly on the training set as well as on the testing set (Figure 3). The learning curves of training set and testing set have flattened on a low performance with only a small gap in between them. Adding additional data will in this case have little to no impact on the model performance. Choosing another prediction algorithm that can find more complex (for example non-linear) relationships in the data may be an alternative approach to consider in this high bias situation.

-
-

Prediction model suffering from high bias.

+

Furthermore, we can observe high bias (underfitting) if a prediction +model performs poorly on the training set as well as on the testing set +(Figure 3). The learning curves of training set and testing set have +flattened on a low performance with only a small gap in between them. +Adding additional data will in this case have little to no impact on the +model performance. Choosing another prediction algorithm that can find +more complex (for example non-linear) relationships in the data may be +an alternative approach to consider in this high bias situation.

+
+Prediction model suffering from high bias.
Prediction model suffering from high bias.
-
-

-Creating the learning curve

-

Use the PatientLevelPrediction package to create a plpData object . Alternatively, you can make use of the data simulator. The following code snippet creates data for 12000 patients.

+
+

Creating the learning curve +

+

Use the PatientLevelPrediction +package to create a plpData object . Alternatively, you can +make use of the data simulator. The following code snippet creates data +for 12000 patients.

-set.seed(1234)
-data(plpDataSimulationProfile)
-sampleSize <- 12000
-plpData <- simulatePlpData(
-  plpDataSimulationProfile,
-  n = sampleSize
-)
-

Specify the population settings (this does additional exclusions such as requiring minimum prior observation or no prior outcome as well as specifying the time-at-risk period to enable labels to be created):

+set.seed(1234) +data(plpDataSimulationProfile) +sampleSize <- 12000 +plpData <- simulatePlpData( + plpDataSimulationProfile, + n = sampleSize +)
+

Specify the population settings (this does additional exclusions such +as requiring minimum prior observation or no prior outcome as well as +specifying the time-at-risk period to enable labels to be created):

-populationSettings <- createStudyPopulationSettings(
-  binary = TRUE,
-  firstExposureOnly = FALSE,
-  washoutPeriod = 0,
-  removeSubjectsWithPriorOutcome = FALSE,
-  priorOutcomeLookback = 99999,
-  requireTimeAtRisk = FALSE,
-  minTimeAtRisk = 0,
-  riskWindowStart = 0,
-  riskWindowEnd = 365,
-  verbosity = "INFO"
-)
+populationSettings <- createStudyPopulationSettings( + binary = TRUE, + firstExposureOnly = FALSE, + washoutPeriod = 0, + removeSubjectsWithPriorOutcome = FALSE, + priorOutcomeLookback = 99999, + requireTimeAtRisk = FALSE, + minTimeAtRisk = 0, + riskWindowStart = 0, + riskWindowEnd = 365, + verbosity = "INFO" +)

Specify the prediction algorithm to be used.

-# Use LASSO logistic regression
-modelSettings <- setLassoLogisticRegression()
-

Specify the split settings and a sequence of training set fractions (these over ride the splitSetting trainFraction). Alternatively, instead of trainFractions, you can provide a sequence of training events (trainEvents) instead of the training set fractions. This is recommended, because our research has shown that number of events is the important determinant of model performance. Make sure that your training set contains the number of events specified.

+# Use LASSO logistic regression +modelSettings <- setLassoLogisticRegression()
+

Specify the split settings and a sequence of training set fractions +(these over ride the splitSetting trainFraction). Alternatively, instead +of trainFractions, you can provide a sequence of training +events (trainEvents) instead of the training set fractions. +This is recommended, because our research has shown that number of +events is the important determinant of model performance. Make sure that +your training set contains the number of events specified.

-splitSettings = createDefaultSplitSetting(
-  testFraction = 0.2,  
-  type = 'stratified',
-  splitSeed = 1000
-  )
-
-trainFractions <- seq(0.1, 0.8, 0.1) # Create eight training set fractions
-
-# alternatively use a sequence of training events by uncommenting the line below.
-# trainEvents <- seq(100, 5000, 100)
+splitSettings = createDefaultSplitSetting( + testFraction = 0.2, + type = 'stratified', + splitSeed = 1000 + ) + +trainFractions <- seq(0.1, 0.8, 0.1) # Create eight training set fractions + +# alternatively use a sequence of training events by uncommenting the line below. +# trainEvents <- seq(100, 5000, 100)

Create the learning curve object.

-learningCurve <- createLearningCurve(
-  plpData = plpData,
-  outcomeId = 2,  
-  parallel = T,
-  cores = 4,
-  modelSettings = modelSettings,
-  saveDirectory = getwd(),
-  analysisId = 'learningCurve',
-  populationSettings = populationSettings,
-  splitSettings = splitSettings,
-  trainFractions = trainFractions,
-  trainEvents = NULL,
-  preprocessSettings = createPreprocessSettings(
-    minFraction = 0.001,
-    normalize = T
-  ),
-  executeSettings = createExecuteSettings(
-    runSplitData = T, 
-    runSampleData = F,
-    runfeatureEngineering = F,
-    runPreprocessData = T,
-    runModelDevelopment = T,
-    runCovariateSummary = F
-    )
-)
-

Plot the learning curve object (Figure 4). Specify one of the available metrics: AUROC, AUPRC, sBrier. Moreover, you can specify what metric to put on the abscissa, number of observations or number of events. We recommend the latter, because events are determinant of model performance and allow you to better compare learning curves across different prediction problems and databases.

+learningCurve <- createLearningCurve( + plpData = plpData, + outcomeId = 2, + parallel = T, + cores = 4, + modelSettings = modelSettings, + saveDirectory = getwd(), + analysisId = 'learningCurve', + populationSettings = populationSettings, + splitSettings = splitSettings, + trainFractions = trainFractions, + trainEvents = NULL, + preprocessSettings = createPreprocessSettings( + minFraction = 0.001, + normalize = T + ), + executeSettings = createExecuteSettings( + runSplitData = T, + runSampleData = F, + runfeatureEngineering = F, + runPreprocessData = T, + runModelDevelopment = T, + runCovariateSummary = F + ) +)
+

Plot the learning curve object (Figure 4). Specify one of the +available metrics: AUROC, AUPRC, +sBrier. Moreover, you can specify what metric to put on the +abscissa, number of observations or number of +events. We recommend the latter, because +events are determinant of model performance and allow you +to better compare learning curves across different prediction problems +and databases.

-plotLearningCurve(
-  learningCurve,
-  metric = 'AUROC',
-  abscissa = 'events',
-  plotTitle = 'Learning Curve',
-  plotSubtitle = 'AUROC performance'
-)
-
-

Learning curve plot.

+plotLearningCurve( + learningCurve, + metric = 'AUROC', + abscissa = 'events', + plotTitle = 'Learning Curve', + plotSubtitle = 'AUROC performance' +)
+
+Learning curve plot.
Learning curve plot.
-
-

-Parallel processing

-

The learning curve object can be created in parallel, which can reduce computation time significantly. Whether to run the code in parallel or not is specified using the parallel input. Currently this functionality is only available for LASSO logistic regression and gradient boosting machines. Depending on the number of parallel workers it may require a significant amount of memory. We advise to use the parallelized learning curve function for parameter search and exploratory data analysis.

-

When running in parrallel, R will find the number of available processing cores automatically and register the required parallel backend. Alternatively, you can provide the number of cores you wish to use via the cores input.

+
+

Parallel processing +

+

The learning curve object can be created in parallel, which can +reduce computation time significantly. Whether to run the code in +parallel or not is specified using the parallel input. +Currently this functionality is only available for LASSO logistic +regression and gradient boosting machines. Depending on the number of +parallel workers it may require a significant amount of memory. We +advise to use the parallelized learning curve function for parameter +search and exploratory data analysis.

+

When running in parrallel, R will find the number of available +processing cores automatically and register the required parallel +backend. Alternatively, you can provide the number of cores you wish to +use via the cores input.

-
-

-Demo

+
+

Demo +

We have added a demo of the learningcurve:

-# Show all demos in our package: 
- demo(package = "PatientLevelPrediction")
-
-# Run the learning curve
- demo("LearningCurveDemo", package = "PatientLevelPrediction")
-

Do note that running this demo can take a considerable amount of time (15 min on Quad core running in parallel)!

+# Show all demos in our package: + demo(package = "PatientLevelPrediction") + +# Run the learning curve + demo("LearningCurveDemo", package = "PatientLevelPrediction")
+

Do note that running this demo can take a considerable amount of time +(15 min on Quad core running in parallel)!

-
-

-Publication

-

A publication titled ‘How little data do we need for patient-level prediction?’ uses the learning curve functionality in this package and can be accessed as preprint in the arXiv archives at https://arxiv.org/abs/2008.07361.

+
+

Publication +

+

A publication titled ‘How little data do we need for patient-level +prediction?’ uses the learning curve functionality in this package and +can be accessed as preprint in the arXiv archives at https://arxiv.org/abs/2008.07361.

-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

+
+

Acknowledgments +

+

Considerable work has been dedicated to provide the +PatientLevelPrediction package.

-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

+citation("PatientLevelPrediction")
+
## 
+## To cite PatientLevelPrediction in publications use:
+## 
+##   Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
+##   and implementation of a standardized framework to generate and
+##   evaluate patient-level prediction models using observational
+##   healthcare data." _Journal of the American Medical Informatics
+##   Association_, *25*(8), 969-975.
+##   <https://doi.org/10.1093/jamia/ocy032>.
+## 
+## A BibTeX entry for LaTeX users is
+## 
+##   @Article{,
+##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
+##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
+##     journal = {Journal of the American Medical Informatics Association},
+##     volume = {25},
+##     number = {8},
+##     pages = {969-975},
+##     year = {2018},
+##     url = {https://doi.org/10.1093/jamia/ocy032},
+##   }
+

Please reference this paper if you use the PLP Package in +your work:

+

Reps JM, Schuemie +MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a +standardized framework to generate and evaluate patient-level prediction +models using observational healthcare data. J Am Med Inform Assoc. +2018;25(8):969-975.

@@ -316,11 +412,13 @@

-

Site built with pkgdown 1.6.1.

+

+

Site built with pkgdown 2.0.7.

@@ -329,5 +427,7 @@

+ + diff --git a/docs/articles/CreatingLearningCurves_files/header-attrs-2.11/header-attrs.js b/docs/articles/CreatingLearningCurves_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/CreatingLearningCurves_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/CreatingLearningCurves_files/header-attrs-2.7/header-attrs.js b/docs/articles/CreatingLearningCurves_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/CreatingLearningCurves_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/CreatingNetworkStudies.html b/docs/articles/CreatingNetworkStudies.html index c5febb13c..bcf3648fa 100644 --- a/docs/articles/CreatingNetworkStudies.html +++ b/docs/articles/CreatingNetworkStudies.html @@ -19,6 +19,8 @@ + +
+
@@ -146,68 +162,106 @@

2022-03-09

-
-

-Introduction

-

The OHDSI Patient Level Prediction (PLP) package provides the framework to implement prediction models at scale. This can range from developing a large number of models across sites (methodology and study design insight) to extensive external validation of existing models in the OHDSI PLP framework (model insight). This vignette describes how you can use the PatientLevelPrediction package to create a network study package.

+
+

Introduction +

+

The OHDSI Patient Level Prediction (PLP) package provides the +framework to implement prediction models at scale. This can range from +developing a large number of models across sites (methodology and study +design insight) to extensive external validation of existing models in +the OHDSI PLP framework (model insight). This vignette describes how you +can use the PatientLevelPrediction package to create a +network study package.

-
-

-Useful publication

-

The open access publication A standardized analytics pipeline for reliable and rapid development and validation of prediction models using observational health data details the process used to develop and validate prediction models using the OHDSI prediction framework and tools. This publication describes each of the steps and then demonstrates these by focusing on predicting death in those who have covid-19.

+
+

Useful publication +

+

The open access publication A standardized +analytics pipeline for reliable and rapid development and validation of +prediction models using observational health data details the +process used to develop and validate prediction models using the OHDSI +prediction framework and tools. This publication describes each of the +steps and then demonstrates these by focusing on predicting death in +those who have covid-19.

-
-

-Main steps for running a network study

-
-

-Step 1 – developing the study

+
+

Main steps for running a network study +

+
+

Step 1 – developing the study +

    -
  • Design the study: target/outcome cohort logic, concept sets for medical definitions, settings for developing new model or validation of adding existing models to framework. Suggestion: look in literature for validated definitions.
  • -
  • Write a protocol that motivates the study and provides full details (sufficient for people to replicate the study in the future).
  • -
  • Write an R package for implementing the study across diverse computational environments [see guidance below for structure of package and use the skeleton github package here: https://github.com/OHDSI/SkeletonPredictionStudy ]
  • +
  • Design the study: target/outcome cohort logic, concept sets for +medical definitions, settings for developing new model or validation of +adding existing models to framework. Suggestion: look in literature for +validated definitions.
  • +
  • Write a protocol that motivates the study and provides full details +(sufficient for people to replicate the study in the future).
  • +
  • Write an R package for implementing the study across diverse +computational environments [see guidance below for structure of package +and use the skeleton github package here: https://github.com/OHDSI/SkeletonPredictionStudy ]
-
-

-Step 2 – implementing the study part 1

+
+

Step 2 – implementing the study part 1 +

    -
  • Get contributors to install the package and dependencies. Ensure the package is installed correctly for each contributor by asking them to run the checkInstall functions (as specified in the InstallationGuide).
  • -
  • Get contributors to run the createCohort function to inspect the target/outcome definitions. If the definitions are not suitable for a site, go back to step 1 and revise the cohort definitions.
  • +
  • Get contributors to install the package and dependencies. Ensure the +package is installed correctly for each contributor by asking them to +run the checkInstall functions (as specified in the +InstallationGuide).
  • +
  • Get contributors to run the createCohort function to inspect the +target/outcome definitions. If the definitions are not suitable for a +site, go back to step 1 and revise the cohort definitions.
-
-

-Step 3 – implementing the study part 2 (make sure the package is functioning as planned and the definitions are valid across sites)

+
+

Step 3 – implementing the study part 2 (make sure the package is +functioning as planned and the definitions are valid across sites) +

    -
  • Get contributors to run the main.R with the settings configured to their environment
  • +
  • Get contributors to run the main.R with the settings configured to +their environment
  • Get the contributors to submit the results
-
-

-Step 4 – Publication

-

The study creator has the first option to be first author, if he/she does not wish to be first author then he/she can pick the most suitable person from the contributors. All contributors will be listed as authors on the paper. The last author will be the person who lead/managed the study, if this was the first author then the first author can pick the most suitable last author. All authors between the first and last author will be alphabetical by last name.

+
+

Step 4 – Publication +

+

The study creator has the first option to be first author, if he/she +does not wish to be first author then he/she can pick the most suitable +person from the contributors. All contributors will be listed as authors +on the paper. The last author will be the person who lead/managed the +study, if this was the first author then the first author can pick the +most suitable last author. All authors between the first and last author +will be alphabetical by last name.

-
-

-Package Skeleton - File Structure

+
+

Package Skeleton - File Structure +

    -
  • DESCRIPTION: This file describes the R package and the dependencies
  • +
  • DESCRIPTION: This file describes the R package and the +dependencies
  • NAMESPACE: This file is created automatically by Roxygen
  • -
  • Readme.md: This file should provide the step by step guidance on implementing the package
  • +
  • Readme.md: This file should provide the step by step guidance on +implementing the package
  • R
  • -
  • helpers.r: all the custom functions used by the package should be in this file (e.g., checkInstall)
  • -
  • main.r: this file will call the functions in helpers.r to execute the full study
  • -
  • submit.r: this file will be called at the end to submit the compressed folder to the study creator/manager.
  • -
  • Man: this folder will contain the documentation for the functions in helpers.r (this should be automatically generated by roxygen)
  • +
  • helpers.r: all the custom functions used by the package should be in +this file (e.g., checkInstall)
  • +
  • main.r: this file will call the functions in helpers.r to execute +the full study
  • +
  • submit.r: this file will be called at the end to submit the +compressed folder to the study creator/manager.
  • +
  • Man: this folder will contain the documentation for the functions in +helpers.r (this should be automatically generated by roxygen)
  • Inst
  • -
  • sql/sql_sever * targetCohort: the target cohort parameterised sql code * outcomeCohort: the outcome cohort parameterised sql code
  • +
  • sql/sql_sever * targetCohort: the target cohort parameterised sql +code * outcomeCohort: the outcome cohort parameterised sql code
  • plp_models: place any PLP models here
  • Extras
@@ -225,11 +279,13 @@

-

Site built with pkgdown 1.6.1.

+

+

Site built with pkgdown 2.0.7.

@@ -238,5 +294,7 @@

+ + diff --git a/docs/articles/CreatingNetworkStudies_files/header-attrs-2.11/header-attrs.js b/docs/articles/CreatingNetworkStudies_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/CreatingNetworkStudies_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/CreatingNetworkStudies_files/header-attrs-2.7/header-attrs.js b/docs/articles/CreatingNetworkStudies_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/CreatingNetworkStudies_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/CreatingShinyApp.html b/docs/articles/CreatingShinyApp.html deleted file mode 100644 index 838fedfa8..000000000 --- a/docs/articles/CreatingShinyApp.html +++ /dev/null @@ -1,456 +0,0 @@ - - - - - - - -Creating Shiny App • PatientLevelPrediction - - - - - - - - - - -
-
- - - - -
-
- - - - - -
-

-Introduction

-

In this vignette we will show with example code how to create a shiny app and add the shiny app online for other researcher around the whole to explore.

-

There are two ways to create the shiny app: 1) Using the atlas R generated prediction package 2) Manually using the PatientLevelPrediction functions in a script

-

We assume you have experience with using the OHDSI PatientLevelPrediction package to develop and externally validate prediction models using data in the OMOP CDM. If you do not have experience with this then please first read our general vignette BuildingPredictiveModels vignette.

-
-
-

-Atlas Development Shiny App

-
-

-Step 1: Run the model development package to get results

-

To create a shiny app project via the Atlas auto-generated prediction R package you named ‘myPackage’ you need to run the execute function:

-
-library(myPackage)
-myPackage::execute(connectionDetails = connectionDetails,
-        cdmDatabaseSchema = 'myDatabaseSchema.dbo',
-        cdmDatabaseName = 'MyDatabase',
-        cohortDatabaseSchema = 'myDatabaseSchema.ohdsi_results',
-        cohortTable = 'cohort',
-        outputFolder = 'C:/myResults',
-        createProtocol = F,
-        createCohorts = F,
-        runAnalyses = T,
-        createResultsDoc = F,
-        packageResults = F,
-        createValidationPackage = F, 
-        minCellCount= 5,
-        createShiny = F,
-        createJournalDocument = F,
-        analysisIdDocument = 1)
-

This will extract data based on the settings you supplied in the Atlas prediction design from cohort tables already generated in your CDM database schema. The PatientLevelPrediction framework will then run and develop/evaluate models saving the results to the location specified by outputFolder (e.g., ‘C:/myResults’).

-
-
-

-Step 2: Create the shiny app

-

To create a shiny app project with these results you can then simply run:

-
-myPackage::execute(connectionDetails = connectionDetails,
-        cdmDatabaseSchema = 'myDatabaseSchema.dbo',
-        cdmDatabaseName = 'MyDatabase',
-        cohortDatabaseSchema = 'myDatabaseSchema.ohdsi_results',
-        cohortTable = 'cohort',
-        outputFolder = 'C:/myResults',
-        minCellCount= 5,
-        createShiny = T)
-

making sure the outputFolder is the same location used when you ran the analysis. This code populates a shiny app project with the results but removes sensitive date such as connection settings, the cdmDatabaseSchema setting, the predicton matrix and any sensitive counts less than ‘minCellCount’ from the covariate summary and performance evalaution.

-

The shiny app project populated with the model development results can then be found at ‘[outputFolder]/ShinyApp’ e.g., ‘C:/myResults/ShinyApp’.

- -
-
-

-Step 3: Sharing the shiny app

-

Once you are happy with your app, you can publish it onto https://data.ohdsi.org by adding the folder ‘ShinyApp’ to the OHDSI githib ShinyDeploy (https://github.com/OHDSI/ShinyDeploy/). Continuing the example, we would copy the folder ‘[outputFolder]/ShinyApp’ and paste it to the local github clone of ShinyDeploy. We recommend renaming the folder from ‘ShinyApp’ to a name that describes your prediction, e.g., ‘StrokeinAF’. Then commit the changes and make a pull request to ShinyDeploy. Once accepted your shiny app will be viewable at ‘https://data.ohdsi.org’. If you commited the folder named ‘StrokeInAF’ then the shiny app will be hosted at ‘https://data.ohdsi.org/StrokeInAF’.

-
-
-
-

-Atlas External Validation

-

To include external validation results you can use the Atlas generated R study package to create the external validation package:

-
-myPackage::execute(connectionDetails = connectionDetails,
-        cdmDatabaseSchema = 'myDatabaseSchema.dbo',
-        cdmDatabaseName = 'MyDatabase',
-        cohortDatabaseSchema = 'myDatabaseSchema.ohdsi_results',
-        cohortTable = 'cohort',
-        outputFolder = 'C:/myResults',
-        createValidationPackage = T)
-

This will create a new R package inside the ‘outputFolder’ location with the word ‘Validation’ appended the name of your development package. For example, if your ‘outputFolder’ was ‘C:/myResults’ and your development package was named ‘myPackage’ then the validation package will be found at: ‘C:/myResults/myPackageValidation’. When running the valdiation package make sure to set the ‘outputFolder’ to the Validation folder within your model development outputFolder location:

-
-myPackageValidation::execute(connectionDetails = connectionDetails,
-                 databaseName = databaseName,
-                 cdmDatabaseSchema = cdmDatabaseSchema,
-                 cohortDatabaseSchema = cohortDatabaseSchema,
-                 oracleTempSchema = oracleTempSchema,
-                 cohortTable = cohortTable,
-                 outputFolder = 'C:/myResults/Validation',
-                 createCohorts = T,
-                 runValidation = T,
-                 packageResults = F,
-                 minCellCount = 5,
-                 sampleSize = NULL)
-

Now you can rerun Steps 2-3 to populate the shiny app project that will also include the validation results (as long as the validation results are in the Validation folder found in the Step 1 outputFolder location e.g., in ‘C:/myResults/Validation’).

-
-
-

-Combining multiple atlas results into one shiny app:

-

The code below can be used to combine multiple Atlas packages’ results into one shiny app:

-
-populateMultipleShinyApp <- function(shinyDirectory,
-                             resultDirectory,
-                             minCellCount = 10,
-                             databaseName = 'sharable name of development data'){
-  
-  #check inputs
-  if(missing(shinyDirectory)){
-    shinyDirectory <- system.file("shiny", "PLPViewer", package = "SkeletonPredictionStudy")
-  }
-  if(missing(resultDirectory)){
-    stop('Need to enter the resultDirectory')
-  }
-  
-
-    for(i in 1:length(resultDirectory)){
-      if(!dir.exists(resultDirectory[i])){
-        stop(paste('resultDirectory ',i,' does not exist'))
-      }
-    }
-  
-  outputDirectory <- file.path(shinyDirectory,'data')
-  
-  # create the shiny data folder
-  if(!dir.exists(outputDirectory)){
-    dir.create(outputDirectory, recursive = T)
-  }
-  
-  
-  # need to edit settings ...
-  files <- c()
-  for(i in 1:length(resultDirectory)){
-  # copy the settings csv
-  file <- utils::read.csv(file.path(resultDirectory[i],'settings.csv'))
-  file$analysisId <- 1000*as.double(file$analysisId)+i
-  files <- rbind(files, file)
-  }
-  utils::write.csv(files, file.path(outputDirectory,'settings.csv'), row.names = F)
-  
-  for(i in 1:length(resultDirectory)){
-  # copy each analysis as a rds file and copy the log
-  files <- dir(resultDirectory[i], full.names = F)
-  files <- files[grep('Analysis', files)]
-  for(file in files){
-    
-    if(!dir.exists(file.path(outputDirectory,paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i)))){
-      dir.create(file.path(outputDirectory,paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i)))
-    }
-    
-    if(dir.exists(file.path(resultDirectory[i],file, 'plpResult'))){
-      res <- PatientLevelPrediction::loadPlpResult(file.path(resultDirectory[i],file, 'plpResult'))
-      res <- PatientLevelPrediction::transportPlp(res, n= minCellCount, 
-                                                  save = F, dataName = databaseName[i])
-      
-      res$covariateSummary <- res$covariateSummary[res$covariateSummary$covariateValue!=0,]
-      covSet <- res$model$metaData$call$covariateSettings
-      res$model$metaData <- NULL
-      res$model$metaData$call$covariateSettings <- covSet
-      res$model$predict <- NULL
-      if(!is.null(res$performanceEvaluation$evaluationStatistics)){
-      res$performanceEvaluation$evaluationStatistics[,1] <- paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i)
-      } else{
-        writeLines(paste0(resultDirectory[i],file, '-ev'))
-      }
-      if(!is.null(res$performanceEvaluation$thresholdSummary)){
-      res$performanceEvaluation$thresholdSummary[,1] <- paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i)
-      }else{
-        writeLines(paste0(resultDirectory[i],file, '-thres'))
-      }
-      if(!is.null(res$performanceEvaluation$demographicSummary)){
-      res$performanceEvaluation$demographicSummary[,1] <- paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i)
-      } else{
-        writeLines(paste0(resultDirectory[i],file, '-dem'))
-      }
-      if(!is.null(res$performanceEvaluation$calibrationSummary)){
-      res$performanceEvaluation$calibrationSummary[,1] <- paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i)
-      }else{
-        writeLines(paste0(resultDirectory[i],file, '-cal'))
-      }
-      if(!is.null(res$performanceEvaluation$predictionDistribution)){
-      res$performanceEvaluation$predictionDistribution[,1] <- paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i)
-      }else{
-        writeLines(paste0(resultDirectory[i],file, '-dist'))
-      }
-      saveRDS(res, file.path(outputDirectory,paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i), 'plpResult.rds'))
-    }
-    if(file.exists(file.path(resultDirectory[i],file, 'plpLog.txt'))){
-      file.copy(from = file.path(resultDirectory[i],file, 'plpLog.txt'), 
-                to = file.path(outputDirectory,paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i), 'plpLog.txt'))
-    }
-  }
-  }
-  
-  
-  
-  for(i in 1:length(resultDirectory)){
-  # copy any validation results
-  if(dir.exists(file.path(resultDirectory[i],'Validation'))){
-    valFolders <-  dir(file.path(resultDirectory[i],'Validation'), full.names = F)
-    
-    if(length(valFolders)>0){
-      # move each of the validation rds
-      for(valFolder in valFolders){
-        
-        # get the analysisIds
-        valSubfolders <- dir(file.path(resultDirectory[i],'Validation',valFolder), full.names = F)
-        if(length(valSubfolders)!=0){
-          for(valSubfolder in valSubfolders ){
-            valSubfolderUpdate <- paste0('Analysis_', as.double(gsub('Analysis_','', valSubfolder))*1000+i)
-            valOut <- file.path(valFolder,valSubfolderUpdate)
-            valOutOld <- file.path(valFolder,valSubfolder)
-            if(!dir.exists(file.path(outputDirectory,'Validation',valOut))){
-              dir.create(file.path(outputDirectory,'Validation',valOut), recursive = T)
-            }
-            
-            
-            if(file.exists(file.path(resultDirectory[i],'Validation',valOutOld, 'validationResult.rds'))){
-              res <- readRDS(file.path(resultDirectory[i],'Validation',valOutOld, 'validationResult.rds'))
-              res <- PatientLevelPrediction::transportPlp(res, n= minCellCount, 
-                                                          save = F, dataName = databaseName[i])
-              res$covariateSummary <- res$covariateSummary[res$covariateSummary$covariateValue!=0,]
-              saveRDS(res, file.path(outputDirectory,'Validation',valOut, 'validationResult.rds'))
-            }
-          }
-        }
-        
-      }
-      
-    }
-    
-  }
-  }
-  
-  return(outputDirectory)
-  
-}
-
-

-Example code to combine multiple results

-

The following code will combine the results found in ‘C:/myResults’, ‘C:/myResults2’ and ‘C:/myResults3’ into the shiny project at ‘C:/R/library/myPackage/shiny/PLPViewer’:

-
-populateMultipleShinyApp(shinyDirectory = 'C:/R/library/myPackage/shiny/PLPViewer',
-                                     resultDirectory = c('C:/myResults',
-                                                         'C:/myResults2',
-                                                         'C:/myResults3'),
-                                     minCellCount = 0,
-                                     databaseName = c('database1','database2','database3'))
-
-
-
-

-Manual App Creation

-

[instructions coming soon]

-
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

-
-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-
-
- - - -
- - - -
- -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - diff --git a/docs/articles/CreatingShinyApp_files/header-attrs-2.11/header-attrs.js b/docs/articles/CreatingShinyApp_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/CreatingShinyApp_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/CreatingShinyApp_files/header-attrs-2.7/header-attrs.js b/docs/articles/CreatingShinyApp_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/CreatingShinyApp_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/CustomPredictionAlgorithms.html b/docs/articles/CustomPredictionAlgorithms.html deleted file mode 100644 index c38730513..000000000 --- a/docs/articles/CustomPredictionAlgorithms.html +++ /dev/null @@ -1,463 +0,0 @@ - - - - - - - -Custom patient-level prediction algorithms • PatientLevelPrediction - - - - - - - - - -
-
- - - -
-
- - - - - - -
-

-Introduction

-

The PatientLevelPrediction package enables data extraction, model building, and model evaluation using data from databases that are translated into the Observational Medical Outcomes Partnership Common Data Model (OMOP CDM).

-

This vignette describes how you can add custom algorithms to the list of available algorithms in the PatientLevelPrediction package. This would allow you to fully leverage the OHDSI PatientLevelPrediction model development process with your own favourite algorithm.

-

Of course, we invite you to share your new algorithms with the community through the GitHub repository.

-
-
-

-General Structure

-

To add a custom classifier to the package you need to add the set, the fit functions into a R file with the model name. You also need to ensure there is a corresponding predict function in predict.R. For example, if you were to make a made up model, then in MadeUp.R you would add the following models:

-
-

-set

-

The setNewModel is a function that takes as input the different hyper-parameter values to do a grid search when training the model. The output of the model a list as class ‘modelSettings’ containing: + param - all the combinations of the hyper-parameter values input + model - a string specifying what function to call to fit the model + name - a string containing the name of the model.
-For example, if you were adding a model call madeUp that had two hyper-parameters then the set function would be:

-
setMadeUp <- function(a=1, b=2, seed=NULL){
-  # add input checks here...
-  
-  # now create list of all combinations:
-  result <- list(model='fitMadeUp', # this will be calle to train the made up model
-                 param= split(expand.grid(a=a, 
-                                          b=b,
-                                          seed=ifelse(is.null(seed),'NULL', seed)),
-                              1:(length(a)*length(b)  )),
-                 name='Made Up Algorithm'
-  )
-  class(result) <- 'modelSettings' 
-  
-  return(result)
-}
-
-
-

-fit

-

fitNewModel this function takes as input: * population - the study popualation the model is being developed on * plpData - the plpData object * param - the hyper-parameters as a list of all combinations * quiet - T or F indicating whether to output progress * outcomeId - the outcome id * cohortId - the target population id

-

then it trains a model for each param entry, picks the best param entry and trains a final model for that setting. The fit function returns a list of class plpModel with the following objects: * model - a trained model * modelSettings - a list containing the model and input param * trainCVAac - a value with the train AUC value * hyperParamSearch - a dataframe with the hyperparameter grid and corresponding AUCs * metaData - the metaData from the plpData object * populationSettings - the settings used to create the population and define the time-at-risk * outcomeId - the outcomeId being predicted * cohortId - the cohortId corresponding to the target cohort * varImp - a dataframe with the covaraites and a measure of importance * trainingTime - how long it took to develop/evaluate the model * covariateMap - if the plpData are converted to a matrix for model compatibility this tells us what covariate each row in the matrix correpsonds to and is need when implementing the model on new data The plpModle returned by fit also has a type attribute, this points to the predict function, for example attr(result, 'type') <- 'madeup' means when the model is applied to new data, the ‘predict.madeup’ function in Predict.R is called. if this doesnt exist, then the model will fail. Another attribute is the predictionType attr(result, 'predictionType') <- 'binary' this is currently not needed but may be important in the future when we expand to regression or multiclass classification.

-

The fit shell is:

-
fitMadeUp <- function(population, plpData, param, quiet=F,
-                        outcomeId, cohortId, ...){
-  
-  # **************** code to train the model here
-  # trainedModel <- this code should apply each hyper-param using the cross validation
-  #                 then pick out the best hyper-param setting
-  #                 and finally fit a model on the whole train data using the 
-  #                 optimal hyper-param settings
-  # ****************
-  
-  # construct the standard output for a model:
-  result <- list(model = trainedModel,
-                 modelSettings = list(model='made_up', modelParameters=param),
-                 trainCVAuc = NULL,
-                 hyperParamSearch = hyperSummary,
-                 metaData = plpData$metaData,
-                 populationSettings = attr(population, 'metaData'),
-                 outcomeId=outcomeId,# can use populationSettings$outcomeId?
-                 cohortId=cohortId,
-                 varImp = NULL,
-                 trainingTime=comp,
-                 covariateMap=result$map
-  )
-  class(result) <- 'plpModel'
-  attr(result, 'type') <- 'madeup'
-  attr(result, 'predictionType') <- 'binary'
-  return(result)
-    
-}
-

You can wish to make the fitMadeUp function cleaner by adding helper function in the MadeUp.R file that are called by the fit function. As the end of the fit function specified attr(result, 'type') <- 'madeup' we also need to make sure there is a predict.madeup function in Predict.R:

-
-
-

-predict

-

The prediction function takes as input the plpModel returned by fit, a population and corresponding plpData. It returns a data.frame with the columns: * rowId - the id for each person in the population * value - the predicted risk from the plpModel If the population contains the columns outcomeCount and indexes, then these are also output.

-
predict.madeup <- function(plpModel, population, plpData, ...) {
-    
-    # ************* code to do prediction for each rowId in population
-    # prediction <- code to do prediction here returning columns: rowId and
-    # value (predicted risk) **************
-    
-    prediction <- merge(population, prediction, by = "rowId")
-    prediction <- prediction[, colnames(prediction) %in% c("rowId", "outcomeCount", 
-        "indexes", "value")]
-    attr(prediction, "metaData") <- list(predictionType = "binary")
-    return(prediction)
-    
-}
-
-
-
-

-R Model Example

-
-

-set

-
setMadeUp <- function(a=1, b=2, seed=NULL){
-  # check a is valid positive value
-  if(missing(a)){
-    stop('a must be input')
-  }
-  if(!class(a)%in%c('numeric','integer'){
-    stop('a must be numeric')
-  }
-  if(a < 0){
-    stop('a must be positive')
-  }
-  # check b is numeric
-  if(missing(b)){
-    stop('b must be input')
-  }
-  if(!class(b)%in%c('numeric','integer'){
-    stop('b must be numeric')
-  }
-  
-  # now create list of all combinations:
-  result <- list(model='fitMadeUp', 
-                 param= split(expand.grid(a=a, 
-                                          b=b,
-                                          seed=ifelse(is.null(seed),'NULL', seed)),
-                              1:(length(a)*length(b)  )),
-                 name='Made Up Algorithm'
-  )
-  class(result) <- 'modelSettings' 
-  
-  return(result)
-    
-  
-}
-
-
-

-fit

-
fitMadeUp <- function(population, plpData, param, quiet=F,
-                        outcomeId, cohortId, ...){
-    if(!quiet)
-    writeLines('Training Made Up model')
-  
-  if(param[[1]]$seed!='NULL')
-    set.seed(param[[1]]$seed)
-  
-    # check plpData is coo format:
-  if(!'ffdf'%in%class(plpData$covariates) )
-    stop('This algorithm requires plpData in coo format')
-  
-  metaData <- attr(population, 'metaData')
-  if(!is.null(population$indexes))
-    population <- population[population$indexes>0,]
-  attr(population, 'metaData') <- metaData
-  #TODO - how to incorporate indexes?
-  
-  # convert data into sparse R Matrix:
-  result <- toSparseM(plpData,population,map=NULL)
-  data <- result$data
-  
-  data <- data[population$rowId,]
-  
-  # set test/train sets (for printing performance as it trains)
-  if(!quiet)
-    writeLines(paste0('Training made up model on train set containing ', nrow(population), ' people with ',sum(population$outcomeCount>0), ' outcomes'))
-  start <- Sys.time()
-  
-  #============= STEP 1 ======================================
-  # pick the best hyper-params and then do final training on all data...
-  writeLines('train')
-  datas <- list(population=population, data=data)
-  param.sel <- lapply(param, function(x) do.call(made_up_model, c(x,datas)  ))
-  hyperSummary <- do.call(rbind, lapply(param.sel, function(x) x$hyperSum))
-  hyperSummary <- as.data.frame(hyperSummary)
-  hyperSummary$auc <- unlist(lapply(param.sel, function(x) x$auc)) 
-  param.sel <- unlist(lapply(param.sel, function(x) x$auc))
-  param <- param[[which.max(param.sel)]]
-  
-  # set this so you do a final model train 
-  param$final=T
-  
-  writeLines('final train')
-  trainedModel <- do.call(made_up_model, c(param,datas)  )$model
-  
-  comp <- Sys.time() - start
-  if(!quiet)
-    writeLines(paste0('Model Made Up trained - took:',  format(comp, digits=3)))
-  
-  # construct the standard output for a model:
-  result <- list(model = trainedModel,
-                 modelSettings = list(model='made_up', modelParameters=param),
-                 trainCVAuc = NULL,
-                 hyperParamSearch = hyperSummary,
-                 metaData = plpData$metaData,
-                 populationSettings = attr(population, 'metaData'),
-                 outcomeId=outcomeId,# can use populationSettings$outcomeId?
-                 cohortId=cohortId,
-                 varImp = NULL,
-                 trainingTime=comp,
-                 covariateMap=result$map
-  )
-  class(result) <- 'plpModel'
-  attr(result, 'type') <- 'madeup'
-  attr(result, 'predictionType') <- 'binary'
-  return(result)
-    
-}
-
-
-

-helpers

-

In the fit model I specified calling made_up_model, this is the function that trains a model given the data and population (where the popualtion contains a column outcomeCount corresponding to the outcome). Both the data and population are ordered the same way:

-
made_up_model <- function(data, population, a = 1, b = 1, final = F, ...) {
-    
-    writeLines(paste("Training Made Up model with ", length(unique(population$indexes)), 
-        " fold CV"))
-    if (!is.null(population$indexes) && final == F) {
-        index_vect <- unique(population$indexes)
-        perform <- c()
-        
-        # create prediction matrix to store all predictions
-        predictionMat <- population
-        predictionMat$value <- 0
-        attr(predictionMat, "metaData") <- list(predictionType = "binary")
-        
-        for (index in 1:length(index_vect)) {
-            writeLines(paste("Fold ", index, " -- with ", sum(population$indexes != 
-                index), "train rows"))
-            model <- madeup::model(x = data[population$indexes != index, ], 
-                y = population$outcomeCount[population$indexes != index], a = a, 
-                b = b)
-            
-            pred <- stats::predict(model, data[population$indexes == index, 
-                ])
-            prediction <- population[population$indexes == index, ]
-            prediction$value <- pred
-            attr(prediction, "metaData") <- list(predictionType = "binary")
-            aucVal <- computeAuc(prediction)
-            perform <- c(perform, aucVal)
-            
-            # add the fold predictions and compute AUC after loop
-            predictionMat$value[population$indexes == index] <- pred
-            
-        }
-        ## auc <- mean(perform) # want overal rather than mean
-        auc <- computeAuc(predictionMat)
-        
-        foldPerm <- perform
-    } else {
-        model <- madeup::model(x = data, y = population$outcomeCount, a = a, 
-            b = b)
-        
-        pred <- stats::predict(model, data)
-        prediction <- population
-        prediction$value <- pred
-        attr(prediction, "metaData") <- list(predictionType = "binary")
-        auc <- computeAuc(prediction)
-        foldPerm <- auc
-    }
-    
-    result <- list(model = model, auc = auc, hyperSum = unlist(list(a = a, b = b, 
-        fold_auc = foldPerm)))
-    return(result)
-}
-
-
-

-Predict

-

The final step is to create a predict function for the model. This gets added to the predict.R file. In the example above the type attr(result, 'type') <- 'madeup' was madeup, so a predict.madeup function is required to be added into the predict.R. The predict function needs to take as input the plpModel returned by the fit function, the population to apply the model on and the plpData specifying the covariates of the population.

-
predict.madeup <- function(plpModel, population, plpData, ...) {
-    result <- toSparseM(plpData, population, map = plpModel$covariateMap)
-    data <- result$data[population$rowId, ]
-    prediction <- data.frame(rowId = population$rowId, value = stats::predict(plpModel$model, 
-        data))
-    
-    prediction <- merge(population, prediction, by = "rowId")
-    prediction <- prediction[, colnames(prediction) %in% c("rowId", "outcomeCount", 
-        "indexes", "value")]  # need to fix no index issue
-    attr(prediction, "metaData") <- list(predictionType = "binary")
-    return(prediction)
-    
-}
-

As the madeup model uses the standard R prediction, it has the same prediction function as xgboost, so we could have skipped adding a new prediction function and instead set the type of the result returned by fitMadeUpModel to attr(result, 'type') <- 'xgboost'.

-
-
-
- - - -
- - -
- -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - diff --git a/docs/articles/Figure1.png b/docs/articles/Figure1.png deleted file mode 100644 index 878a509e2..000000000 Binary files a/docs/articles/Figure1.png and /dev/null differ diff --git a/docs/articles/Figure1.webp b/docs/articles/Figure1.webp new file mode 100644 index 000000000..42ad71d7f Binary files /dev/null and b/docs/articles/Figure1.webp differ diff --git a/docs/articles/GeneratingLearningCurves.html b/docs/articles/GeneratingLearningCurves.html deleted file mode 100644 index cb9467f1d..000000000 --- a/docs/articles/GeneratingLearningCurves.html +++ /dev/null @@ -1,294 +0,0 @@ - - - - - - - -Generating Learning Curves • PatientLevelPrediction - - - - - - - - - - -
-
- - - - -
-
- - - - -
-

-Introduction

-

This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package to generate learning curves. This vignette assumes you have read and are comfortable with building patient level prediction models as described in the BuildingPredictiveModels vignette.

-

Prediction models will show overly-optimistic performance when predicting on the same data as used for training. Therefore, best-practice is to partition our data into a training set and testing set. We then train our prediction model on the training set portion and asses its ability to generalize to unseen data by measuring its performance on the testing set.

-

Learning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below.

-
-

Learning curve example.

-
-

Figure 1 shows an example of a learning curve plot in which the vertical axis represents the model performance and the horizontal axis the training set size. If the training set size is small, the performance on the training set is high, because a model can often be fitted well to a limited number of training examples. At the same time, the performance on the testing set will be poor, because the model trained on such a limited number of training examples will not generalize well to unseen data in the testing set. As the training set size increases, the performance of the model on the training set will decrease. It becomes more difficult for the model to find a good fit through all the training examples. Also, the model will be trained on a more representative portion of training examples, making it generalize better to unseen data. This can be observed by the increasing testing set performance.

-

The learning curve can help us in diagnosing bias and variance problems with our classifier which will provide guidance on how to further improve our model. We can observe high variance (overfitting) in a prediction model if it performs well on the training set, but poorly on the testing set (Figure 2). Adding additional data is a common approach to counteract high variance. From the learning curve it becomes apparent, that adding additional data may improve performance on the testing set a little further, as the learning curve has not yet plateaued and, thus, the model is not saturated yet. Therefore, adding more data will decrease the gap between training set and testing set, which is the main indicator for a high variance problem.

-
-

Prediction model suffering from high variance.

-
-

Furthermore, we can observe high bias (underfitting) if a prediction model performs poorly on the training set as well as on the testing set (Figure 3). The learning curves of training set and testing set have flattened on a low performance with only a small gap in between them. Adding additional data will in this case have little to no impact on the model performance. Choosing another prediction algorithm that can find more complex (for example non-linear) relationships in the data may be an alternative approach to consider in this high bias situation.

-
-

Prediction model suffering from high bias.

-
-
-
-

-Generating the learning curve

-

Use the PatientLevelPrediction package to generate a population and plpData object. Alternatively, you can make use of the data simulator. The following code snippet creates a population of 12000 patients.

-
set.seed(1234)
-data(plpDataSimulationProfile)
-sampleSize <- 12000
-plpData <- simulatePlpData(
-  plpDataSimulationProfile,
-  n = sampleSize
-)
-
-population <- createStudyPopulation(
-  plpData,
-  outcomeId = 2,
-  binary = TRUE,
-  firstExposureOnly = FALSE,
-  washoutPeriod = 0,
-  removeSubjectsWithPriorOutcome = FALSE,
-  priorOutcomeLookback = 99999,
-  requireTimeAtRisk = FALSE,
-  minTimeAtRisk = 0,
-  riskWindowStart = 0,
-  addExposureDaysToStart = FALSE,
-  riskWindowEnd = 365,
-  addExposureDaysToEnd = FALSE,
-  verbosity = "INFO"
-)
-

Specify the prediction algorithm to be used.

-
# Use LASSO logistic regression
-modelSettings <- setLassoLogisticRegression()
-

Specify a test fraction and a sequence of training set fractions.

-
testFraction <- 0.2
-trainFractions <- seq(0.1, 0.8, 0.1) # Create eight training set fractions
-

Specify the test split to be used.

-
# Use a split by person, alternatively a time split is possible
-testSplit <- 'stratified'
-

Create the learning curve object.

-
learningCurve <- createLearningCurve(population,
-                                     plpData = plpData,
-                                     modelSettings = modelSettings,
-                                     testFraction = 0.2,
-                                     verbosity = "TRACE",
-                                     trainFractions = trainFractions,
-                                     splitSeed = 1000,
-                                     saveModel = TRUE)
-

Plot the learning curve object (Figure 4). Specify one of the available metrics: AUROC, AUPRC, sBrier.

-
plotLearningCurve(
-  learningCurve,
-  metric='AUROC',
-  plotTitle = 'Learning Curve',
-  plotSubtitle = 'AUROC performance'
-)
-
-

Learning curve plot.

-
-
-
-

-Parallel processing

-

The learning curve object can be created in parallel, which can reduce computation time significantly. Currently this functionality is only available for LASSO logistic regression. Depending on the number of parallel workers it may require a significant amount of memory. We advise to use the parallelized learning curve function for parameter search and exploratory data analysis. Logging and saving functionality is unavailable.

-

Use the parallelized version of the learning curve function to create the learning curve object in parallel. R will find the number of available processing cores automatically and register the required parallel backend.

-
learningCurvePar <- createLearningCurvePar(
-  population,
-  plpData =  plpData,
-  modelSettings = modelSettings,
-  testSplit = testSplit,
-  testFraction = testFraction,
-  trainFractions = trainFractions,
-  splitSeed = 1000
-)
-
-
-

-Demo

-

We have added a demo of the learning curve:

-
# Show all demos in our package: 
- demo(package = "PatientLevelPrediction")
-
-# Run the learning curve
- demo("LearningCurveDemo", package = "PatientLevelPrediction")
-

Do note that running this demo can take a considerable amount of time (15 min on Quad core running in parallel)!

-
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

-
citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-
-
- - - -
- - - -
- -
-

Site built with pkgdown 1.5.1.

-
- -
-
- - - - - - diff --git a/docs/articles/ImplementingExistingModels.html b/docs/articles/ImplementingExistingModels.html deleted file mode 100644 index 2bb38a32e..000000000 --- a/docs/articles/ImplementingExistingModels.html +++ /dev/null @@ -1,359 +0,0 @@ - - - - - - - -Implementing Existing Prediction Models using the OHDSI PatientLevelPrediction Framework • PatientLevelPrediction - - - - - - - - - -
-
- - - -
-
- - - - - -
-

-Introduction

-

This vignette describes how you can implement existing logistic regression models in the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction framework. This allows you to, for example, externally validate them at scale in the OHDSI data network.

-

As an example we are going to implement the CHADS2 model as described in:

-

Gage BF, Waterman AD, Shannon W, Boechler M, Rich MW, Radford MJ. Validation of clinical classification schemes for predicting stroke: results from the National Registry of Atrial Fibrillation. JAMA. 2001 Jun 13;285(22):2864-70

-

To implement the model you need to create three tables: the model table, the covariate table, and the intercept table. The model table specifies the modelId (sequence number), the modelCovariateId (sequence number) and the covariateValue (beta for the covariate). The covariate table specifies the mapping between the covariates from the published model and the standard covariates, i.e. it maps to a combination of an analysisid and a concept_id (see below). The intercept table specifies per modelId the intercept.

-
-
-

-Model implementation

-
-

-Define the model

-

The CHADS2 is a score based model with:

-
##   Points                        Covariate
-## 1      1         Congestive heart failure
-## 2      1                     Hypertension
-## 3      1                  Age >= 75 years
-## 4      1                Diabetes mellitus
-## 5      2 Stroke/transient ischemic attack
-

The model table should therefore be defined as:

-
##   modelId modelCovariateId covariateValue
-## 1       1                1              1
-## 2       1                2              1
-## 3       1                3              1
-## 4       1                4              1
-## 5       1                5              2
-

The covariateTable will then specify what standard covariates need to be included in the model.

-

In this case we choose the following Standard SNOMED concept_ids: 319835 for congestive heart failure, 316866 for hypertensive disorder, 201820 for diabetes, and 381591 for cerebrovascular disease. It is allowed to add multiple concept_ids as separate rows for the same modelCovariateId if concept sets are needed. These concept_ids can be found using the vocabulary search in ATLAS.

-

The standard covariates are of the form: conceptid*1000 + analysisid. The analysisid specifies the domain of the covariate and its lookback window. Examples can be found here: https://github.com/OHDSI/FeatureExtraction/blob/master/inst/csv/PrespecAnalyses.csv

-

Our example of CHADS2 uses agegroup and conditions in the full history. Therefore we need to define the standard covariates using the FeatureExtraction::createCovariateSettings as follows:

-
library(PatientLevelPrediction)
-covSet <- FeatureExtraction::createCovariateSettings(useDemographicsAgeGroup = T,                             
-                                                     useConditionOccurrenceLongTerm = T,
-                                                     includedCovariateIds = NULL,
-                                                     longTermStartDays = -9999, 
-                                                     endDays = 0)
-

In the above code we used the useConditionOccurrenceLongTerm (these have an analysis id of 102) and we defined the longTermStartDays to be -9999 days relative to index (so we get the full history). We include the index date in our lookback period by specifying endDays = 0. The includeCovariateIds is set to NULL here, but this will be updated automatically later on. As we picked analysis id 102, the standard covariate for anytime prior congestive heart failure is 319835102. The same logic follows for the other conditions, so the covariate table will be:

-
##   modelCovariateId covariateId
-## 1                1   319835102
-## 2                2   316866102
-## 3                3       15003
-## 4                3       16003
-## 5                3       17003
-## 6                3       18003
-## 7                3       19003
-## 8                4   201820102
-## 9                5   381591102
-

modelCovariateId 3 was age >= 75; as the standard covariate age groups are in 5-year groups, we needed to add the age groups 75-80, 80-85, 85-90, 90-95 and 95-100, which correspond to the covariateIds 15003, 16003, 17003, 18003 and 19003 respectively.

-

To create the tables in R for CHADS2 you need to make the following dataframes:

-
model_table <- data.frame(modelId = c(1,1,1,1,1),
-                          modelCovariateId = 1:5, 
-                          coefficientValue = c(1, 1, 1, 1, 2)
-                          )
-
-covariate_table <- data.frame(modelCovariateId = c(1,2,3,3,3,3,3,4,5),
-                              covariateId = c(319835102, 316866102, 
-                                            15003, 16003, 17003, 18003, 19003, 
-                                            201820102, 381591102)
-                              )
-
-interceptTable <-  data.frame(modelId = 1, 
-                              interceptValue = 0)
-
-
-

-Create the model

-

Now you have everything in place to actually create the existing model. First specify the current environment as executing createExistingModelSql creates two functions for running the existing model into the specified environment. You need to specify the type of model (either logistic or score), in our example we are calculating a score. We finally need to specify the analysisId for the newly created CHADS2 covariate.

-
e <- environment()
-PatientLevelPrediction::createExistingModelSql(modelTable = model_table, 
-                       modelNames = 'CHADS2', 
-                       interceptTable = data.frame(modelId = 1, interceptValue = 0),
-                       covariateTable = covariate_table, 
-                       type = 'score',
-                       analysisId = 112, covariateSettings = covSettings, e = e)
-

Once run you will find two new functions in your environment:

-
    -
  • createExistingmodelsCovariateSettings()
  • -
  • getExistingmodelsCovariateSettings()
  • -
-
-
-

-Run the model

-

Now you can use the functions you previously created to extract the existing model risk scores for a target population:

-
plpData <- PatientLevelPrediction::getPlpData(connectionDetails, 
-                      cdmDatabaseSchema = 'databasename.dbo',
-                      cohortId = 1,
-                      outcomeIds = 2, 
-                      cohortDatabaseSchema = 'databasename.dbo', 
-                      cohortTable =  'cohort' , 
-                      outcomeDatabaseSchema = 'databasename.dbo', 
-                      outcomeTable = 'cohort', 
-                      covariateSettings =  createExistingmodelsCovariateSettings(),
-                      sampleSize = 20000
-                      )
-

To implement and evaluate an existing model you can use the function:

-

PatientLevelPrediction::evaluateExistingModel()

-

with the following parameters:

-
    -
  • modelTable - a data.frame containing the columns: modelId, covariateId and coefficientValue
  • -
  • covariateTable - a data.frame containing the columns: covariateId and standardCovariateId - this provides a set of standardCovariateId to define each model covariate.
  • -
  • interceptTable - a data.frame containing the columns modelId and interceptValue or NULL if the model doesn’t have an intercept (equal to zero).
  • -
  • type - the type of model (either: score or logistic)
  • -
  • covariateSettings - this is used to determine the startDay and endDay for the standard covariates
  • -
  • customCovariates - a data.frame with the covariateId and sql to generate the covariate value.
  • -
  • riskWindowStart - the time at risk starts at target cohort start date + riskWindowStart
  • -
  • addExposureDaysToEnd - if true then the time at risk window ends at the cohort end date + riskWindowEnd rather than cohort start date + riskWindowEnd
  • -
  • riskWindowEnd - the time at risk ends at target cohort start/end date + riskWindowStart
  • -
  • requireTimeAtRisk - whether to add a constraint to the number of days observed during the time at risk period in including people into the study
  • -
  • minTimeAtRisk - the minimum number of days observation during the time at risk a target population person needs to be included
  • -
  • includeAllOutcomes - Include outcomes even if they do not satisfy the minTimeAtRisk? (useful if the outcome is associated to death or rare)
  • -
  • removeSubjectsWithPriorOutcome - remove target population people who have the outcome prior to the time at risk period?
  • -
  • connectionDetails - the connection to the CDM database
  • -
-

Finally you need to add the settings for downloading the new data:

-
    -
  • cdmDatabaseSchema
  • -
  • cohortDatabaseSchema
  • -
  • cohortTable
  • -
  • cohortId
  • -
  • outcomeDatabaseSchema
  • -
  • outcomeTable
  • -
  • outcomeId
  • -
  • oracleTempSchema
  • -
-

To run the external validation of an existing model where the target population are those in the cohort table with id 1 and the outcome is those in the cohort table with id 2 and we are looking to predict first time occurrence of the outcome 1 day to 365 days after the target cohort start date (assuming you have the modelTable, covariateTable and interceptTable in the format explained above):

-
# in our example the existing model uses gender and condition groups looking back 200 days:
-covSet <- FeatureExtraction::createCovariateSettings(useDemographicsGender = T,
-                                                     useConditionGroupEraMediumTerm = T, 
-                                                     mediumTermStartDays = -200)
-
-result <- evaluateExistingModel(modelTable = modelTable,
-                                covariateTable = covariateTable,
-                                interceptTable = NULL,
-                                type = 'score', 
-                                covariateSettings =  covSet,
-                                riskWindowStart = 1, 
-                                addExposureDaysToEnd = F, 
-                                riskWindowEnd = 365, 
-                                requireTimeAtRisk = T, 
-                                minTimeAtRisk = 364, 
-                                includeAllOutcomes = T, 
-                                removeSubjectsWithPriorOutcome = T, 
-                                connectionDetails = connectionDetails, 
-                                cdmDatabaseSchema = 'databasename.dbo',
-                                cohortId = 1,
-                                outcomeId = 2, 
-                                cohortDatabaseSchema = 'databasename.dbo', 
-                                cohortTable =  'cohort' , 
-                                outcomeDatabaseSchema = 'databasename.dbo', 
-                                outcomeTable = 'cohort'
-                      )
-

Result will contain the performance and the predictions made by the model.

-
-
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

-
citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018).
-## "Design and implementation of a standardized framework to generate
-## and evaluate patient-level prediction models using observational
-## healthcare data." _Journal of the American Medical Informatics
-## Association_, *25*(8), 969-975. <URL:
-## https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

-
-
- - - -
- - -
- -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - diff --git a/docs/articles/InstallationGuide.html b/docs/articles/InstallationGuide.html index 29f442285..054ce33de 100644 --- a/docs/articles/InstallationGuide.html +++ b/docs/articles/InstallationGuide.html @@ -19,6 +19,8 @@ + +
+
@@ -146,142 +161,160 @@

2022-03-09

-
-

-Introduction

-

This vignette describes how you need to install the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package under Windows, Mac, and Linux.

+
+

Introduction +

+

This vignette describes how you need to install the Observational +Health Data Science and Informatics (OHDSI) PatientLevelPrediction +package under Windows, Mac, and Linux.

-
-

-Software Prerequisites

-
-

-Windows Users

-

Under Windows the OHDSI Patient Level Prediction (PLP) package requires installing:

+
+

Software Prerequisites +

+
+

Windows Users +

+

Under Windows the OHDSI Patient Level Prediction (PLP) package +requires installing:

-
-

-Mac/Linux Users

-

Under Mac and Linux the OHDSI Patient Level Prediction (PLP) package requires installing:

+
+

Mac/Linux Users +

+

Under Mac and Linux the OHDSI Patient Level Prediction (PLP) package +requires installing:

-
-

-Installing the Package

-

The preferred way to install the package is by using remotes, which will automatically install the latest release and all the latest dependencies.

-

If you do not want the official release you could install the bleeding edge version of the package (latest develop branch).

-

Note that the latest develop branch could contain bugs, please report them to us if you experience problems.

-
-

-Installing PatientLevelPrediction using remotes

+
+

Installing the Package +

+

The preferred way to install the package is by using +remotes, which will automatically install the latest +release and all the latest dependencies.

+

If you do not want the official release you could install the +bleeding edge version of the package (latest develop branch).

+

Note that the latest develop branch could contain bugs, please report +them to us if you experience problems.

+
+

Installing PatientLevelPrediction using remotes +

To install using remotes run:

-install.packages("remotes")
-remotes::install_github("OHDSI/FeatureExtraction")
-remotes::install_github("OHDSI/PatientLevelPrediction")
-

When installing make sure to close any other Rstudio sessions that are using PatientLevelPrediction or any dependency. Keeping Rstudio sessions open can cause locks that prevent the package installing.

+install.packages("remotes") +remotes::install_github("OHDSI/PatientLevelPrediction")
+

When installing make sure to close any other Rstudio sessions that +are using PatientLevelPrediction or any dependency. Keeping +Rstudio sessions open can cause locks that prevent the package +installing.

-
-

-Creating Python Reticulate Environment

-

Many of the classifiers in the PatientLevelPrediction use a Python back end. To set up a python environment run:

+
+

Creating Python Reticulate Environment +

+

Many of the classifiers in the PatientLevelPrediction +use a Python backend. To set up a python environment run:

-library(PatientLevelPrediction)
-reticulate::install_miniconda()
-configurePython(envname='r-reticulate', envtype='conda')
-

Some of the less frequently used classifiers are not installed during this set-up to add them run:

-

For GBM survival:

-
-reticulate::conda_install(envname='r-reticulate', packages = c('scikit-survival'), forge = TRUE, pip = FALSE, pip_ignore_installed = TRUE, conda = "auto", channel = 'sebp')
-
-
-

-Testing installation

-

To test whether the package is installed correctly, using the test script in ‘/extras’, run:

-
-# load the checkPlpInstallation function
-library(devtools)
-source_url('https://raw.github.com/OHDSI/PatientLevelPrediction/issue242/extras/checkPlpInstallation.R')
-
-# set up the database connection details
-library(DatabaseConnector)
-connectionDetails <- createConnectionDetails(
-  dbms = 'sql_server', 
-  user = 'username', 
-  password = 'hidden', 
-  server = 'your server', 
-  port = 'your port'
-  )
-
-# run the test
-checkPlpInstallation(
-  connectionDetails = connectionDetails, 
-  python = T
-  )
-

To test the installation (excluding python) run:

-
-checkPlpInstallation(
-  connectionDetails = connectionDetails, 
-  python = F
-  )
-

The check can take a while to run since it will build the following models in sequence on simulated data: Logistic Regression, RandomForest, MLP, AdaBoost, Decision Tree, Naive Bayes, KNN, Gradient Boosting. Moreover, it will test the database connection.

+library(PatientLevelPrediction) +reticulate::install_miniconda() +configurePython(envname='r-reticulate', envtype='conda')
-
-

-Installation issues

-

Installation issues need to be posted in our issue tracker: http://github.com/OHDSI/PatientLevelPrediction/issues

+
+

Installation issues +

+

Installation issues need to be posted in our issue tracker: http://github.com/OHDSI/PatientLevelPrediction/issues

The list below provides solutions for some common issues:

    -
  1. If you have an error when trying to install a package in R saying ‘Dependancy X not available …’ then this can sometimes be fixed by running install.packages('X') and then once that completes trying to reinstall the package that had the error.

  2. -
  3. I have found that using the github `remotes`` to install packages can be impacted if you have multiple R sessions open as one session with a library open can cause the library to be locked and this can prevent an install of a package that depends on that library.

  4. +
  5. If you have an error when trying to install a package in R saying +‘Dependancy X not available …’ then this can sometimes +be fixed by running install.packages('X') and then once +that completes trying to reinstall the package that had the +error.

  6. +
  7. I have found that using the github remotes to +install packages can be impacted if you have multiple R +sessions open as one session with a library open can cause the +library to be locked and this can prevent an install of a package that +depends on that library.

+
+

Common issues +

+
+

python environment Mac/linux users: +

+

to make sure R uses the r-reticulate python environment you may need +to edit your .Rprofile with the location of the python binary for the +PLP environment. Edit the .Rprofile by running:

+
+usethis::edit_r_profile()
+

and add

+
+Sys.setenv(PATH = paste("your python bin location", Sys.getenv("PATH"), sep=":"))
+

to the file then save. Where your python bin location is the location +returned by

+
+reticulate::conda_list() 
+

e.g., My PLP virtual environment location was +/anaconda3/envs/PLP/bin/python so I added:
+Sys.setenv(PATH = paste(“/anaconda3/envs/PLP/bin”, Sys.getenv(“PATH”), +sep=“:”))

+
-
-

-Acknowledgments

-

Considerable work has been dedicated to provide the PatientLevelPrediction package.

+
+
+

Acknowledgments +

+

Considerable work has been dedicated to provide the +PatientLevelPrediction package.

-citation("PatientLevelPrediction")
-
## 
-## To cite PatientLevelPrediction in publications use:
-## 
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
-## and implementation of a standardized framework to generate and evaluate
-## patient-level prediction models using observational healthcare data."
-## _Journal of the American Medical Informatics Association_, *25*(8),
-## 969-975. <URL: https://doi.org/10.1093/jamia/ocy032>.
-## 
-## A BibTeX entry for LaTeX users is
-## 
-##   @Article{,
-##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-##     journal = {Journal of the American Medical Informatics Association},
-##     volume = {25},
-##     number = {8},
-##     pages = {969-975},
-##     year = {2018},
-##     url = {https://doi.org/10.1093/jamia/ocy032},
-##   }
-

Please reference this paper if you use the PLP Package in your work:

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

-

This work is supported in part through the National Science Foundation grant IIS 1251151.

+citation("PatientLevelPrediction")
+
## 
+## To cite PatientLevelPrediction in publications use:
+## 
+##   Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design
+##   and implementation of a standardized framework to generate and
+##   evaluate patient-level prediction models using observational
+##   healthcare data." _Journal of the American Medical Informatics
+##   Association_, *25*(8), 969-975.
+##   <https://doi.org/10.1093/jamia/ocy032>.
+## 
+## A BibTeX entry for LaTeX users is
+## 
+##   @Article{,
+##     author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
+##     title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
+##     journal = {Journal of the American Medical Informatics Association},
+##     volume = {25},
+##     number = {8},
+##     pages = {969-975},
+##     year = {2018},
+##     url = {https://doi.org/10.1093/jamia/ocy032},
+##   }
+

Please reference this paper if you use the PLP Package in +your work:

+

Reps JM, Schuemie +MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a +standardized framework to generate and evaluate patient-level prediction +models using observational healthcare data. J Am Med Inform Assoc. +2018;25(8):969-975.

+

This work is supported in part through the National Science +Foundation grant IIS 1251151.

@@ -296,11 +329,13 @@

-

Site built with pkgdown 1.6.1.

+

+

Site built with pkgdown 2.0.7.

@@ -309,5 +344,7 @@

+ + diff --git a/docs/articles/InstallationGuide_files/header-attrs-2.11/header-attrs.js b/docs/articles/InstallationGuide_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/InstallationGuide_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/InstallationGuide_files/header-attrs-2.7/header-attrs.js b/docs/articles/InstallationGuide_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/InstallationGuide_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/PatientLevelPrediction.html b/docs/articles/PatientLevelPrediction.html deleted file mode 100644 index 1b8d82727..000000000 --- a/docs/articles/PatientLevelPrediction.html +++ /dev/null @@ -1,301 +0,0 @@ - - - - - - - -Quick Install Guide • PatientLevelPrediction - - - - - - - - - - -
-
- - - - -
-
- - - - - -
-

-Quick Install Guide

-
-
-

-Intalling the R package

-

The preferred way to install the package is by using drat, which will automatically install the latest release and all the latest dependencies. If the drat code fails or you do not want the official release you could use devtools to install the bleading edge version of the package (latest master). Note that the latest master could contain bugs, please report them to us if you experience problems.

-

To install using drat run:

-
-install.packages("drat")
-drat::addRepo("OHDSI")
-install.packages("PatientLevelPrediction")
-

To install using devtools run:

-
-install.packages('devtools')
-devtools::install_github("OHDSI/FeatureExtraction")
-devtools::install_github('ohdsi/PatientLevelPrediction')
-

When installing using devtools make sure to close any other Rstudio sessions that are using PatientLevelPrediction or any dependency. Keeping Rstudio sessions open can cause locks that prevent the package installing.

-
-
-

-Setting up Python

-

Many of the classifiers in PatientLevelPrediction use python. To use the python classifiers you need to install and set up the a python environment in R. We used the reticulate package:

-
-library(PatientLevelPrediction)
-reticulate::install_miniconda()
-configurePython(envname='r-reticulate', envtype='conda')
-

To add the R keras interface, in Rstudio run:

-
-devtools::install_github("rstudio/keras")
-library(keras)
-install_keras()
-

Some of the less frequently used classifiers are considered optional and are not installed by default. To install then, run:

-

For GBM survival:

-
-reticulate::conda_install(envname='r-reticulate', packages = c('scikit-survival'), forge = TRUE, pip = FALSE, pip_ignore_installed = TRUE, conda = "auto", channel = 'sebp')
-

For any of the torch models:

-
-reticulate::conda_install(envname='r-reticulate', packages = c('pytorch', 'torchvision', 'cpuonly'), forge = TRUE, pip = FALSE, channel = 'pytorch', pip_ignore_installed = TRUE, conda = 'auto')
-
-
-

-Testing the PatientLevelPrediction Installation

-

To test whether the package is installed correctly run:

-
-library(PatientLevelPrediction)
-library(DatabaseConnector)
-connectionDetails <- createConnectionDetails(dbms = 'sql_server', 
-                                             user = 'username', 
-                                             password = 'hidden', 
-                                             server = 'your server', 
-                                             port = 'your port')
-PatientLevelPrediction::checkPlpInstallation(connectionDetails = connectionDetails, 
-                                             python = T)
-

To test the installation (excluding python) run:

-
-library(PatientLevelPrediction)
-library(DatabaseConnector)
-connectionDetails <- createConnectionDetails(dbms = 'sql_server', 
-                                           user = 'username', 
-                                           password = 'hidden', 
-                                           server = 'your server', 
-                                           port = 'your port')
-PatientLevelPrediction::checkPlpInstallation(connectionDetails = connectionDetails, 
-                                             python = F)
-

The check can take a while to run since it will build the following models in sequence on simulated data: Logistic Regression, RandomForest, MLP, AdaBoost, Decision Tree, Naive Bayes, KNN, Gradient Boosting. Moreover, it will test the database connection.

-
-
-
-

-Common issues

-
-

-python environment Mac/linux users:

-

to make sure R uses the r-reticulate python environment you may need to edit your .Rprofile with the location of the python binary for the PLP environment. Edit the .Rprofile by running:

-
-usethis::edit_r_profile()
-

and add

-
-Sys.setenv(PATH = paste("your python bin location", Sys.getenv("PATH"), sep=":"))
-

to the file then save. Where your python bin location is the location returned by

-
-reticulate::conda_list() 
-

e.g., My PLP virtual environment location was /anaconda3/envs/PLP/bin/python so I added:
-Sys.setenv(PATH = paste(“/anaconda3/envs/PLP/bin”, Sys.getenv(“PATH”), sep=“:”))

-
-
-
-

-Old Instructions

-
-

-To configure python via anaconda

-
    -
  • Close your RStudio
  • -
  • Install python 3.7 using anaconda (https://www.anaconda.com/download) [make sure you pick the correct operating system] and note the installation location. Anaconda should update you path variable with the python binary.
  • -
  • Open a new Rstudio and check whether your python is configured correctly by running:
  • -
-
-system("python --version") 
-

If set up correctly you should see “Python 3.x.x :: Anaconda, Inc.” returned.

-
    -
  • If not set up correctly then: -
      -
    • Windows users: make sure your anaconda python binary is in the System PATH environmental variable: go to my computer -> system properties -> advanced system settings Then at the bottom right you’ll see a button: Environmental Variables, clicking on that will enable you to edit the PATH variable. Add the following Anaconda locations to your path: D:\Anaconda3;D:\Anaconda3\Scripts;D:\Anaconda3\Library\bin (this assumes your installation was done to D:\Anaconda3).
    • -
    • Mac/Linux users: edit the bash profile to add python in the Path by running in the terminal: touch ~/.bash_profile; open ~/.bash_profile; and adding in the location of python in the PATH variable. Unfortunately, you also need to make an edit to the .Rprofile for R to get the correct PATH. To do this open the .Rprofile by running:
    • -
    -
  • -
-
-  usethis::edit_r_profile()
-

and in this file add

-
-Sys.setenv(PATH = paste("your python bin location", Sys.getenv("PATH"), sep=":"))
-
    -
  • After editing your Path or .Rprofile open a new Rstudio session and test that python is correctly set up via
  • -
-
-system("python --version")
-
-
-
- - - -
- - - -
- -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - diff --git a/docs/articles/PatientLevelPrediction_files/header-attrs-2.11/header-attrs.js b/docs/articles/PatientLevelPrediction_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/PatientLevelPrediction_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/PatientLevelPrediction_files/header-attrs-2.7/header-attrs.js b/docs/articles/PatientLevelPrediction_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/PatientLevelPrediction_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/PlottingLearningCurves.html b/docs/articles/PlottingLearningCurves.html deleted file mode 100644 index e6cf4939c..000000000 --- a/docs/articles/PlottingLearningCurves.html +++ /dev/null @@ -1,276 +0,0 @@ - - - - - - - -Plotting learning curves • PatientLevelPrediction - - - - - - - - - -
-
- - - -
-
- - - - - -
-

-Introduction

-

Prediction models will show overly-optimistic performance when predicting on the same data as used for training. Therefore, we generally partition our data into a training set and testing set. We then train our prediction model on the training set portion and asses its ability to generalize to unseen data by measuring its performance on the testing set.

-

Learning curves inform about the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem. Learning curves objects can be created and plotted with the PatientLevelPrediction package.

-
-

-Background

-

Figure 1 shows a commonly observed learning curve plot, where model performance is mapped to the vertical axis and training set size is mapped to the horizontal axis. If training set size is small, the performance on the training set is high, because a model can generally be fitted well to a limited number of training examples. At the same time, the performance on the testing set will be poor, because the model trained on such a limited number of training examples will not generalize well to unseen data in the testing set. As the training set size increases, the performance of the model on the training set will decrease. It becomes more difficult for the model to find a good fit through all the training examples. Also, the model will be trained on a more representative portion of training examples, making it generalize better to unseen data. This can be observed by the testing set performance increasing.

-
-Figure 1. Learning curve plot with model performance mapped to the vertical axis and training set size mapped to the horizontal axis.

Figure 1. Learning curve plot with model performance mapped to the vertical axis and training set size mapped to the horizontal axis.

-
-
-
-

-Bias and variance

-

We can observe high variance (overfitting) in a prediction model if it performs well on the training set, but poorly on the testing set (Figure 2). Adding additional data is a common approach to counteract high variance. From the learning curve it becomes apparent, that adding additional data may improve performance on the testing set a little further, as the learning curve has not yet plateaued and, thus, the model is not saturated yet. Therefore, adding more data will decrease the gap between training set and testing set, which is the main indicator for a high variance problem.

-
-Figure 2. Prediction model suffering from high variance.

Figure 2. Prediction model suffering from high variance.

-
-

Furthermore, we can observe high bias (underfitting) if a prediction model performs poorly on the training set as well as on the testing set (Figure 3). The learning curves of training set and testing set have flattened on a low performance with only a small gap in between them. Adding additional data will in this case have little to no impact on the model performance. Choosing another prediction algorithm that can find more complex (potentiallly non-linear) relationships in the data may be an alternative approach to consider.

-
-Figure 3. Prediction model suffering from high bias.

Figure 3. Prediction model suffering from high bias.

-
-
-
-
-

-Usage

-

Use the OHDSI tool ecosystem to generate a population and plpData object. Alternatively, you can make use of the data simulator. The following code snippet creates a population of 12000 patients.

-
set.seed(1234)
-data(plpDataSimulationProfile)
-sampleSize <- 12000
-plpData <- simulatePlpData(
-  plpDataSimulationProfile,
-  n = sampleSize
-)
-
-population <- createStudyPopulation(
-  plpData,
-  outcomeId = 2,
-  binary = TRUE,
-  firstExposureOnly = FALSE,
-  washoutPeriod = 0,
-  removeSubjectsWithPriorOutcome = FALSE,
-  priorOutcomeLookback = 99999,
-  requireTimeAtRisk = FALSE,
-  minTimeAtRisk = 0,
-  riskWindowStart = 0,
-  addExposureDaysToStart = FALSE,
-  riskWindowEnd = 365,
-  addExposureDaysToEnd = FALSE,
-  verbosity = futile.logger::INFO
-)
-

Specify the prediction algorithm to be used.

-
# Use LASSO logistic regression
-modelSettings <- setLassoLogisticRegression()
-

Specify a test fraction and a sequence of training set fractions.

-
testFraction <- 0.2
-trainFractions <- seq(0.1, 0.8, 0.1)
-

Specify the test split to be used.

-
# Use a split by person, alterantively a time split is possible
-testSplit <- 'person'
-

Create the learning curve object.

-
learningCurve <- createLearningCurve(
-  population,
-  plpData = plpData,
-  modelSettings = modelSettings,
-  testFraction = testFraction,
-  trainFractions = trainFractions,
-  splitSeed = 1000,
-  saveModel = FALSE,
-  timeStamp = FALSE
-)
-

Plot the learning curve object (Figure 4). Specify one of the available metrics: AUROC, AUPRC, sBrier.

-
plotLearningCurve(
-  learningCurve,
-  metric='AUROC',
-  plotTitle = 'Learning Curve',
-  plotSubtitle = 'AUROC performance'
-)
-
-Figure 4. Learning curve plot.

Figure 4. Learning curve plot.

-
-
-

-Parallel processing

-

The learning curve object can be created in parallel, which can reduce computation time significantly. Currently this functionality is only available for LASSO logistic regression. Depending on the number of parallel workers it may require a significant amount of memory. We advise to use the parallelized learning curve function for parameter search and exploratory data analysis. Logging and saving functionality is unavailable.

-

Use the parallelized version of the learning curve function to create the learning curve object in parallel. R will find the number of available processing cores automatically and register the required parallel backend.

-
learningCurvePar <- createLearningCurvePar(
-  population,
-  plpData =  plpData,
-  modelSettings = modelSettings,
-  testSplit = testSplit,
-  testFraction = testFraction,
-  trainFractions = trainFractions,
-  splitSeed = 1000
-)
-
-
-
- - - -
- - -
- -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - diff --git a/docs/articles/Videos.html b/docs/articles/Videos.html index 8f0d7b5f9..7277e6d0f 100644 --- a/docs/articles/Videos.html +++ b/docs/articles/Videos.html @@ -19,6 +19,8 @@ + +
+
@@ -146,183 +161,123 @@

2022-03-09

-
-

-What is a cohort table?

- - - - - - - - - -
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -Learn what a cohort table looks like and what columns are required. -
-
-
-

-Setting up a connection between your database and R

- - - - - - - - - -
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -Learn how to configure the connection to your OMOP CDM data from R using the OHDSI DatabaseConnector package. -
-
-
-

-Running a single PatientLevelPrediction model

+
+

What is a cohort table? +

- - - - - - - - ++++ + + + + + + + +
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -Learn how to develop and validate a single PatientLevelPrediction model. -
Click To LaunchDescription of Demo
Video Vignette PLP PackageLearn what a cohort table looks like and what columns are +required.
-
-

-Running multiple PatientLevelPrediction models study

+
+

Setting up a connection between your database and R +

- - - - - - - - ++++ + + + + + + + +
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -Learn how to develop and validate multiple PatientLevelPrediction models. -
Click To LaunchDescription of Demo
Video Vignette PLP PackageLearn how to configure the connection to your OMOP CDM data from R +using the OHDSI DatabaseConnector package.
-
-

-Designing a study in Atlas

+
+

Running a single PatientLevelPrediction model +

- - - - - - - - ++++ + + + + + + + +
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -Learn how to design a multiple or single PatientLevelPrediction study using Atlas. Atlas creates an R package that just needs to be built and then you’re on your way to developing multiple models! -
Click To LaunchDescription of Demo
Video Vignette PLP PackageLearn how to develop and validate a single PatientLevelPrediction +model.
-
-

-Building and running the Atlas study

+
+

Running multiple PatientLevelPrediction models study +

- - - - - - - - ++++ + + + + + + + +
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -Learn how to build the R package generated by Atlas and how to then run the study. -
Click To LaunchDescription of Demo
Video Vignette PLP PackageLearn how to develop and validate multiple PatientLevelPrediction +models.
-
-

-Exploring the results in the shiny app

+
+

Exploring the results in the shiny app +

- - - - - - - - ++++ + + + + + + + +
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -Learn how to interactively explore the model performance and model via the shiny apps viewPlp() and viewMultiplePlp() -
Click To LaunchDescription of Demo
Video Vignette PLP PackageLearn how to interactively explore the model performance and model +via the shiny apps viewPlp() and viewMultiplePlp()
-
-

-Validating existing models on OMOP CDM data

+
+

Validating existing models on OMOP CDM data +

- - - - - - - - ++++ + + + + + + + +
-Click To Launch - -Description of Demo -
-Video Vignette PLP Package - -This demo shows how you can add any existing score or logistic model and valdiate the model on new OMOP CDM data. This is useful for benchmarking when developing new models or to perform extensive external validation of a model across the OHDSI network. -
Click To LaunchDescription of Demo
Video Vignette PLP PackageThis demo shows how you can add any existing score or logistic model +and validate the model on new OMOP CDM data. This is useful for +benchmarking when developing new models or to perform extensive external +validation of a model across the OHDSI network.
@@ -338,11 +293,13 @@

-

Site built with pkgdown 1.6.1.

+

+

Site built with pkgdown 2.0.7.

@@ -351,5 +308,7 @@

+ + diff --git a/docs/articles/Videos_files/header-attrs-2.11/header-attrs.js b/docs/articles/Videos_files/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/Videos_files/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/Videos_files/header-attrs-2.7/header-attrs.js b/docs/articles/Videos_files/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92e0..000000000 --- a/docs/articles/Videos_files/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). 
-document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/arch1.png b/docs/articles/arch1.png deleted file mode 100644 index e4846e56f..000000000 Binary files a/docs/articles/arch1.png and /dev/null differ diff --git a/docs/articles/atlasdownload1.png b/docs/articles/atlasdownload1.png deleted file mode 100644 index ef6559fa9..000000000 Binary files a/docs/articles/atlasdownload1.png and /dev/null differ diff --git a/docs/articles/atlasdownload1.webp b/docs/articles/atlasdownload1.webp new file mode 100644 index 000000000..6cac340ed Binary files /dev/null and b/docs/articles/atlasdownload1.webp differ diff --git a/docs/articles/atlasdownload2.png b/docs/articles/atlasdownload2.png deleted file mode 100644 index 619f8c799..000000000 Binary files a/docs/articles/atlasdownload2.png and /dev/null differ diff --git a/docs/articles/atlasdownload2.webp b/docs/articles/atlasdownload2.webp new file mode 100644 index 000000000..452c5ca21 Binary files /dev/null and b/docs/articles/atlasdownload2.webp differ diff --git a/docs/articles/atlasplp1.png b/docs/articles/atlasplp1.png deleted file mode 100644 index 4b21b2143..000000000 Binary files a/docs/articles/atlasplp1.png and /dev/null differ diff --git a/docs/articles/atlasplp1.webp b/docs/articles/atlasplp1.webp new file mode 100644 index 000000000..71a3c1ce9 Binary files /dev/null and b/docs/articles/atlasplp1.webp differ diff --git a/docs/articles/atlasplp2.png b/docs/articles/atlasplp2.png deleted file mode 100644 index 6bc7b93ad..000000000 Binary files a/docs/articles/atlasplp2.png and /dev/null differ diff --git a/docs/articles/atlasplp3.png b/docs/articles/atlasplp3.png deleted file mode 
100644 index 0911b31ea..000000000 Binary files a/docs/articles/atlasplp3.png and /dev/null differ diff --git a/docs/articles/atlasplp3.webp b/docs/articles/atlasplp3.webp new file mode 100644 index 000000000..523d0143c Binary files /dev/null and b/docs/articles/atlasplp3.webp differ diff --git a/docs/articles/atlasplp4.png b/docs/articles/atlasplp4.png deleted file mode 100644 index b5db1b153..000000000 Binary files a/docs/articles/atlasplp4.png and /dev/null differ diff --git a/docs/articles/cirenn.png b/docs/articles/cirenn.png deleted file mode 100644 index f4e8ed054..000000000 Binary files a/docs/articles/cirenn.png and /dev/null differ diff --git a/docs/articles/cnn_lstm.png b/docs/articles/cnn_lstm.png deleted file mode 100644 index a16e1417d..000000000 Binary files a/docs/articles/cnn_lstm.png and /dev/null differ diff --git a/docs/articles/cnn_mlf2.png b/docs/articles/cnn_mlf2.png deleted file mode 100644 index 2b69c159b..000000000 Binary files a/docs/articles/cnn_mlf2.png and /dev/null differ diff --git a/docs/articles/conv_arch1.png b/docs/articles/conv_arch1.png deleted file mode 100644 index 5970b3f1c..000000000 Binary files a/docs/articles/conv_arch1.png and /dev/null differ diff --git a/docs/articles/conv_arch2.png b/docs/articles/conv_arch2.png deleted file mode 100644 index a51ccf08e..000000000 Binary files a/docs/articles/conv_arch2.png and /dev/null differ diff --git a/docs/articles/covcnn.png b/docs/articles/covcnn.png deleted file mode 100644 index 82dd2832f..000000000 Binary files a/docs/articles/covcnn.png and /dev/null differ diff --git a/docs/articles/covcnn2.png b/docs/articles/covcnn2.png deleted file mode 100644 index 0734a49eb..000000000 Binary files a/docs/articles/covcnn2.png and /dev/null differ diff --git a/docs/articles/demographicSummary.png b/docs/articles/demographicSummary.png deleted file mode 100644 index 8ceafbee8..000000000 Binary files a/docs/articles/demographicSummary.png and /dev/null differ diff --git 
a/docs/articles/demographicSummary.webp b/docs/articles/demographicSummary.webp new file mode 100644 index 000000000..7d0437deb Binary files /dev/null and b/docs/articles/demographicSummary.webp differ diff --git a/docs/articles/ensemble.png b/docs/articles/ensemble.png deleted file mode 100644 index 6e2173a48..000000000 Binary files a/docs/articles/ensemble.png and /dev/null differ diff --git a/docs/articles/example1/ATLAS_O.png b/docs/articles/example1/ATLAS_O.png deleted file mode 100644 index 3cda2abf7..000000000 Binary files a/docs/articles/example1/ATLAS_O.png and /dev/null differ diff --git a/docs/articles/example1/ATLAS_O.webp b/docs/articles/example1/ATLAS_O.webp new file mode 100644 index 000000000..85e63dc9e Binary files /dev/null and b/docs/articles/example1/ATLAS_O.webp differ diff --git a/docs/articles/example1/ATLAS_T.png b/docs/articles/example1/ATLAS_T.png deleted file mode 100644 index 8be57dc9e..000000000 Binary files a/docs/articles/example1/ATLAS_T.png and /dev/null differ diff --git a/docs/articles/example1/ATLAS_T.webp b/docs/articles/example1/ATLAS_T.webp new file mode 100644 index 000000000..df3a8245f Binary files /dev/null and b/docs/articles/example1/ATLAS_T.webp differ diff --git a/docs/articles/example2/aceinhibitors.png b/docs/articles/example2/aceinhibitors.png deleted file mode 100644 index ce5148f1d..000000000 Binary files a/docs/articles/example2/aceinhibitors.png and /dev/null differ diff --git a/docs/articles/example2/aceinhibitors.webp b/docs/articles/example2/aceinhibitors.webp new file mode 100644 index 000000000..564f8af77 Binary files /dev/null and b/docs/articles/example2/aceinhibitors.webp differ diff --git a/docs/articles/example2/angioedema.png b/docs/articles/example2/angioedema.png deleted file mode 100644 index 3adc8dcc9..000000000 Binary files a/docs/articles/example2/angioedema.png and /dev/null differ diff --git a/docs/articles/example2/angioedema.webp b/docs/articles/example2/angioedema.webp new file mode 100644 
index 000000000..8c728ce50 Binary files /dev/null and b/docs/articles/example2/angioedema.webp differ diff --git a/docs/articles/generalizability.png b/docs/articles/generalizability.png deleted file mode 100644 index b476ea71f..000000000 Binary files a/docs/articles/generalizability.png and /dev/null differ diff --git a/docs/articles/generalizability.webp b/docs/articles/generalizability.webp new file mode 100644 index 000000000..ba6d14de4 Binary files /dev/null and b/docs/articles/generalizability.webp differ diff --git a/docs/articles/index.html b/docs/articles/index.html index f48ef229e..631931e62 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -1,66 +1,12 @@ - - - - - - - -Articles • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Articles • PatientLevelPrediction - + + - - - -
-
- -
- - -
- +
- - + + diff --git a/docs/articles/lstm_last.png b/docs/articles/lstm_last.png deleted file mode 100644 index 3e6fc16e5..000000000 Binary files a/docs/articles/lstm_last.png and /dev/null differ diff --git a/docs/articles/popdef1.png b/docs/articles/popdef1.png deleted file mode 100644 index 3d654fe7d..000000000 Binary files a/docs/articles/popdef1.png and /dev/null differ diff --git a/docs/articles/popdef1.webp b/docs/articles/popdef1.webp new file mode 100644 index 000000000..83ef7afd6 Binary files /dev/null and b/docs/articles/popdef1.webp differ diff --git a/docs/articles/popdef2.png b/docs/articles/popdef2.png deleted file mode 100644 index a596e188d..000000000 Binary files a/docs/articles/popdef2.png and /dev/null differ diff --git a/docs/articles/popdef2.webp b/docs/articles/popdef2.webp new file mode 100644 index 000000000..31887dd1b Binary files /dev/null and b/docs/articles/popdef2.webp differ diff --git a/docs/articles/popdef3.png b/docs/articles/popdef3.png deleted file mode 100644 index 34527ef9f..000000000 Binary files a/docs/articles/popdef3.png and /dev/null differ diff --git a/docs/articles/popdef3.webp b/docs/articles/popdef3.webp new file mode 100644 index 000000000..8b409ed49 Binary files /dev/null and b/docs/articles/popdef3.webp differ diff --git a/docs/articles/popdef4.png b/docs/articles/popdef4.png deleted file mode 100644 index 35d4949a5..000000000 Binary files a/docs/articles/popdef4.png and /dev/null differ diff --git a/docs/articles/popdef4.webp b/docs/articles/popdef4.webp new file mode 100644 index 000000000..2709497e7 Binary files /dev/null and b/docs/articles/popdef4.webp differ diff --git a/docs/articles/popdef5.png b/docs/articles/popdef5.png deleted file mode 100644 index f6315b8a8..000000000 Binary files a/docs/articles/popdef5.png and /dev/null differ diff --git a/docs/articles/popdef5.webp b/docs/articles/popdef5.webp new file mode 100644 index 000000000..748b8901b Binary files /dev/null and b/docs/articles/popdef5.webp differ 
diff --git a/docs/articles/popdef6.png b/docs/articles/popdef6.png deleted file mode 100644 index 96a8abd1f..000000000 Binary files a/docs/articles/popdef6.png and /dev/null differ diff --git a/docs/articles/popdef6.webp b/docs/articles/popdef6.webp new file mode 100644 index 000000000..583dc9fba Binary files /dev/null and b/docs/articles/popdef6.webp differ diff --git a/docs/articles/precisionRecall.png b/docs/articles/precisionRecall.png deleted file mode 100644 index 1f1d0f154..000000000 Binary files a/docs/articles/precisionRecall.png and /dev/null differ diff --git a/docs/articles/precisionRecall.webp b/docs/articles/precisionRecall.webp new file mode 100644 index 000000000..af6b0cfe5 Binary files /dev/null and b/docs/articles/precisionRecall.webp differ diff --git a/docs/articles/predictionDistribution.png b/docs/articles/predictionDistribution.png deleted file mode 100644 index 87bafc361..000000000 Binary files a/docs/articles/predictionDistribution.png and /dev/null differ diff --git a/docs/articles/preferencePDF.png b/docs/articles/preferencePDF.png deleted file mode 100644 index 3b3528452..000000000 Binary files a/docs/articles/preferencePDF.png and /dev/null differ diff --git a/docs/articles/preferencePDF.webp b/docs/articles/preferencePDF.webp new file mode 100644 index 000000000..189a356be Binary files /dev/null and b/docs/articles/preferencePDF.webp differ diff --git a/docs/articles/problems.png b/docs/articles/problems.png deleted file mode 100644 index 931efa6d6..000000000 Binary files a/docs/articles/problems.png and /dev/null differ diff --git a/docs/articles/problems.webp b/docs/articles/problems.webp new file mode 100644 index 000000000..5c1c27bb4 Binary files /dev/null and b/docs/articles/problems.webp differ diff --git a/docs/articles/shinyroc.png b/docs/articles/shinyroc.png deleted file mode 100644 index 579fab31f..000000000 Binary files a/docs/articles/shinyroc.png and /dev/null differ diff --git a/docs/articles/shinyroc.webp 
b/docs/articles/shinyroc.webp new file mode 100644 index 000000000..a11724623 Binary files /dev/null and b/docs/articles/shinyroc.webp differ diff --git a/docs/articles/shinysummary.png b/docs/articles/shinysummary.png deleted file mode 100644 index 75cec2430..000000000 Binary files a/docs/articles/shinysummary.png and /dev/null differ diff --git a/docs/articles/shinysummary.webp b/docs/articles/shinysummary.webp new file mode 100644 index 000000000..0d256ade1 Binary files /dev/null and b/docs/articles/shinysummary.webp differ diff --git a/docs/articles/sparseCalibration.png b/docs/articles/sparseCalibration.png deleted file mode 100644 index d6e34c0cf..000000000 Binary files a/docs/articles/sparseCalibration.png and /dev/null differ diff --git a/docs/articles/sparseCalibration.webp b/docs/articles/sparseCalibration.webp new file mode 100644 index 000000000..043019e5b Binary files /dev/null and b/docs/articles/sparseCalibration.webp differ diff --git a/docs/articles/sparseRoc.png b/docs/articles/sparseRoc.png deleted file mode 100644 index 8a4b13cec..000000000 Binary files a/docs/articles/sparseRoc.png and /dev/null differ diff --git a/docs/articles/sparseRoc.webp b/docs/articles/sparseRoc.webp new file mode 100644 index 000000000..2ea3ea56f Binary files /dev/null and b/docs/articles/sparseRoc.webp differ diff --git a/docs/articles/studydesign.png b/docs/articles/studydesign.png deleted file mode 100644 index 453f4aadd..000000000 Binary files a/docs/articles/studydesign.png and /dev/null differ diff --git a/docs/articles/studydesign.webp b/docs/articles/studydesign.webp new file mode 100644 index 000000000..28717c7d2 Binary files /dev/null and b/docs/articles/studydesign.webp differ diff --git a/docs/articles/variableScatterplot.png b/docs/articles/variableScatterplot.png deleted file mode 100644 index bdcf0df4a..000000000 Binary files a/docs/articles/variableScatterplot.png and /dev/null differ diff --git a/docs/articles/variableScatterplot.webp 
b/docs/articles/variableScatterplot.webp new file mode 100644 index 000000000..de6f8999d Binary files /dev/null and b/docs/articles/variableScatterplot.webp differ diff --git a/docs/authors.html b/docs/authors.html index 31e2e4ba2..caa14ab0f 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -1,66 +1,12 @@ - - - - - - - -Citation and Authors • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Authors and Citation • PatientLevelPrediction - - + + - - -
-
- -
- -
+
-
-
- +
- - + + diff --git a/docs/index.html b/docs/index.html index d889ddc26..e9e2def0d 100644 --- a/docs/index.html +++ b/docs/index.html @@ -5,16 +5,16 @@ -Package for patient level prediction using data in the OMOP Common Data +<title>Developing patient level prediction using data in the OMOP Common Data Model • PatientLevelPrediction - - + +
-
- - -

PatientLevelPrediction is part of HADES.

+
+ +

Build Status

+

codecov.io

+

PatientLevelPrediction is part of HADES.

-
-

-Introduction

+
+

Introduction +

PatientLevelPrediction is an R package for building and validating patient-level predictive models using data in the OMOP Common Data Model format.

-

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

+

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

The figure below illustrates the prediction problem we address. Among a population at risk, we aim to predict which patients at a defined moment in time (t = 0) will experience some outcome during a time-at-risk. Prediction is done using only information about the patients in an observation window prior to that moment in time.

-

+

To define a prediction problem we have to define t=0 by a Target Cohort (T), the outcome we like to predict by an outcome cohort (O), and the time-at-risk (TAR). Furthermore, we have to make design choices for the model we like to develop, and determine the observational datasets to perform internal and external validation. This conceptual framework works for all type of prediction problems, for example those presented below (T=green, O=red).

-

+

-
-

-Features

+
+

Features +

  • Takes one or more target cohorts (Ts) and one or more outcome cohorts (Os) and develops and validates models for all T and O combinations.
  • Allows for multiple prediction design options.
  • Extracts the necessary data from a database in OMOP Common Data Model format for multiple covariate settings.
  • Uses a large set of covariates including for example all drugs, diagnoses, procedures, as well as age, comorbidity indexes, and custom covariates.
  • -
  • Includes a large number of state-of-the-art machine learning algorithms that can be used to develop predictive models, including Regularized logistic regression, Random forest, Gradient boosting machines, Decision tree, Naive Bayes, K-nearest neighbours, Neural network and Deep learning (Convolutional neural networks, Recurrent neural network and Deep nets).
  • +
  • Allows you to add custom covariates or cohort covariates.
  • +
  • Includes a large number of state-of-the-art machine learning algorithms that can be used to develop predictive models, including Regularized logistic regression, Random forest, Gradient boosting machines, Decision tree, Naive Bayes, K-nearest neighbours, Neural network, AdaBoost and Support vector machines.
  • Allows you to add custom algorithms.
  • +
  • Allows you to add custom feature engineering
  • +
  • Allows you to add custom under/over sampling (or any other sampling) [note: based on existing research this is not recommended]
  • Contains functionality to externally validate models.
  • Includes functions to plot and explore model performance (ROC + Calibration).
  • -
  • Includes a shiny app to interactively view and explore results.
  • -
  • Implements existing models.
  • -
  • Builds ensemble models.
  • -
  • Builds Deep Learning models.
  • +
  • Build ensemble models using EnsemblePatientLevelPrediction.
  • +
  • Build Deep Learning models using DeepPatientLevelPrediction.
  • Generates learning curves.
  • -
  • Automatically creates a word document containing all the study results.
  • +
  • Includes a shiny app to interactively view and explore results.
  • +
  • In the shiny app you can create a html file document (report or protocol) containing all the study results.
-
-

-Screenshots

+
+

Screenshots +

@@ -198,81 +215,79 @@

-

Calibration plot

+

Calibration plot

-

ROC plot

+

ROC plot

Demo of the Shiny Apps can be found here:

-
-

-Technology

-

PatientLevelPrediction is an R package, with some functions implemented in C++ and python.

+
+

Technology +

+

PatientLevelPrediction is an R package, with some functions using python through reticulate.

-
-

-System Requirements

-

Requires R (version 3.3.0 or higher). Installation on Windows requires RTools. Libraries used in PatientLevelPrediction require Java and Python.

-

The python installation is required for some of the machine learning algorithms. We advise to install Python 3.7 using Anaconda (https://www.continuum.io/downloads).

+
+

System Requirements +

+

Requires R (version 4.0 or higher). Installation on Windows requires RTools. Libraries used in PatientLevelPrediction require Java and Python.

+

The python installation is required for some of the machine learning algorithms. We advise to install Python 3.8 or higher using Anaconda (https://www.continuum.io/downloads).

-
-

-Getting Started

+ -
-

-User Documentation

-

Documentation can be found on the package website.

+
+

User Documentation +

+

Documentation can be found on the package website.

PDF versions of the documentation are also available, as mentioned above.

-
-

-Support

+
+

Support +

-
-

-Contributing

-

Read here how you can contribute to this package.

+
+

Contributing +

+

Read here how you can contribute to this package.

-
-

-License

+
+

License +

PatientLevelPrediction is licensed under Apache License 2.0

-
-

-Development

+
+

Development +

PatientLevelPrediction is being developed in R Studio.

-

Beta

-
-

-Acknowledgements

+
+

Acknowledgements +

  • The package is maintained by Jenna Reps and Peter Rijnbeek and has been developed with major contributions from Martijn Schuemie, Patrick Ryan, and Marc Suchard.
  • -
  • We like to thank the following persons for their contributions to the package: Seng Chan You, Ross Williams, Henrik John, Xiaoyong Pan, James Wiggins.
  • +
  • We like to thank the following persons for their contributions to the package: Seng Chan You, Ross Williams, Henrik John, Xiaoyong Pan, James Wiggins, Egill Fridgeirsson, Alex Rekkas
  • This project is supported in part through the National Science Foundation grant IIS 1251151.
@@ -281,56 +296,55 @@

+ + +

-

Site built with pkgdown 1.6.1.

+

+

Site built with pkgdown 2.0.7.

@@ -339,5 +353,7 @@

Dev status

+ + diff --git a/docs/news/index.html b/docs/news/index.html index 366774228..10eee02ad 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -1,66 +1,12 @@ - - - - - - - -Changelog • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Changelog • PatientLevelPrediction - + + - - - -
-
- -
- -
+
-
-

-PatientLevelPrediction 5.0.3

-
    -
  • updated result schema (added model_design table with settings and added attrition table)
  • +
    + +
    • Hotfix adding schema to DatabaseConnector::getTableNames when creating results tables
    • +
    +
    + +
    • Add support for R4.4
    • +
    • Fix notes around documentation (vignette engine and brackets in itemize)
    • +
    • Use webp image format where possible (not in pdfs) for smaller size
    • +
    • Make sure random table names are unique in tests
    • +
    • Remove remote info for Eunomia since it’s in CRAN
    • +
    +
    + +
    • Clean up dependencies, tibble removed and IHT and ParallelLogger from CRAN
    • +
    • Use cohortIds for cohortCovariates to comply with FeatureExtraction
    • +
    • Add cdmDatabaseName from DatabaseDetails to model output
    • +
    • Fix bug when attributes weren’t preserved on trainData$covariateData after split
    • +
    • Fix warnings in tests and speed them up
    • +
    • Fix bug in assignment operator in configurePython
    • +
    • Delay evaluation of plpData when using do.call like in learningCurves and runMultiplePlp
    • +
    • Speed up population generation when subjectId’s are distinct
    • +
    • Fix bug when population was still generated when provided to runPlp
    • +
    +
    + +
    • fix bug with ohdsi shiny modules version check (issue 415)
    • +
    +
    + +
    • Fix sklearnToJson to be compatible with scikit-learn>=1.3
    • +
    • Fix github actions so it’s not hardcoded to use python 3.7
    • +
    +
    + +
    • added spline feature engineering
    • +
    • added age/sex stratified imputation feature engineering
    • +
    • changed result table execution date types to varchar
    • +
    • updated covariateSummary to use feature engineering
    • +
    +
    + +
    • fixed bug introduced with new reticulate update in model saving to json tests
    • +
    +
    + +
    • fixed bug with database insert if result is incomplete
    • +
    • updated/fixed documentation (Egill)
    • +
    • added model path to models (Henrik)
    • +
    • updated hyper-parameter saving to data.frame and made consistent
    • +
    +
    + +
    • fixed bug with multiple covariate settings in diagnose plp
    • +
    • added min cell count when exporting database results to csv files
    • +
    • light GBM added (thanks Jin Choi and Chungsoo Kim)
    • +
    • fixed minor bugs when uploading results to database
    • +
    +
    + +
    • added ensure_installed(“ResultModelManager”) to getDataMigrator()
    • +
    +
    + +
    • shiny app is now using ShinyAppBuilder with a config saved in the /inst folder
    • +
    +
    + +
    • fixed bugs introduced when sklearn inputs changed
    • +
    • added sklearn model being saved as jsons
    • +
    • made changes around the DatabaseConnection get table names function to make it work for the updated DatabaseConnection
    • +
    • removed check RAM stop (now it just warns)
    • +
    +
    + +
    • Updated test to skip test for FE setting if the model does not fit (this was causing occasional test fail)
    • +
    • replaced .data$ with “” for all dplyr::select to remove warnings
    • +
    +
    + +
    • Fix bug with python type being required to be int
    • +
    +
    + +
    • Allow priorType to be passed down to getCV function in case prior is not ‘laplace’
    • +
    • Seed specified in Cyclops model wasn’t passed to Cyclops
    • +
    +
    + +
    • fixed issue with shiny viewer converting connection details to large json
    • +
    +
    + +
    • added check for cdmDatabaseId into createDatabaseDetails
    • +
    • added test for check for cdmDatabaseId into createDatabaseDetails to error when NULL
    • +
    • removed session$onSessionEnded(shiny::stopApp) from shiny server
    • +
    +
    + +
    • fixing cox predictions
    • +
    +
    + +
    • forcing cdmDatabaseId to be a string if integer is input
    • +
    +
    + +
    • replaced utils::read.csv with readr::read_csv when inserting results from csv
    • +
    +
    + +
    • replaced gsub with sub when inserting csvs to database
    • +
    +
    + +
    • saved result specification csv in windows to fix odd formating issue
    • +
    +
    + +
    • fixed sample data bugs
    • +
    • updated to use v1.0.0 of OhdsiShinyModules
    • +
    • updated plp database result tables to use the same structure for cohort and database as other HADES packages
    • +
    • added function to insert csv results into plp database result tables
    • +
    • added input for databaseId (database and version) when extracting data to be consistent with other HADES packages. This is saved in plp objects.
    • +
    +
    + +
    • fixed issue with ‘preprocess’ vs ‘preprocessing’ inconsistently used across models
    • +
    • added metaData tracking for feature engineering or preprocessing when predicting
    • +
    • fixed issue with FE using trainData$covariateData metaData rather than trainData
    • +
    • fixed bug when using sameData for FE
    • +
    +
    + +
    • pulled in multiple bug fixes and test improvements from Egill
    • +
    • pulled in fix for learning curves from Henrik
    • +
    • Pulled in fix for feature engineering from Solomon
    • +
    • Cleaned check messages about comparing class(x) with a string by changing to inherits()
    • +
    +
    + +
    • removed json saving for sklearn models since sklearn-json is no longer working for the latest sklearn
    • +
    +
    + +
    • renamed the input corresponding to the string that gets appended to the results table names to tablePrefix
    • +
    • fixed issues with system.file() from SqlRender code breaking the tests
    • +
    • added an input fileAppend to the function that exports the database tables to csv files
    • +
    • moved the plp model (including preprocessing details) outside of the result database (into a specified folder) due to the size of the objects (too large to insert into the database).
    • +
    +
    + +
    • added saving of plp models into the result database
    • +
    • added default cohortDefinitions in runMultiplePlp
    • +
    +
    + +
    • added modelType to all models for database upload
    • +
    +
    + +
    • moved FeatureExtraction to depends
    • +
    • fixed using inherits()
    • +
    +
    + +
    • moved most of the shiny app code into OhdsiShinyModules
    • +
    • removed shiny dependencies and added OhdsiShinyModules to suggests
    • +
    • fixed bug with linux sklearn saving
    • +
    +
    + +
    • replaced cohortId to targetId for consistency throughout code
    • +
    +
    + +
    • replaced targetId in model design to cohortId for consistency throughout code
    • +
    • replaced plpDataSettings to restrictPlpDataSettings to improve naming consistency
    • +
    • added ability to use initial population in runPlp by adding the population to plpData$population
    • +
    • added splitSettings into modelDesign
    • +
    • replaced saving json settings with ParallelLogger function
    • +
    • updated database result schema (removed researcher_id from tables - if desired a new table with the setting_ids and researcher_id could be added, removed study tables and revised results table to performances table with a reference to model_design_id and development_database_id to enable validation results without a model to be inserted)
    • +
    • added diagnostic code based on PROBAST
    • +
    • added diagnostic shiny module
    • +
    • added code to create sqlite database and populate in uploadToDatabase
    • +
    • add code to convert runPlp+val to sqlite database when viewing shiny
    • +
    • added code to extract database results into csv files: extractDatabaseToCsv()
    • +
    +
    + +
    • pulled in GBM update (default hyper-parameters and variable importance fix) work done by Egill (egillax)
    • +
    +
    + +
    • updated installation documents
    • +
    • added tryCatch around plots to prevent code stopping
    • +
    +
    + +
    • updated result schema (added model_design table with settings and added attrition table)
    • updated shiny app for new database result schema
    • removed C++ code for AUC and Rcpp dependency, now using pROC instead as faster
    • made covariate summary optional when externally validating
    • -
    -
    -
    -

    -PatientLevelPrediction 5.0.2

    -
      -
    • updated json structure for specifying study design (made it friendlier to read)
    • -
    • includes smooth calibration plot fix (done by Alex)
    • +
    +
    + +
    • updated json structure for specifying study design (made it friendlier to read)
    • +
    • includes smooth calibration plot fix - work done by Alex (rekkasa)
    • fixed bug with multiple sample methods or feature engineering settings causing invalid error
    • -
    -
    -
    -

    -PatientLevelPrediction 5.0.0

    -
      -
    • plpModel now saved as json files when possible
    • +
    +
    + +
    • plpModel now saved as json files when possible
    • Updated runPlp to make more modular
    • now possible to customise data splitting, feature engineering, sampling (over/under) and learning algorithm
    • added function for extracting cohort covariates
    • @@ -213,413 +351,256 @@

    • interface to cyclops code revised
    • ensemble learning removed (will be in separate package)
    • deep learning removed (will be in DeepPatientLevelPrediction package)
    • -

    -
    -
    -

    -PatientLevelPrediction 4.4.2

    -
      -
    • revised toSparseM() to do conversion in one go but check RAM availablility beforehand.
    • +
    +
    + +
    • revised toSparseM() to do conversion in one go but check RAM availablility beforehand.
    • removed temporal plpData conversion in toSparseM (this will be done in DeepPatientLevelPrediction)
    • -
    -
    -
    -

    -PatientLevelPrediction 4.4.1

    -
      -
    • shiny can now read csv results
    • +
    +
    + +
    • shiny can now read csv results
    • objects loaded via loadPlpFromCsv() can be saved using savePlpResult()
    • -
    -
    -
    -

    -PatientLevelPrediction 4.4.0

    -
      -
    • added database result storage
    • +
    +
    + +
    • added database result storage
    • added interface to database results in shiny
    • merged in shinyRepo that changed the shiny app to make it modular and added new features
    • removed deep learning as this is being added into new OHDSI package DeepPatientLevelPrediction
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.10

    -
      -
    • save xgboost model as json file for transparency
    • +
    +
    + +
    • save xgboost model as json file for transparency
    • set connectionDetails to NULL in getPlpData
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.9

    -
      -
    • updated andromeda functions - restrict to pop and tidy covs for speed
    • +
    +
    + +
    • updated andromeda functions - restrict to pop and tidy covs for speed
    • quick fix for GBM survival predicting negative values
    • fixed occasional demoSum error for survival models
    • updated index creation to use Andromeda function
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.8

    -
      -
    • fixed bug when normalize data is false
    • +
    +
    + +
    • fixed bug when normalize data is false
    • fixed bugs when single feature (gbm + python)
    • updated GBM
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.7

    -
      -
    • updated calibration slope
    • +
    +
    + +
    • updated calibration slope
    • fixed missing age/gender in prediction
    • fixed shiny intercept bug
    • fixed diagnostic
    • fixed missing covariateSettings in load cvs plp
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.6

    -
      -
    • Removed plpData from evaluation
    • +
    +
    + +
    • Removed plpData from evaluation
    • Added recalibration into externalVal
    • Updated shiny app for recalibration
    • Added population creation setting to use cohortEndDate as timeAtRisk end
    • fixed tests
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.3

    -
      -
    • Reduced imports by adding code to install some dependencies when used
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.2

    -
      -
    • fixed csv result saving bug when no model param
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.1

    -
      -
    • fixed r check vignette issues
    • +
    +
    + +
    • Reduced imports by adding code to install some dependencies when used
    • +
    +
    + +
    • fixed csv result saving bug when no model param
    • +
    +
    + +
    • fixed r check vignette issues
    • added conda install to test
    • -
    -
    -
    -

    -PatientLevelPrediction 4.3.0

    -
      -
    • finalised permutation feature importance
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.10

    -
      -
    • fixed deepNN index issue (reported on github - thanks dapritchard)
    • +
    +
    + +
    • finalised permutation feature importance
    • +
    +
    + +
    • fixed deepNN index issue (reported on github - thanks dapritchard)
    • add compression to python pickles
    • removed requirement to have outcomeCount for prediction with python models
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.9

    -
      -
    • cleaned all checks
    • +
    +
    + +
    • cleaned all checks
    • fixed bug in python toSparseMatrix
    • fixed warning in studyPop
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.8

    -
      -
    • fixed bug (identified by Chungsoo) in covariateSummary
    • +
    +
    + +
    • fixed bug (identified by Chungsoo) in covariateSummary
    • fixed bug with thresholdSummary
    • edited threshold summary function to make it cleaner
    • added to ensemble where you can combine multiple models into an ensemble
    • cleaned up the notes and tests
    • updated simulated data covariateId in tests to use integer64
    • fixed description imports (and sorted them)
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.7

    -
      -
    • fixed Cox model calibration plots
    • +
    +
    + +
    • fixed Cox model calibration plots
    • fixed int64 conversion bug
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.6

    -
      -
    • added baseline risk to Cox model
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.3

    -
      -
    • updated shiny: added attrition and hyper-parameter grid search into settings
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.2

    -
      -
    • updated shiny app added 95% CI to AUC in summary, size is now complete data size and there is a column valPercent that tells what percentage of the data were used for validation
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.1

    -
      -
    • updated GBMsurvival to use survival metrics and c-stat
    • -
    -
    -
    -

    -PatientLevelPrediction 4.2.0

    -
      -
    • added survival metrics
    • -
    -
    -
    -

    -PatientLevelPrediction 4.1.0

    -
      -
    • added updates and fixes into master from development branch
    • -
    -
    -
    -

    -PatientLevelPrediction 4.0.6

    -
      -
    • fixed bug with pdw data extraction due to multiple person_id columns
    • +
    +
    + +
    • added baseline risk to Cox model
    • +
    +
    + +
    • updated shiny: added attrition and hyper-parameter grid search into settings
    • +
    +
    + +
    • updated shiny app added 95% CI to AUC in summary, size is now complete data size and there is a column valPercent that tells what percentage of the data were used for validation
    • +
    +
    + +
    • updated GBMsurvival to use survival metrics and c-stat
    • +
    +
    + +
    • added survival metrics
    • +
    +
    + +
    • added updates and fixes into master from development branch
    • +
    +
    + +
    • fixed bug with pdw data extraction due to multiple person_id columns
    • fixed bug in shiny app converting covariate values due to tibble
    • -
    -
    -
    -

    -PatientLevelPrediction 4.0.5

    -
      -
    • added calibration updates: cal-in-large, weak cal
    • +
    +
    + +
    • added calibration updates: cal-in-large, weak cal
    • updated smooth cal plot (sample for speed in big data)
    • defaulted to 100 values in calibrationSummary + updated cal plot
    • -
    -
    -
    -

    -PatientLevelPrediction 4.0.4

    -
      -
    • fixed backwards compat with normalization
    • +
    +
    + +
    • fixed backwards compat with normalization
    • fixed python joblib dependancy
    • -
    -
    -
    -

    -PatientLevelPrediction 4.0.2

    -
      -
    • fixed bug in preprocessing
    • +
    +
    + +
    • fixed bug in preprocessing
    • added cross validation aucs to LR, GBM, RF and MLP
    • added more settings into MLP
    • added threads option in LR
    • -
    -
    -
    -

    -PatientLevelPrediction 4.0.1

    -
      -
    • fixed minor bug with shiny dependency
    • +
    +
    + +
    • fixed minor bug with shiny dependency
    • fixed some tests
    • added standardizedMeanDiff to covariatesummary
    • updated createStudyPopulation to make it cleaner to read and count outcome per TAR
    • -
    -
    -
    -

    -PatientLevelPrediction 4.0.0

    -
      -
    • Andromeda replaced ff data objects
    • +
    +
    + +
    • Andromeda replaced ff data objects
    • added age/gender into cohort
    • fixed python warnings
    • updated shiny plp viewer
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.16

    -
      -
    • Fixed bug when running multiple analyses using a data extraction sample with multiple covariate settings
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.15

    -
      -
    • improved shiny PLP viewer
    • +
    +
    + +
    • Fixed bug when running multiple analyses using a data extraction sample with multiple covariate settings
    • +
    +
    + +
    • improved shiny PLP viewer
    • added diagnostic shiny viewer
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.14

    -
      -
    • updated external validate code to enable custom covariates using ATLAS cohorts
    • +
    +
    + +
    • updated external validate code to enable custom covariates using ATLAS cohorts
    • fixed issues with startAnchor and endAnchor
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.13

    -
      -
    • Deprecating addExposureDaysToStart and addExposureDaysToEnd arguments in createStudyPopulation, adding new arguments called startAnchor and endAnchor. The hope is this is less confusing.
    • +
    +
    + +
    • Deprecating addExposureDaysToStart and addExposureDaysToEnd arguments in createStudyPopulation, adding new arguments called startAnchor and endAnchor. The hope is this is less confusing.
    • fixed transfer learning code (can now transfer or fine-tune model)
    • made view plp shiny apps work when some results are missing
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.12

    -
      -
    • set up testing
    • +
    +
    + +
    • set up testing
    • fixed build warnings
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.11

    -
      -
    • added tests to get >70% coverage (keras tests too slow for travis)
    • +
    +
    + +
    • added tests to get >70% coverage (keras tests too slow for travis)
    • Fixed minor bugs
    • Fixed deep learning code and removed pythonInR dependancy
    • combined shiny into one file with one interface
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.10

    -
      -
    • added recalibration using 25% sample in existing models
    • +
    +
    + +
    • added recalibration using 25% sample in existing models
    • added option to provide score to probabilities for existing models
    • fixed warnings with some plots
    • -
    -
    -
    -

    -PatientLevelPrediction 3.0.9

    -

    Small bug fixes:

    -
      -
    • added analysisId into model saving/loading
    • -
    • made external validation saving recursive
    • -
    • added removal of patients with negative TAR when creating population
    • -
    • added option to apply model without preprocessing settings (make them NULL)
    • -
    • updated create study population to remove patients with negative time-at-risk
    • -
    +
+
+ +

Small bug fixes: - added analysisId into model saving/loading - made external validation saving recursive - added removal of patients with negative TAR when creating population - added option to apply model without preprocessing settings (make them NULL) - updated create study population to remove patients with negative time-at-risk

-
-

-PatientLevelPrediction 3.0.8

-

Changes:

-
    -
  • merged in bug fix from Martijn - fixed AUC bug causing crash with big data
  • -
  • update SQL code to be compatible with v6.0 OMOP CDM
  • -
  • added save option to external validate PLP
  • -
+
+ +

Changes: - merged in bug fix from Martijn - fixed AUC bug causing crash with big data - update SQL code to be compatible with v6.0 OMOP CDM - added save option to external validate PLP

-
-

-PatientLevelPrediction 3.0.7

-

Changes:

-
    -
  • Updated splitting functions to include a splitby subject and renamed personSplitter to randomSplitter
  • -
  • Cast indices to integer in python functions to fix bug with non integer sparse matrix indices
  • -
+
+ +

Changes: - Updated splitting functions to include a splitby subject and renamed personSplitter to randomSplitter - Cast indices to integer in python functions to fix bug with non integer sparse matrix indices

-
-

-PatientLevelPrediction 3.0.5

-

Changes:

-
    -
  • Added GLM status to log (will now inform about any fitting issue in log)
  • -
  • Added GBM survival model (still under development)
  • -
  • Added RF quantile regression (still under development)
  • -
  • Updated viewMultiplePlp() to match PLP skeleton package app
  • -
  • Updated single plp vignette with additional example
  • -
  • Merge in deep learning updates from Chan
  • -
+
+ +

Changes: - Added GLM status to log (will now inform about any fitting issue in log) - Added GBM survival model (still under development) - Added RF quantile regression (still under development) - Updated viewMultiplePlp() to match PLP skeleton package app - Updated single plp vignette with additional example - Merge in deep learning updates from Chan

-
-

-PatientLevelPrediction 3.0.4

-

Changes:

-
    -
  • Updated website
  • -
+
+ +

Changes: - Updated website

-
-

-PatientLevelPrediction 3.0.3

-

Changes:

-
    -
  • Added more tests
  • -
  • test files now match R files
  • -
+
+ +

Changes: - Added more tests - test files now match R files

-
-

-PatientLevelPrediction 3.0.2

-

Changes:

-
    -
  • Fixed ensemble stacker
  • -
+
+ +

Changes: - Fixed ensemble stacker

-
-

-PatientLevelPrediction 3.0.1

-

Changes:

-
    -
  • Using reticulate for python interface
  • -
  • Speed improvements
  • -
  • Bug fixes
  • -
+
+ +

Changes: - Using reticulate for python interface - Speed improvements - Bug fixes

+
-
- +
- - + + diff --git a/docs/pkgdown.css b/docs/pkgdown.css index 1273238dd..80ea5b838 100644 --- a/docs/pkgdown.css +++ b/docs/pkgdown.css @@ -56,8 +56,10 @@ img.icon { float: right; } -img { +/* Ensure in-page images don't run outside their container */ +.contents img { max-width: 100%; + height: auto; } /* Fix bug in bootstrap (only seen in firefox) */ @@ -78,11 +80,10 @@ dd { /* Section anchors ---------------------------------*/ a.anchor { - margin-left: -30px; - display:inline-block; - width: 30px; - height: 30px; - visibility: hidden; + display: none; + margin-left: 5px; + width: 20px; + height: 20px; background-image: url(./link.svg); background-repeat: no-repeat; @@ -90,17 +91,15 @@ a.anchor { background-position: center center; } -.hasAnchor:hover a.anchor { - visibility: visible; -} - -@media (max-width: 767px) { - .hasAnchor:hover a.anchor { - visibility: hidden; - } +h1:hover .anchor, +h2:hover .anchor, +h3:hover .anchor, +h4:hover .anchor, +h5:hover .anchor, +h6:hover .anchor { + display: inline-block; } - /* Fixes for fixed navbar --------------------------*/ .contents h1, .contents h2, .contents h3, .contents h4 { @@ -264,31 +263,26 @@ table { /* Syntax highlighting ---------------------------------------------------- */ -pre { - word-wrap: normal; - word-break: normal; - border: 1px solid #eee; -} - -pre, code { +pre, code, pre code { background-color: #f8f8f8; color: #333; } +pre, pre code { + white-space: pre-wrap; + word-break: break-all; + overflow-wrap: break-word; +} -pre code { - overflow: auto; - word-wrap: normal; - white-space: pre; +pre { + border: 1px solid #eee; } -pre .img { +pre .img, pre .r-plt { margin: 5px 0; } -pre .img img { +pre .img img, pre .r-plt img { background-color: #fff; - display: block; - height: auto; } code a, pre a { @@ -305,9 +299,8 @@ a.sourceLine:hover { .kw {color: #264D66;} /* keyword */ .co {color: #888888;} /* comment */ -.message { color: black; font-weight: bolder;} -.error { color: orange; font-weight: 
bolder;} -.warning { color: #6A0366; font-weight: bolder;} +.error {font-weight: bolder;} +.warning {font-weight: bolder;} /* Clipboard --------------------------*/ @@ -365,3 +358,27 @@ mark { content: ""; } } + +/* Section anchors --------------------------------- + Added in pandoc 2.11: https://github.com/jgm/pandoc-templates/commit/9904bf71 +*/ + +div.csl-bib-body { } +div.csl-entry { + clear: both; +} +.hanging-indent div.csl-entry { + margin-left:2em; + text-indent:-2em; +} +div.csl-left-margin { + min-width:2em; + float:left; +} +div.csl-right-inline { + margin-left:2em; + padding-left:1em; +} +div.csl-indent { + margin-left: 2em; +} diff --git a/docs/pkgdown.js b/docs/pkgdown.js index 7e7048fae..6f0eee40b 100644 --- a/docs/pkgdown.js +++ b/docs/pkgdown.js @@ -80,7 +80,7 @@ $(document).ready(function() { var copyButton = ""; - $(".examples, div.sourceCode").addClass("hasCopyButton"); + $("div.sourceCode").addClass("hasCopyButton"); // Insert copy buttons: $(copyButton).prependTo(".hasCopyButton"); @@ -91,7 +91,7 @@ // Initialize clipboard: var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { text: function(trigger) { - return trigger.parentNode.textContent; + return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, ""); } }); diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 7f8ad5eb7..24865cad6 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -1,20 +1,20 @@ -pandoc: 2.11.4 -pkgdown: 1.6.1 +pandoc: 3.1.11 +pkgdown: 2.0.7 pkgdown_sha: ~ articles: AddingCustomFeatureEngineering: AddingCustomFeatureEngineering.html AddingCustomModels: AddingCustomModels.html AddingCustomSamples: AddingCustomSamples.html AddingCustomSplitting: AddingCustomSplitting.html + BenchmarkTasks: BenchmarkTasks.html BestPractices: BestPractices.html - BuildingEnsembleModels: BuildingEnsembleModels.html BuildingMultiplePredictiveModels: BuildingMultiplePredictiveModels.html BuildingPredictiveModels: BuildingPredictiveModels.html + ClinicalModels: 
ClinicalModels.html + ConstrainedPredictors: ConstrainedPredictors.html CreatingLearningCurves: CreatingLearningCurves.html CreatingNetworkStudies: CreatingNetworkStudies.html - CreatingShinyApp: CreatingShinyApp.html InstallationGuide: InstallationGuide.html - PatientLevelPrediction: PatientLevelPrediction.html Videos: Videos.html -last_built: 2022-03-09T19:04Z +last_built: 2024-09-09T14:25Z diff --git a/docs/reference/MapIds.html b/docs/reference/MapIds.html new file mode 100644 index 000000000..5cb0b7906 --- /dev/null +++ b/docs/reference/MapIds.html @@ -0,0 +1,178 @@ + +Map covariate and row Ids so they start from 1 — MapIds • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

this functions takes covariate data and a cohort/population and remaps +the covariate and row ids, restricts to pop and saves/creates mapping

+
+ +
+
MapIds(covariateData, cohort = NULL, mapping = NULL)
+
+ +
+

Arguments

+
covariateData
+

a covariateData object

+ + +
cohort
+

if specified rowIds restricted to the ones in cohort

+ + +
mapping
+

A pre defined mapping to use

+ +
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/PatientLevelPrediction.html b/docs/reference/PatientLevelPrediction.html index 635b9807e..77e24885e 100644 --- a/docs/reference/PatientLevelPrediction.html +++ b/docs/reference/PatientLevelPrediction.html @@ -1,67 +1,12 @@ - - - - - - - -PatientLevelPrediction — PatientLevelPrediction • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -PatientLevelPrediction — PatientLevelPrediction • PatientLevelPrediction - - + + - - - -
- +
- - + + diff --git a/docs/reference/accuracy.html b/docs/reference/accuracy.html index 92d9a6902..111a58e86 100644 --- a/docs/reference/accuracy.html +++ b/docs/reference/accuracy.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate the accuracy — accuracy • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the accuracy — accuracy • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,61 +131,60 @@

Calculate the accuracy

Calculate the accuracy

-
accuracy(TP, TN, FN, FP)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
TP

Number of true positives

TN

Number of true negatives

FN

Number of false negatives

FP

Number of false positives

- -

Value

- -

accuracy value

-

Details

+
+
accuracy(TP, TN, FN, FP)
+
+
+

Arguments

+
TP
+

Number of true positives

+ + +
TN
+

Number of true negatives

+ + +
FN
+

Number of false negatives

+ + +
FP
+

Number of false positives

+ +
+
+

Value

+ + +

accuracy value

+
+
+

Details

Calculate the accuracy

+
+
-
- +
- - + + diff --git a/docs/reference/addDiagnosePlpToDatabase.html b/docs/reference/addDiagnosePlpToDatabase.html new file mode 100644 index 000000000..31b3a41d3 --- /dev/null +++ b/docs/reference/addDiagnosePlpToDatabase.html @@ -0,0 +1,207 @@ + +Insert a diagnostic result into a PLP result schema database — addDiagnosePlpToDatabase • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

This function inserts a diagnostic result into the result schema

+
+ +
+
addDiagnosePlpToDatabase(
+  diagnosePlp,
+  connectionDetails,
+  databaseSchemaSettings,
+  cohortDefinitions,
+  databaseList = NULL,
+  overWriteIfExists = T
+)
+
+ +
+

Arguments

+
diagnosePlp
+

An object of class diagnosePlp

+ + +
connectionDetails
+

A connection details created by using the +function createConnectionDetails in the +DatabaseConnector package.

+ + +
databaseSchemaSettings
+

A object created by createDatabaseSchemaSettings with all the settings specifying the result tables

+ + +
cohortDefinitions
+

A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet()

+ + +
databaseList
+

(Optional) If you wish to overwrite the settings in the plp object use createdatabaseList to specify the databases

+ + +
overWriteIfExists
+

(default: T) Whether to delete existing results and overwrite them

+ +
+
+

Value

+ + +

Returns NULL but uploads the diagnostic into the database schema specified in databaseSchemaSettings

+
+
+

Details

+

This function can be used to upload a diagnostic result into a database

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/addMultipleDiagnosePlpToDatabase.html b/docs/reference/addMultipleDiagnosePlpToDatabase.html new file mode 100644 index 000000000..56ee9c4c8 --- /dev/null +++ b/docs/reference/addMultipleDiagnosePlpToDatabase.html @@ -0,0 +1,202 @@ + +Insert mutliple diagnosePlp results saved to a directory into a PLP result schema database — addMultipleDiagnosePlpToDatabase • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

This function inserts diagnosePlp results into the result schema

+
+ +
+
addMultipleDiagnosePlpToDatabase(
+  connectionDetails,
+  databaseSchemaSettings,
+  cohortDefinitions,
+  databaseList = NULL,
+  resultLocation
+)
+
+ +
+

Arguments

+
connectionDetails
+

A connection details created by using the +function createConnectionDetails in the +DatabaseConnector package.

+ + +
databaseSchemaSettings
+

A object created by createDatabaseSchemaSettings with all the settings specifying the result tables

+ + +
cohortDefinitions
+

(list) A list of cohortDefinitions (each list must contain: name, id)

+ + +
databaseList
+

(Optional) ...

+ + +
resultLocation
+

The location of the diagnostic results

+ +
+
+

Value

+ + +

Returns NULL but uploads multiple diagnosePlp results into the database schema specified in databaseSchemaSettings

+
+
+

Details

+

This function can be used to upload diagnosePlp results into a database

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/addMultipleRunPlpToDatabase.html b/docs/reference/addMultipleRunPlpToDatabase.html new file mode 100644 index 000000000..47b53a4c0 --- /dev/null +++ b/docs/reference/addMultipleRunPlpToDatabase.html @@ -0,0 +1,212 @@ + +Populate the PatientLevelPrediction results tables — addMultipleRunPlpToDatabase • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

This function formats and uploads results that have been generated via an ATLAS prediction package into a database

+
+ +
+
addMultipleRunPlpToDatabase(
+  connectionDetails,
+  databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = "main"),
+  cohortDefinitions,
+  databaseList = NULL,
+  resultLocation = NULL,
+  resultLocationVector,
+  modelSaveLocation
+)
+
+ +
+

Arguments

+
connectionDetails
+

A connection details created by using the +function createConnectionDetails in the +DatabaseConnector package.

+ + +
databaseSchemaSettings
+

A object created by createDatabaseSchemaSettings with all the settings specifying the result tables

+ + +
cohortDefinitions
+

A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet()

+ + +
databaseList
+

(Optional) A list created by createDatabaseList to specify the databases

+ + +
resultLocation
+

(string) location of directory where the main package results were saved

+ + +
resultLocationVector
+

(only used when resultLocation is missing) a vector of locations with development or validation results

+ + +
modelSaveLocation
+

The location of the file system for saving the models in a subdirectory

+ +
+
+

Value

+ + +

Returns NULL but uploads all the results in resultLocation to the PatientLevelPrediction result tables in resultSchema

+
+
+

Details

+

This function can be used upload PatientLevelPrediction results into a database

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/addRecalibration.html b/docs/reference/addRecalibration.html deleted file mode 100644 index 2b920805e..000000000 --- a/docs/reference/addRecalibration.html +++ /dev/null @@ -1,229 +0,0 @@ - - - - - - - - -addRecalibration — addRecalibration • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Adds the recalibration results to the main results

-
- -
addRecalibration(performanceEvaluation, recalibration)
- -

Arguments

- - - - - - - - - - -
performanceEvaluation

The main result performanceEvaluation

recalibration

The recalibration result

- -

Value

- -

An object of class runPlp that is recalibrated on the new data

-

Details

- -

Append the recalibration results into the main results

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/addRunPlpToDatabase.html b/docs/reference/addRunPlpToDatabase.html new file mode 100644 index 000000000..9752d3ec5 --- /dev/null +++ b/docs/reference/addRunPlpToDatabase.html @@ -0,0 +1,207 @@ + +Function to add the run plp (development or validation) to database — addRunPlpToDatabase • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

This function adds a runPlp or external validation result into a database

+
+ +
+
addRunPlpToDatabase(
+  runPlp,
+  connectionDetails,
+  databaseSchemaSettings,
+  cohortDefinitions,
+  modelSaveLocation,
+  databaseList = NULL
+)
+
+ +
+

Arguments

+
runPlp
+

An object of class runPlp or class externalValidatePlp

+ + +
connectionDetails
+

A connection details created by using the +function createConnectionDetails in the +DatabaseConnector package.

+ + +
databaseSchemaSettings
+

A object created by createDatabaseSchemaSettings with all the settings specifying the result tables

+ + +
cohortDefinitions
+

A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet()

+ + +
modelSaveLocation
+

The location of the directory that models will be saved to

+ + +
databaseList
+

(Optional) If you want to change the database name then used createDatabaseList to specify the database settings but use the same cdmDatabaseId was model development/validation

+ +
+
+

Value

+ + +

Returns a data.frame with the database details

+
+
+

Details

+

This function is used when inserting results into the PatientLevelPrediction database results schema

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/applyEnsembleModel.html b/docs/reference/applyEnsembleModel.html deleted file mode 100644 index cc9346c8e..000000000 --- a/docs/reference/applyEnsembleModel.html +++ /dev/null @@ -1,280 +0,0 @@ - - - - - - - - -Apply trained ensemble model on new data Apply a Patient Level Prediction model on Patient Level -Prediction Data and get the predicted risk in [0,1] for each person in the population. If the user -inputs a population with an outcomeCount column then the function also returns the evaluation of -the prediction (AUC, brier score, calibration) — applyEnsembleModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Apply trained ensemble model on new data Apply a Patient Level Prediction model on Patient Level -Prediction Data and get the predicted risk in [0,1] for each person in the population. If the user -inputs a population with an outcomeCount column then the function also returns the evaluation of -the prediction (AUC, brier score, calibration)

-
- -
applyEnsembleModel(
-  population,
-  dataList,
-  ensembleModel,
-  analysisId = NULL,
-  calculatePerformance = T
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
population

The population of people who you want to predict the risk for

dataList

The plpData list for the population

ensembleModel

The trained ensemble model returned by running runEnsembleModel

analysisId

The analysis ID, which is the ID of running ensemble model training.

calculatePerformance

Whether to also calculate the performance metrics [default TRUE]

- - -

Examples

-
if (FALSE) { -# load the model and data -plpData <- loadPlpData("plpdata/") -results <- PatientLevelPrediction::runEnsembleModel(population, - dataList = list(plpData, plpData), - modelList = list(model, model), - testSplit = "person", - testFraction = 0.2, - nfold = 3, - splitSeed = 1000, - ensembleStrategy = "stacked") -# use the same population settings as the model: -populationSettings <- plpModel$populationSettings -populationSettings$plpData <- plpData -population <- do.call(createStudyPopulation, populationSettings) - -# get the prediction, please make sure the ensemble strategy for training and apply is the same: -prediction <- applyEnsembleModel(population, - dataList = list(plpData, plpData), - ensembleModel = results, - analysisId = NULL)$prediction -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/applyModel.html b/docs/reference/applyModel.html deleted file mode 100644 index 2b5feb7af..000000000 --- a/docs/reference/applyModel.html +++ /dev/null @@ -1,281 +0,0 @@ - - - - - - - - -Apply train model on new data -Apply a Patient Level Prediction model on Patient Level Prediction Data and get the predicted risk -in [0,1] for each person in the population. If the user inputs a population with an outcomeCount -column then the function also returns the evaluation of the prediction (AUC, brier score, -calibration) — applyModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Apply train model on new data -Apply a Patient Level Prediction model on Patient Level Prediction Data and get the predicted risk -in [0,1] for each person in the population. If the user inputs a population with an outcomeCount -column then the function also returns the evaluation of the prediction (AUC, brier score, -calibration)

-
- -
applyModel(
-  population,
-  plpData,
-  plpModel,
-  calculatePerformance = T,
-  databaseOutput = NULL,
-  silent = F
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - -
population

The population of people who you want to predict the risk for

plpData

The plpData for the population

plpModel

The trained PatientLevelPrediction model

calculatePerformance

Whether to also calculate the performance metrics [default TRUE]

databaseOutput

Whether to save the details into the prediction database

silent

Whether to turn off progress reporting

- - -

Examples

-
if (FALSE) { -# load the model and data -plpData <- loadPlpData("C:/plpdata") -plpModel <- loadPlpModel("C:/plpmodel") - -# use the same population settings as the model: -populationSettings <- plpModel$populationSettings -populationSettings$plpData <- plpData -population <- do.call(createStudyPopulation, populationSettings) - -# get the prediction: -prediction <- applyModel(population, plpData, plpModel)$prediction -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/averagePrecision.html b/docs/reference/averagePrecision.html index 35065ca7d..658174496 100644 --- a/docs/reference/averagePrecision.html +++ b/docs/reference/averagePrecision.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate the average precision — averagePrecision • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the average precision — averagePrecision • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,49 +131,48 @@

Calculate the average precision

Calculate the average precision

-
averagePrecision(prediction)
- -

Arguments

- - - - - - -
prediction

A prediction object

+
+
averagePrecision(prediction)
+
-

Value

+
+

Arguments

+
prediction
+

A prediction object

-

The average precision

-

Details

+
+
+

Value

+ +

The average precision

+
+
+

Details

Calculates the average precision from a predition object

+
+
-
- +
- - + + diff --git a/docs/reference/brierScore.html b/docs/reference/brierScore.html index d3606f9bf..678bf1efe 100644 --- a/docs/reference/brierScore.html +++ b/docs/reference/brierScore.html @@ -1,67 +1,12 @@ - - - - - - - -brierScore — brierScore • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -brierScore — brierScore • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,49 +131,48 @@

brierScore

brierScore

-
brierScore(prediction)
- -

Arguments

- - - - - - -
prediction

A prediction object

+
+
brierScore(prediction)
+
-

Value

+
+

Arguments

+
prediction
+

A prediction object

-

A list containing the brier score and the scaled brier score

-

Details

+
+
+

Value

+ +

A list containing the brier score and the scaled brier score

+
+
+

Details

Calculates the brierScore from prediction object

+
+
-
- +
- - + + diff --git a/docs/reference/bySumFf.html b/docs/reference/bySumFf.html deleted file mode 100644 index 87ca32289..000000000 --- a/docs/reference/bySumFf.html +++ /dev/null @@ -1,214 +0,0 @@ - - - - - - - - -Compute sum of values binned by a second variable — bySumFf • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

Compute sum of values binned by a second variable

- -
- -
bySumFf(values, bins)
- -

Arguments

- - - - - - - - - - -
values

An ff object containing the numeric values to be summed

bins

An ff object containing the numeric values to bin by

- - -

Examples

-
values <- ff::as.ff(c(1, 1, 2, 2, 1)) -bins <- ff::as.ff(c(1, 1, 1, 2, 2)) -bySumFf(values, bins)
#> bins sums -#> 1 1 4 -#> 2 2 3
-
-
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/calibrationLine.html b/docs/reference/calibrationLine.html index e0f6267d5..425cccd2b 100644 --- a/docs/reference/calibrationLine.html +++ b/docs/reference/calibrationLine.html @@ -1,67 +1,12 @@ - - - - - - - -calibrationLine — calibrationLine • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -calibrationLine — calibrationLine • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,50 +131,46 @@

calibrationLine

calibrationLine

-
calibrationLine(prediction, numberOfStrata = 10)
+
+
calibrationLine(prediction, numberOfStrata = 10)
+
+ +
+

Arguments

+
prediction
+

A prediction object

-

Arguments

- - - - - - - - - - -
prediction

A prediction object

numberOfStrata

The number of groups to split the prediction into

-

Details

+
numberOfStrata
+

The number of groups to split the prediction into

+
+
+

Details

Calculates the calibration from prediction object

+
+
-
- +
- - + + diff --git a/docs/reference/checkPlpInstallation.html b/docs/reference/checkPlpInstallation.html deleted file mode 100644 index c9d6dee27..000000000 --- a/docs/reference/checkPlpInstallation.html +++ /dev/null @@ -1,229 +0,0 @@ - - - - - - - - -Check PatientLevelPrediction and its dependencies are correctly installed — checkPlpInstallation • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Check PatientLevelPrediction and its dependencies are correctly installed

-
- -
checkPlpInstallation(connectionDetails = NULL, python = T)
- -

Arguments

- - - - - - - - - - -
connectionDetails

An R object of type
connectionDetails created using the function -createConnectionDetails in the DatabaseConnector package.

python

Whether to test the python models

- -

Details

- -

This function checks whether PatientLevelPrediction and its dependencies are correctly installed. This will -check the database connectivity, some models, and large data object -handling (ff).

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/checkffFolder.html b/docs/reference/checkffFolder.html deleted file mode 100644 index 72bfe06df..000000000 --- a/docs/reference/checkffFolder.html +++ /dev/null @@ -1,198 +0,0 @@ - - - - - - - - -Check if the fftempdir is writable — checkffFolder • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

Check if the fftempdir is writable

- -
- -
checkffFolder()
- -

Details

- -

This function checks whether the fftempdir is writable. -If not, it will ask the use to specify a writable folder.

- - -
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/clearffTempDir.html b/docs/reference/clearffTempDir.html deleted file mode 100644 index a742cfd80..000000000 --- a/docs/reference/clearffTempDir.html +++ /dev/null @@ -1,191 +0,0 @@ - - - - - - - - -clearffTempDir — clearffTempDir • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

Clears the temporary ff directory to free up disk space.

- -
- -
clearffTempDir()
- - -
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/combinePlpModelSettings.html b/docs/reference/combinePlpModelSettings.html deleted file mode 100644 index 471203a99..000000000 --- a/docs/reference/combinePlpModelSettings.html +++ /dev/null @@ -1,231 +0,0 @@ - - - - - - - - -combine two objects specifying multiple Plp model settings — combinePlpModelSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

combine two objects specifying multiple Plp model settings

-
- -
combinePlpModelSettings(plpModelSetting1, plpModelSetting2)
- -

Arguments

- - - - - - - - - - -
plpModelSetting1

A combination of model, covariate and population settings

plpModelSetting2

A combination of model, covariate and population settings

- -

Value

- -

A list containing a dataframe settingLookupTable containing all the model, covariate and popualtion combination details, -a list models containing all the model settings, a list covariateSettings containing all the covariate settings and a list -populationSettings containing all the population settings.

-

Details

- -

Takes two output of running createPlpModelSettings() and combined them

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/computeAuc.html b/docs/reference/computeAuc.html index 69cdd4b86..9c79d5ffb 100644 --- a/docs/reference/computeAuc.html +++ b/docs/reference/computeAuc.html @@ -1,67 +1,12 @@ - - - - - - - -Compute the area under the ROC curve — computeAuc • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Compute the area under the ROC curve — computeAuc • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,52 +131,48 @@

Compute the area under the ROC curve

Compute the area under the ROC curve

-
computeAuc(prediction, confidenceInterval = FALSE)
+
+
computeAuc(prediction, confidenceInterval = FALSE)
+
+ +
+

Arguments

+
prediction
+

A prediction object as generated using the +predict functions.

-

Arguments

- - - - - - - - - - -
prediction

A prediction object as generated using the -predict functions.

confidenceInterval

Should 95 percebt confidence intervals be computed?

-

Details

+
confidenceInterval
+

Should 95 percebt confidence intervals be computed?

+
+
+

Details

Computes the area under the ROC curve for the predicted probabilities, given the true observed outcomes.

+
+
-
- +
- - + + diff --git a/docs/reference/computeAucFromDataFrames.html b/docs/reference/computeAucFromDataFrames.html deleted file mode 100644 index 0cbce670f..000000000 --- a/docs/reference/computeAucFromDataFrames.html +++ /dev/null @@ -1,251 +0,0 @@ - - - - - - - - -Compute the area under the ROC curve — computeAucFromDataFrames • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Compute the area under the ROC curve

-
- -
computeAucFromDataFrames(
-  prediction,
-  status,
-  time = NULL,
-  confidenceInterval = FALSE,
-  timePoint,
-  modelType = "logistic"
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - -
prediction

A vector with the predicted hazard rate.

status

A vector with the status of 1 (event) or 0 (no event).

time

Only for survival models: a vector with the time to event or censor -(which ever comes first).

confidenceInterval

Should 95 percebt confidence intervals be computed?

timePoint

Only for survival models: time point when the AUC should be evaluated

modelType

Type of model. Currently supported are "logistic" and "survival".

- -

Details

- -

Computes the area under the ROC curve for the predicted probabilities, given the true observed -outcomes.

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/computeGridPerformance.html b/docs/reference/computeGridPerformance.html new file mode 100644 index 000000000..5865fa6ea --- /dev/null +++ b/docs/reference/computeGridPerformance.html @@ -0,0 +1,183 @@ + +Computes grid performance with a specified performance function — computeGridPerformance • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

Computes grid performance with a specified performance function

+
+ +
+
computeGridPerformance(prediction, param, performanceFunct = "computeAuc")
+
+ +
+

Arguments

+
prediction
+

a dataframe with predictions and outcomeCount per rowId

+ + +
param
+

a list of hyperparameters

+ + +
performanceFunct
+

a string specifying which performance function to use +. Default ``'compute_AUC'``

+ +
+
+

Value

+ + +

A list with overview of the performance

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/configurePython.html b/docs/reference/configurePython.html index 4ddcfe043..1abf63794 100644 --- a/docs/reference/configurePython.html +++ b/docs/reference/configurePython.html @@ -1,67 +1,12 @@ - - - - - - - -Sets up a virtual environment to use for PLP (can be conda or python) — configurePython • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Sets up a virtual environment to use for PLP (can be conda or python) — configurePython • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -183,51 +131,51 @@

Sets up a virtual environment to use for PLP (can be conda or python)

Sets up a virtual environment to use for PLP (can be conda or python)

-
configurePython(envname = "PLP", envtype = NULL)
+
+
configurePython(envname = "PLP", envtype = NULL, condaPythonVersion = "3.11")
+
+ +
+

Arguments

+
envname
+

A string for the name of the virtual environment (default is 'PLP')

-

Arguments

- - - - - - - - - - -
envname

A string for the name of the virtual environment (default is 'PLP')

envtype

An option for specifying the environment as'conda' or 'python'. If NULL then the default is 'conda' for windows users and 'python' for non-windows users

-

Details

+
envtype
+

An option for specifying the environment as'conda' or 'python'. If NULL then the default is 'conda' for windows users and 'python' for non-windows users

+ +
condaPythonVersion
+

String, Python version to use when creating a conda environment

+ +
+
+

Details

This function creates a virtual environment that can be used by PatientLevelPrediction and installs all the required package dependancies. If using python, pip must be set up.

+
+
-
- +
- - + + diff --git a/docs/reference/covariateSummary.html b/docs/reference/covariateSummary.html index 5d58c34fa..0ca6ee4c9 100644 --- a/docs/reference/covariateSummary.html +++ b/docs/reference/covariateSummary.html @@ -1,70 +1,15 @@ - - - - - - - -covariateSummary — covariateSummary • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -covariateSummary — covariateSummary • PatientLevelPrediction - - - - - - - - - - - + + - - -
-
- -
- -
+
@@ -189,81 +137,80 @@

covariateSummary

by train and test set

-
covariateSummary(
-  covariateData,
-  cohort,
-  labels = NULL,
-  strata = NULL,
-  variableImportance = NULL,
-  featureEngineering = NULL
-)
+
+
covariateSummary(
+  covariateData,
+  cohort,
+  labels = NULL,
+  strata = NULL,
+  variableImportance = NULL,
+  featureEngineering = NULL
+)
+
+ +
+

Arguments

+
covariateData
+

The covariateData part of the plpData that is +extracted using getPlpData

+ -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - -
covariateData

The covariateData part of the plpData that is -extracted using getPlpData

cohort

The patient cohort to calculate the summary

labels

A data.frame with the columns rowId and outcomeCount

strata

A data.frame containing the columns rowId, strataName

variableImportance

A data.frame with the columns covariateId and -value (the variable importance value)

featureEngineering

(currently not used ) +

cohort
+

The patient cohort to calculate the summary

+ + +
labels
+

A data.frame with the columns rowId and outcomeCount

+ + +
strata
+

A data.frame containing the columns rowId, strataName

+ + +
variableImportance
+

A data.frame with the columns covariateId and +value (the variable importance value)

+ + +
featureEngineering
+

(currently not used ) A function or list of functions specifying any feature engineering -to create covariates before summarising

+to create covariates before summarising

-

Value

+
+
+

Value

+ -

A data.frame containing: CovariateCount CovariateMean and CovariateStDev plus these values +

A data.frame containing: CovariateCount CovariateMean and CovariateStDev plus these values for any specified stratification

-

Details

- +
+
+

Details

The function calculates various metrics to measure the performance of the model

+
+
-
- +
- - + + diff --git a/docs/reference/createCohortCovariateSettings.html b/docs/reference/createCohortCovariateSettings.html index df10200a9..1f4bc8bc9 100644 --- a/docs/reference/createCohortCovariateSettings.html +++ b/docs/reference/createCohortCovariateSettings.html @@ -1,67 +1,12 @@ - - - - - - - -Extracts covariates based on cohorts — createCohortCovariateSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Extracts covariates based on cohorts — createCohortCovariateSettings • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,104 +131,103 @@

Extracts covariates based on cohorts

Extracts covariates based on cohorts

-
createCohortCovariateSettings(
-  cohortName,
-  settingId,
-  cohortDatabaseSchema,
-  cohortTable,
-  cohortId,
-  startDay = -30,
-  endDay = 0,
-  count = F,
-  ageInteraction = F,
-  lnAgeInteraction = F,
-  analysisId = 456
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
cohortName

Name for the cohort

settingId

A unique id for the covariate time and

cohortDatabaseSchema

The schema of the database with the cohort

cohortTable

the table name that contains the covariate cohort

cohortId

cohort id for the covariate cohort

startDay

The number of days prior to index to start observing the cohort

endDay

The number of days prior to index to stop observing the cohort

count

If FALSE the covariate value is binary (1 means cohort occurred between index+startDay and index+endDay, 0 means it did not) -If TRUE then the covariate value is the number of unique cohort_start_dates between index+startDay and index+endDay

ageInteraction

If TRUE multiple covariate value by the patient's age in years

lnAgeInteraction

If TRUE multiple covariate value by the log of the patient's age in years

analysisId

The analysisId for the covariate

- -

Value

- -

An object of class covariateSettings specifying how to create the cohort covariate with the covariateId - cohortId x 100000 + settingId x 1000 + analysisId

-

Details

+
+
createCohortCovariateSettings(
+  cohortName,
+  settingId,
+  cohortDatabaseSchema,
+  cohortTable,
+  cohortId,
+  startDay = -30,
+  endDay = 0,
+  count = F,
+  ageInteraction = F,
+  lnAgeInteraction = F,
+  analysisId = 456
+)
+
+ +
+

Arguments

+
cohortName
+

Name for the cohort

+ + +
settingId
+

A unique id for the covariate time and

+ + +
cohortDatabaseSchema
+

The schema of the database with the cohort

+ + +
cohortTable
+

the table name that contains the covariate cohort

+ + +
cohortId
+

cohort id for the covariate cohort

+ + +
startDay
+

The number of days prior to index to start observing the cohort

+ +
endDay
+

The number of days prior to index to stop observing the cohort

+ + +
count
+

If FALSE the covariate value is binary (1 means cohort occurred between index+startDay and index+endDay, 0 means it did not) +If TRUE then the covariate value is the number of unique cohort_start_dates between index+startDay and index+endDay

+ + +
ageInteraction
+

If TRUE multiple covariate value by the patient's age in years

+ + +
lnAgeInteraction
+

If TRUE multiple covariate value by the log of the patient's age in years

+ + +
analysisId
+

The analysisId for the covariate

+ +
+
+

Value

+ + +

An object of class covariateSettings specifying how to create the cohort covariate with the covariateId + cohortId x 100000 + settingId x 1000 + analysisId

+
+
+

Details

The user specifies a cohort and time period and then a covariate is constructed whether they are in the cohort during the time periods relative to target population cohort index

+
+
-
- +
- - + + diff --git a/docs/reference/createDatabaseDetails.html b/docs/reference/createDatabaseDetails.html index bac71f798..74b2f2d03 100644 --- a/docs/reference/createDatabaseDetails.html +++ b/docs/reference/createDatabaseDetails.html @@ -1,67 +1,12 @@ - - - - - - - -Create a setting that holds the details about the cdmDatabase connection for data extraction — createDatabaseDetails • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create a setting that holds the details about the cdmDatabase connection for data extraction — createDatabaseDetails • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -183,118 +131,126 @@

Create a setting that holds the details about the cdmDatabase connection for

Create a setting that holds the details about the cdmDatabase connection for data extraction

-
createDatabaseDetails(
-  connectionDetails,
-  cdmDatabaseSchema,
-  cdmDatabaseName,
-  tempEmulationSchema = cdmDatabaseSchema,
-  cohortDatabaseSchema = cdmDatabaseSchema,
-  cohortTable = "cohort",
-  outcomeDatabaseSchema = cdmDatabaseSchema,
-  outcomeTable = "cohort",
-  cohortId = NULL,
-  outcomeIds = NULL,
-  cdmVersion = 5
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
connectionDetails

An R object of type connectionDetails created using the +

+
createDatabaseDetails(
+  connectionDetails,
+  cdmDatabaseSchema,
+  cdmDatabaseName,
+  cdmDatabaseId,
+  tempEmulationSchema = cdmDatabaseSchema,
+  cohortDatabaseSchema = cdmDatabaseSchema,
+  cohortTable = "cohort",
+  outcomeDatabaseSchema = cdmDatabaseSchema,
+  outcomeTable = "cohort",
+  targetId = NULL,
+  outcomeIds = NULL,
+  cdmVersion = 5,
+  cohortId = NULL
+)
+
+ +
+

Arguments

+
connectionDetails
+

An R object of type connectionDetails created using the function createConnectionDetails in the -DatabaseConnector package.

cdmDatabaseSchema

The name of the database schema that contains the OMOP CDM +DatabaseConnector package.

+ + +
cdmDatabaseSchema
+

The name of the database schema that contains the OMOP CDM instance. Requires read permissions to this database. On SQL Server, this should specifiy both the database and the schema, -so for example 'cdm_instance.dbo'.

cdmDatabaseName

A string with a shareable name of the database (this will be shown to OHDSI researchers if the results get transported)

tempEmulationSchema

For dmbs like Oracle only: the name of the database schema where you +so for example 'cdm_instance.dbo'.

+ + +
cdmDatabaseName
+

A string with the name of the database - this is used in the shiny app and when externally validating models to name the result list and to specify the folder name when saving validation results (defaults to cdmDatabaseSchema if not specified)

+ + +
cdmDatabaseId
+

A string with a unique identifier for the database and version - this is stored in the plp object for future reference and used by the shiny app (defaults to cdmDatabaseSchema if not specified)

+ + +
tempEmulationSchema
+

For dmbs like Oracle only: the name of the database schema where you want all temporary tables to be managed. Requires -create/insert permissions to this database.

cohortDatabaseSchema

The name of the database schema that is the location where the +create/insert permissions to this database.

+ + +
cohortDatabaseSchema
+

The name of the database schema that is the location where the target cohorts are available. Requires read -permissions to this database.

cohortTable

The tablename that contains the target cohorts. Expectation is cohortTable +permissions to this database.

+ + +
cohortTable
+

The tablename that contains the target cohorts. Expectation is cohortTable has format of COHORT table: COHORT_DEFINITION_ID, SUBJECT_ID, -COHORT_START_DATE, COHORT_END_DATE.

outcomeDatabaseSchema

The name of the database schema that is the location where the +COHORT_START_DATE, COHORT_END_DATE.

+ + +
outcomeDatabaseSchema
+

The name of the database schema that is the location where the data used to define the outcome cohorts is available. Requires read permissions to -this database.

outcomeTable

The tablename that contains the outcome cohorts. Expectation is +this database.

+ + +
outcomeTable
+

The tablename that contains the outcome cohorts. Expectation is outcomeTable has format of COHORT table: COHORT_DEFINITION_ID, -SUBJECT_ID, COHORT_START_DATE, COHORT_END_DATE.

cohortId

An integer specifying the cohort id for the target cohort

outcomeIds

A single integer or vector of integers specifying the cohort ids for the outcome cohorts

cdmVersion

Define the OMOP CDM version used: currently support "4" and -"5".

- -

Value

- -

A list with the the database specific settings (this is used by the runMultiplePlp function and the skeleton packages)

-

Details

+SUBJECT_ID, COHORT_START_DATE, COHORT_END_DATE.

+ + +
targetId
+

An integer specifying the cohort id for the target cohort

+ + +
outcomeIds
+

A single integer or vector of integers specifying the cohort ids for the outcome cohorts

+ +
cdmVersion
+

Define the OMOP CDM version used: currently support "4" and "5".

+ + +
cohortId
+

(depreciated: use targetId) old input for the target cohort id

+ +
+
+

Value

+ + +

A list with the the database specific settings (this is used by the runMultiplePlp function and the skeleton packages)

+
+
+

Details

This function simply stores the settings for communicating with the cdmDatabase when extracting the target cohort and outcomes

+
+
-
- +
- - + + diff --git a/docs/reference/createDatabaseList.html b/docs/reference/createDatabaseList.html new file mode 100644 index 000000000..47e5f277b --- /dev/null +++ b/docs/reference/createDatabaseList.html @@ -0,0 +1,186 @@ + +Create a list with the database details and database meta data entries — createDatabaseList • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

This function creates a list with the database details and database meta data entries used in the study

+
+ +
+
createDatabaseList(cdmDatabaseSchemas, cdmDatabaseNames, databaseRefIds = NULL)
+
+ +
+

Arguments

+
cdmDatabaseSchemas
+

(string vector) A vector of the cdmDatabaseSchemas used in the study - if the schemas are not unique per database please also specify databaseRefId

+ + +
cdmDatabaseNames
+

Sharable names for the databases

+ + +
databaseRefIds
+

(string vector) Unique database identifiers - what you specified as cdmDatabaseId in PatientLevelPrediction::createDatabaseDetails() when developing the models

+ +
+
+

Value

+ + +

Returns a data.frame with the database details

+
+
+

Details

+

This function is used when inserting database details into the PatientLevelPrediction database results schema

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/createDatabaseSchemaSettings.html b/docs/reference/createDatabaseSchemaSettings.html new file mode 100644 index 000000000..d5486f118 --- /dev/null +++ b/docs/reference/createDatabaseSchemaSettings.html @@ -0,0 +1,215 @@ + +Create the PatientLevelPrediction database result schema settings — createDatabaseSchemaSettings • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

This function specifies where the results schema is and lets you pick a different schema for the cohorts and databases

+
+ +
+
createDatabaseSchemaSettings(
+  resultSchema = "main",
+  tablePrefix = "",
+  targetDialect = "sqlite",
+  tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
+  cohortDefinitionSchema = resultSchema,
+  tablePrefixCohortDefinitionTables = tablePrefix,
+  databaseDefinitionSchema = resultSchema,
+  tablePrefixDatabaseDefinitionTables = tablePrefix
+)
+
+ +
+

Arguments

+
resultSchema
+

(string) The name of the database schema with the result tables.

+ + +
tablePrefix
+

(string) A string that appends to the PatientLevelPrediction result tables

+ + +
targetDialect
+

(string) The database management system being used

+ + +
tempEmulationSchema
+

(string) The temp schema used when the database management system is oracle

+ + +
cohortDefinitionSchema
+

(string) The name of the database schema with the cohort definition tables (defaults to resultSchema).

+ + +
tablePrefixCohortDefinitionTables
+

(string) A string that appends to the cohort definition tables

+ + +
databaseDefinitionSchema
+

(string) The name of the database schema with the database definition tables (defaults to resultSchema).

+ + +
tablePrefixDatabaseDefinitionTables
+

(string) A string that appends to the database definition tables

+ +
+
+

Value

+ + +

Returns a list of class 'plpDatabaseResultSchema' with all the database settings

+
+
+

Details

+

This function can be used to specify the database settings used to upload PatientLevelPrediction results into a database

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/createDefaultExecuteSettings.html b/docs/reference/createDefaultExecuteSettings.html index 0b51578f2..e4e409d17 100644 --- a/docs/reference/createDefaultExecuteSettings.html +++ b/docs/reference/createDefaultExecuteSettings.html @@ -1,67 +1,12 @@ - - - - - - - -Creates default list of settings specifying what parts of runPlp to execute — createDefaultExecuteSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Creates default list of settings specifying what parts of runPlp to execute — createDefaultExecuteSettings • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -183,41 +131,42 @@

Creates default list of settings specifying what parts of runPlp to execute<

Creates default list of settings specifying what parts of runPlp to execute

-
createDefaultExecuteSettings()
- - -

Value

+
+
createDefaultExecuteSettings()
+
-

list with TRUE for split, preprocess, model development and covariate summary

-

Details

+
+

Value

+ +

list with TRUE for split, preprocess, model development and covariate summary

+
+
+

Details

runs split, preprocess, model development and covariate summary

+
+
-
- +
- - + + diff --git a/docs/reference/createDefaultSplitSetting.html b/docs/reference/createDefaultSplitSetting.html index 3e9229228..f92e40aee 100644 --- a/docs/reference/createDefaultSplitSetting.html +++ b/docs/reference/createDefaultSplitSetting.html @@ -1,70 +1,15 @@ - - - - - - - -Create the settings for defining how the plpData are split into test/validation/train sets using -default splitting functions (either random stratified by outcome, time or subject splitting). — createDefaultSplitSetting • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create the settings for defining how the plpData are split into test/validation/train sets using +default splitting functions (either random stratified by outcome, time or subject splitting) — createDefaultSplitSetting • PatientLevelPrediction - + + - - - -
-
- -
- -
+

+ Source: R/DataSplitting.R

Create the settings for defining how the plpData are split into test/validation/train sets using -default splitting functions (either random stratified by outcome, time or subject splitting).

+default splitting functions (either random stratified by outcome, time or subject splitting)

+
+ +
+
createDefaultSplitSetting(
+  testFraction = 0.25,
+  trainFraction = 0.75,
+  splitSeed = sample(1e+05, 1),
+  nfold = 3,
+  type = "stratified"
+)
-
createDefaultSplitSetting(
-  testFraction = 0.25,
-  trainFraction = 0.75,
-  splitSeed = sample(1e+05, 1),
-  nfold = 3,
-  type = "stratified"
-)
+
+

Arguments

+
testFraction
+

(numeric) A real number between 0 and 1 indicating the test set fraction of the data

+ + +
trainFraction
+

(numeric) A real number between 0 and 1 indicating the train set fraction of the data. +If not set train is equal to 1 - test

+ + +
splitSeed
+

(numeric) A seed to use when splitting the data for reproducibility (if not set a random number will be generated)

+ + +
nfold
+

(numeric) An integer > 1 specifying the number of folds used in cross validation

-

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
testFraction

(numeric) A real number between 0 and 1 indicating the test set fraction of the data

trainFraction

(numeric) A real number between 0 and 1 indicating the train set fraction of the data. -If not set train is equal to 1 - test

splitSeed

(numeric) A seed to use when splitting the data for reproducibility (if not set a random number will be generated)

nfold

(numeric) An integer > 1 specifying the number of folds used in cross validation

type

(character) Choice of:

    -
  • 'stratified' Each data point is randomly assigned into the test or a train fold set but this is done stratified such that the outcome rate is consistent in each partition

  • -
  • 'time') Older data are assigned into the training set and newer data are assigned into the test set

  • + +
    type
    +

    (character) Choice of:

    • 'stratified' Each data point is randomly assigned into the test or a train fold set but this is done stratified such that the outcome rate is consistent in each partition

    • +
    • 'time' Older data are assigned into the training set and newer data are assigned into the test set

    • 'subject' Data are partitioned by subject, if a subject is in the data more than once, all the data points for the subject are assigned either into the test data or into the train data (not both).

    • -
+ + +
+
+

Value

+ -

Value

+

An object of class splitSettings

-

An object of class splitSettings

-

Details

+
+
+

Details

Returns an object of class splitSettings that specifies the splitting function that will be called and the settings

+
+
-
- +
- - + + diff --git a/docs/reference/createEnsemble.html b/docs/reference/createEnsemble.html deleted file mode 100644 index 6b66f293b..000000000 --- a/docs/reference/createEnsemble.html +++ /dev/null @@ -1,227 +0,0 @@ - - - - - - - - -Combine models into an Ensemble — createEnsemble • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Combine models into an Ensemble

-
- -
createEnsemble(runPlpList, weighted = F, weights = NULL)
- -

Arguments

- - - - - - - - - - - - - - -
runPlpList

The runPlp results for the different models to combine

weighted

If F then mean across models is used, if T must input weights or AUC weighting is used

weights

A vector of length(runPlpList) with the weights to assign each model

- - -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/createExecuteSettings.html b/docs/reference/createExecuteSettings.html index 02209dc22..dfb91aeed 100644 --- a/docs/reference/createExecuteSettings.html +++ b/docs/reference/createExecuteSettings.html @@ -1,67 +1,12 @@ - - - - - - - -Creates list of settings specifying what parts of runPlp to execute — createExecuteSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Creates list of settings specifying what parts of runPlp to execute — createExecuteSettings • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,76 +131,75 @@

Creates list of settings specifying what parts of runPlp to execute

Creates list of settings specifying what parts of runPlp to execute

-
createExecuteSettings(
-  runSplitData = F,
-  runSampleData = F,
-  runfeatureEngineering = F,
-  runPreprocessData = F,
-  runModelDevelopment = F,
-  runCovariateSummary = F
-)
+
+
createExecuteSettings(
+  runSplitData = F,
+  runSampleData = F,
+  runfeatureEngineering = F,
+  runPreprocessData = F,
+  runModelDevelopment = F,
+  runCovariateSummary = F
+)
+
+ +
+

Arguments

+
runSplitData
+

TRUE or FALSE whether to split data into train/test

+ + +
runSampleData
+

TRUE or FALSE whether to over or under sample

+ + +
runfeatureEngineering
+

TRUE or FALSE whether to do feature engineering

+ + +
runPreprocessData
+

TRUE or FALSE whether to do preprocessing

-

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - -
runSplitData

TRUE or FALSE whether to split data into train/test

runSampleData

TRUE or FALSE whether to over or under sample

runfeatureEngineering

TRUE or FALSE whether to do feature engineering

runPreprocessData

TRUE or FALSE whether to do preprocessing

runModelDevelopment

TRUE or FALSE whether to develop the model

runCovariateSummary

TRUE or FALSE whether to create covariate summary

-

Value

+
runModelDevelopment
+

TRUE or FALSE whether to develop the model

-

list with TRUE/FALSE for each part of runPlp

-

Details

+
runCovariateSummary
+

TRUE or FALSE whether to create covariate summary

+ +
+
+

Value

+ + +

list with TRUE/FALSE for each part of runPlp

+
+
+

Details

define what parts of runPlp to execute

+
+
-
- +
- - + + diff --git a/docs/reference/createExistingModelSql.html b/docs/reference/createExistingModelSql.html deleted file mode 100644 index 786f006d3..000000000 --- a/docs/reference/createExistingModelSql.html +++ /dev/null @@ -1,249 +0,0 @@ - - - - - - - - -Apply an existing logistic regression prediction model — createExistingModelSql • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

Apply an existing logistic regression prediction model

- -
- -
createExistingModelSql(modelTable, modelNames, interceptTable, covariateTable,
-  type = "logistic", analysisId = 112, covariateSettings, asFunctions = F,
-  customCovariates = NULL, e = environment(), covariateValues = F)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
modelTable

A dataframe or list of dataframes with columns: modelId, modelCovariateId, coefficientValue all doubles

modelNames

A name used in the covariate function names (no spaces)

interceptTable

A dataframe or list of dataframes with the columns: modelId, interceptValue

covariateTable

A dataframe or list of dataframes with columns: modelCovariateId, covariateId (the mapping of covariate_id to standard covariates)

type

The type of model: logistic or linear/score

analysisId

The covariate analysis_id (default 112)

covariateSettings

The settings for the standard covariates (needs for temporal settings)

asFunctions

If T then return two functions

customCovariates

enables custome SQL to be used to create custom covariates

e

The environment to output the covariate setting functions to

covariateValues

boolean Whether to also download the covariates that make up the risk score

- -

Details

- -

This function is used to create custom covariates corresponding to existing models

- - -
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/createFeatureEngineeringSettings.html b/docs/reference/createFeatureEngineeringSettings.html index 4a9ef8b11..18defa02b 100644 --- a/docs/reference/createFeatureEngineeringSettings.html +++ b/docs/reference/createFeatureEngineeringSettings.html @@ -1,67 +1,12 @@ - - - - - - - -Create the settings for defining any feature engineering that will be done — createFeatureEngineeringSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create the settings for defining any feature engineering that will be done — createFeatureEngineeringSettings • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,51 +131,51 @@

Create the settings for defining any feature engineering that will be doneCreate the settings for defining any feature engineering that will be done

-
createFeatureEngineeringSettings(type = "none")
+
+
createFeatureEngineeringSettings(type = "none")
+
-

Arguments

- - - - - - -
type

(character) Choice of:

    -
  • 'none' No feature engineering - this is the default

  • -
+
+

Arguments

+
type
+

(character) Choice of:

  • 'none' No feature engineering - this is the default

  • +
-

Value

+
+
+

Value

+ -

An object of class featureEngineeringSettings

-

Details

+

An object of class featureEngineeringSettings

+ +
+
+

Details

Returns an object of class featureEngineeringSettings that specifies the sampling function that will be called and the settings

+
+
-
- +
- - + + diff --git a/docs/reference/createLearningCurve.html b/docs/reference/createLearningCurve.html index ef6cc8335..f7e442e2f 100644 --- a/docs/reference/createLearningCurve.html +++ b/docs/reference/createLearningCurve.html @@ -1,68 +1,13 @@ - - - - - - - -createLearningCurve — createLearningCurve • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -createLearningCurve — createLearningCurve • PatientLevelPrediction - + + - - - -
-
- -
- -
+

Creates a learning curve object, which can be plotted using the - plotLearningCurve() function.

+ plotLearningCurve() function.

+
+ +
+
createLearningCurve(
+  plpData,
+  outcomeId,
+  parallel = T,
+  cores = 4,
+  modelSettings,
+  saveDirectory = getwd(),
+  analysisId = "learningCurve",
+  populationSettings = createStudyPopulationSettings(),
+  splitSettings = createDefaultSplitSetting(),
+  trainFractions = c(0.25, 0.5, 0.75),
+  trainEvents = NULL,
+  sampleSettings = createSampleSettings(),
+  featureEngineeringSettings = createFeatureEngineeringSettings(),
+  preprocessSettings = createPreprocessSettings(minFraction = 0.001, normalize = T),
+  logSettings = createLogSettings(),
+  executeSettings = createExecuteSettings(runSplitData = T, runSampleData = F,
+    runfeatureEngineering = F, runPreprocessData = T, runModelDevelopment = T,
+    runCovariateSummary = F)
+)
-
createLearningCurve(
-  plpData,
-  outcomeId,
-  parallel = T,
-  cores = 4,
-  modelSettings,
-  saveDirectory = getwd(),
-  analysisId = "learningCurve",
-  populationSettings = createStudyPopulationSettings(),
-  splitSettings = createDefaultSplitSetting(),
-  trainFractions = c(0.25, 0.5, 0.75),
-  trainEvents = c(500, 1000, 1500),
-  sampleSettings = createSampleSettings(),
-  featureEngineeringSettings = createFeatureEngineeringSettings(),
-  preprocessSettings = createPreprocessSettings(minFraction = 0.001, normalize = T),
-  logSettings = createLogSettings(),
-  executeSettings = createExecuteSettings(runSplitData = T, runSampleData = F,
-    runfeatureEngineering = F, runPreprocessData = T, runModelDevelopment = T,
-    runCovariateSummary = F)
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
plpData

An object of type plpData - the patient level prediction -data extracted from the CDM.

outcomeId

(integer) The ID of the outcome.

parallel

Whether to run the code in parallel

cores

The number of computer cores to use if running in parallel

modelSettings

An object of class modelSettings created using one of the function:

    -
  • setLassoLogisticRegression() A lasso logistic regression model

  • -
  • setGradientBoostingMachine() A gradient boosting machine

  • -
  • setAdaBoost() An ada boost model

  • -
  • setRandomForest() A random forest model

  • -
  • setDecisionTree() A decision tree model

  • -
  • setCovNN()) A convolutional neural network model

  • -
  • setCIReNN() A recurrent neural network model

  • -
  • setMLP() A neural network model

  • -
  • setDeepNN() A deep neural network model

  • -
  • setKNN() A KNN model

  • -
saveDirectory

The path to the directory where the results will be saved (if NULL uses working directory)

analysisId

(integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.

populationSettings

An object of type populationSettings created using createStudyPopulationSettings that +

+

Arguments

+
plpData
+

An object of type plpData - the patient level prediction +data extracted from the CDM.

+ + +
outcomeId
+

(integer) The ID of the outcome.

+ + +
parallel
+

Whether to run the code in parallel

+ + +
cores
+

The number of computer cores to use if running in parallel

+ + +
modelSettings
+

An object of class modelSettings created using one of the function:

+ + +
saveDirectory
+

The path to the directory where the results will be saved (if NULL uses working directory)

+ + +
analysisId
+

(integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.

+ + +
populationSettings
+

An object of type populationSettings created using createStudyPopulationSettings that specifies how the data class labels are defined and addition any exclusions to apply to the -plpData cohort

splitSettings

An object of type splitSettings that specifies how to split the data into train/validation/test. -The default settings can be created using createDefaultSplitSetting.

trainFractions

A list of training fractions to create models for. +plpData cohort

+ + +
splitSettings
+

An object of type splitSettings that specifies how to split the data into train/validation/test. +The default settings can be created using createDefaultSplitSetting.

+ + +
trainFractions
+

A list of training fractions to create models for. Note, providing trainEvents will override your input to -trainFractions.

trainEvents

Events have shown to be determinant of model performance. +trainFractions.

+ + +
trainEvents
+

Events have shown to be determinant of model performance. Therefore, it is recommended to provide trainEvents rather than trainFractions. Note, providing trainEvents will override -your input to trainFractions. The format should be as follows:

sampleSettings

An object of type sampleSettings that specifies any under/over sampling to be done. -The default is none.

featureEngineeringSettings

An object of featureEngineeringSettings specifying any feature engineering to be learned (using the train data)

preprocessSettings

An object of preprocessSettings. This setting specifies the minimum fraction of +your input to trainFractions. The format should be as follows:

  • c(500, 1000, 1500) - a list of training events

  • +
+ + +
sampleSettings
+

An object of type sampleSettings that specifies any under/over sampling to be done. +The default is none.

+ + +
featureEngineeringSettings
+

An object of featureEngineeringSettings specifying any feature engineering to be learned (using the train data)

+ + +
preprocessSettings
+

An object of preprocessSettings. This setting specifies the minimum fraction of target population who must have a covariate for it to be included in the model training -and whether to normalise the covariates before training

logSettings

An object of logSettings created using createLogSettings -specifying how the logging is done

executeSettings

An object of executeSettings specifying which parts of the analysis to run

- -

Value

- -

A learning curve object containing the various performance measures - obtained by the model for each training set fraction. It can be plotted - using plotLearningCurve.

+and whether to normalise the covariates before training

+ -

Examples

-
if (FALSE) { -# define model -modelSettings = PatientLevelPrediction::setLassoLogisticRegression() +
logSettings
+

An object of logSettings created using createLogSettings +specifying how the logging is done

-# create learning curve -learningCurve <- PatientLevelPrediction::createLearningCurve(population, - plpData, - modelSettings) -# plot learning curve -PatientLevelPrediction::plotLearningCurve(learningCurve) -} -
+
executeSettings
+

An object of executeSettings specifying which parts of the analysis to run

+ +
+
+

Value

+ + +

A learning curve object containing the various performance measures + obtained by the model for each training set fraction. It can be plotted + using plotLearningCurve.

+
+ +
+

Examples

+
if (FALSE) {
+# define model
+modelSettings = PatientLevelPrediction::setLassoLogisticRegression()
+
+# create learning curve
+learningCurve <- PatientLevelPrediction::createLearningCurve(population,
+                                                             plpData,
+                                                             modelSettings)
+# plot learning curve
+PatientLevelPrediction::plotLearningCurve(learningCurve)
+}
+
+
+
+
-
- +
- - + + diff --git a/docs/reference/createLearningCurvePar.html b/docs/reference/createLearningCurvePar.html deleted file mode 100644 index bf980a2e2..000000000 --- a/docs/reference/createLearningCurvePar.html +++ /dev/null @@ -1,382 +0,0 @@ - - - - - - - - -createLearningCurvePar — createLearningCurvePar • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Creates a learning curve in parallel, which can be plotted using - the plotLearningCurve() function. Currently this functionality is - only supported by Lasso Logistic Regression.

-
- -
createLearningCurvePar(
-  population,
-  plpData,
-  modelSettings,
-  testSplit = "stratified",
-  testFraction = 0.25,
-  trainFractions = c(0.25, 0.5, 0.75),
-  trainEvents = NULL,
-  splitSeed = NULL,
-  nfold = 3,
-  indexes = NULL,
-  verbosity = "TRACE",
-  minCovariateFraction = 0.001,
-  normalizeData = T,
-  saveDirectory = getwd(),
-  savePlpData = F,
-  savePlpResult = F,
-  savePlpPlots = F,
-  saveEvaluation = F,
-  timeStamp = FALSE,
-  analysisId = "lc-",
-  cores = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
population

The population created using createStudyPopulation() -that will be used to develop the model.

plpData

An object of type plpData - the patient level -prediction data extracted from the CDM.

modelSettings

An object of class modelSettings created using -one of the function. Currently only one model is supported:

    -
  • setLassoLogisticRegression - a lasso logistic regression - model

  • -
testSplit

Specifies the type of evaluation used. Can be either -'person' or 'time'. The value 'time' finds the date -that splots the population into the testing and training fractions -provided. Patients with an index after this date are assigned to the test -set and patients with an index prior to this date are assigned to the -training set. The value 'person' splits the data randomly into -testing and training sets according to fractions provided. The split is -stratified by the class label.

testFraction

The fraction of the data, which will be used as the -testing set in the patient split evaluation.

trainFractions

A list of training fractions to create models for. -Note, providing trainEvents will override your input to -trainFractions.

trainEvents

Events have shown to be determinant of model performance. -Therefore, it is recommended to provide trainEvents rather than -trainFractions. Note, providing trainEvents will override -your input to trainFractions. The format should be as follows:

splitSeed

The seed used to split the testing and training set when -using a 'person' type split

nfold

The number of folds used in the cross validation (default = -3).

indexes

A dataframe containing a rowId and index column where the -index value of -1 means in the test set, and positive integer represents -the cross validation fold (default is NULL).

verbosity

Sets the level of the verbosity. If the log level is at or -higher in priority than the logger threshold, a message will print. The -levels are:

    -
  • DEBUG - highest verbosity showing all debug statements

  • -
  • TRACE - showing information about start and end of steps

  • -
  • INFO - show informative messages (default)

  • -
  • WARN - show warning messages

  • -
  • ERROR - show error messages

  • -
  • FATAL - be silent except for fatal errors

  • -
minCovariateFraction

Minimum covariate prevalence in population to -avoid removal during preprocssing.

normalizeData

Whether to normalise the data

saveDirectory

Location to save log and results

savePlpData

Whether to save the plpData

savePlpResult

Whether to save the plpResult

savePlpPlots

Whether to save the plp plots

saveEvaluation

Whether to save the plp performance csv files

timeStamp

Include a timestamp in the log

analysisId

The analysis unique identifier

cores

The number of cores to use

- -

Value

- -

A learning curve object containing the various performance measures - obtained by the model for each training set fraction. It can be plotted - using plotLearningCurve.

- -

Examples

-
if (FALSE) { -# define model -modelSettings = setLassoLogisticRegression() - -# register parallel backend -registerParallelBackend() - -# create learning curve -learningCurve <- createLearningCurvePar(population, - plpData, - modelSettings) -# plot learning curve -plotLearningCurve(learningCurve) -} - -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/createLogSettings.html b/docs/reference/createLogSettings.html index a37373f1c..05c7e8c73 100644 --- a/docs/reference/createLogSettings.html +++ b/docs/reference/createLogSettings.html @@ -1,67 +1,12 @@ - - - - - - - -Create the settings for logging the progression of the analysis — createLogSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create the settings for logging the progression of the analysis — createLogSettings • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,64 +131,64 @@

Create the settings for logging the progression of the analysis

Create the settings for logging the progression of the analysis

-
createLogSettings(verbosity = "DEBUG", timeStamp = T, logName = "runPlp Log")
- -

Arguments

- - - - - - - - - - - - - - -
verbosity

Sets the level of the verbosity. If the log level is at or higher in priority than the logger threshold, a message will print. The levels are:

    -
  • DEBUGHighest verbosity showing all debug statements

  • -
  • TRACEShowing information about start and end of steps

  • -
  • INFOShow informative information (Default)

  • -
  • WARNShow warning messages

  • -
  • ERRORShow error messages

  • -
  • FATALBe silent except for fatal errors

  • -
timeStamp

If TRUE a timestamp will be added to each logging statement. Automatically switched on for TRACE level.

logName

A string reference for the logger

- -

Value

- -

An object of class logSettings

-

Details

+
+
createLogSettings(verbosity = "DEBUG", timeStamp = T, logName = "runPlp Log")
+
+
+

Arguments

+
verbosity
+

Sets the level of the verbosity. If the log level is at or higher in priority than the logger threshold, a message will print. The levels are:

  • DEBUG Highest verbosity showing all debug statements

  • +
  • TRACE Showing information about start and end of steps

  • +
  • INFO Show informative information (Default)

  • +
  • WARN Show warning messages

  • +
  • ERROR Show error messages

  • +
  • FATAL Be silent except for fatal errors

  • +
+ + +
timeStamp
+

If TRUE a timestamp will be added to each logging statement. Automatically switched on for TRACE level.

+ + +
logName
+

A string reference for the logger

+ +
+
+

Value

+ + +

An object of class logSettings

+ + +
+
+

Details

Returns an object of class logSettings that specifies the logger settings

+
+
-
- +
- - + + diff --git a/docs/reference/createLrSql.html b/docs/reference/createLrSql.html deleted file mode 100644 index 754439f3c..000000000 --- a/docs/reference/createLrSql.html +++ /dev/null @@ -1,255 +0,0 @@ - - - - - - - - -Convert logistic regression model to sql code... — createLrSql • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Convert logistic regression model to sql code...

-
- -
createLrSql(
-  models,
-  modelNames,
-  covariateConstructionName = "prediction",
-  modelTable = "#model_table",
-  analysisId = 111,
-  e = environment(),
-  databaseOutput = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
models

A trianed plp model.

modelNames

A name used in the covariate function names (no spaces)

covariateConstructionName

the name used for the create covariate function

modelTable

The temporary table name storing the model details

analysisId

The covariate analysis_id

e

The environment to output the covariate setting functions to

databaseOutput

If you want to output to go inot a cohort table add the "database.schema.tablename" here

- -

Details

- -

This function is used to create custom covariates for a logistic regression model -(currently only supports, demographics/conditions/drug/procedures/observations and measurement concepts)

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/createModelDesign.html b/docs/reference/createModelDesign.html index 1277b3cb9..f1fdfafec 100644 --- a/docs/reference/createModelDesign.html +++ b/docs/reference/createModelDesign.html @@ -1,67 +1,12 @@ - - - - - - - -Specify settings for deceloping a single model — createModelDesign • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Specify settings for deceloping a single model — createModelDesign • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -183,96 +131,101 @@

Specify settings for deceloping a single model

Specify settings for deceloping a single model

-
createModelDesign(
-  targetId,
-  outcomeId,
-  restrictPlpDataSettings = createRestrictPlpDataSettings(),
-  populationSettings = createStudyPopulationSettings(),
-  covariateSettings = FeatureExtraction::createDefaultCovariateSettings(),
-  featureEngineeringSettings = NULL,
-  sampleSettings = NULL,
-  preprocessSettings = NULL,
-  modelSettings = NULL,
-  runCovariateSummary = T
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
targetId

The id of the target cohort that will be used for data extraction (e.g., the ATLAS id)

outcomeId

The id of the outcome that will be used for data extraction (e.g., the ATLAS id)

restrictPlpDataSettings

The settings specifying the extra restriction settings when extracting the data created using createRestrictPlpDataSettings().

populationSettings

The population settings specified by createStudyPopulationSettings()

covariateSettings

The covariate settings, this can be a list or a single 'covariateSetting' object.

featureEngineeringSettings

Either NULL or an object of class featureEngineeringSettings specifying any feature engineering used during model development

sampleSettings

Either NULL or an object of class sampleSettings with the over/under sampling settings used for model development

preprocessSettings

Either NULL or an object of class preprocessSettings created using createPreprocessingSettings()

modelSettings

The model settings such as setLassoLogisticRegression()

runCovariateSummary

Whether to run the covariateSummary

- -

Value

- -

A list with analysis settings used to develop a single prediction model

-

Details

+
+
createModelDesign(
+  targetId,
+  outcomeId,
+  restrictPlpDataSettings = createRestrictPlpDataSettings(),
+  populationSettings = createStudyPopulationSettings(),
+  covariateSettings = FeatureExtraction::createDefaultCovariateSettings(),
+  featureEngineeringSettings = NULL,
+  sampleSettings = NULL,
+  preprocessSettings = NULL,
+  modelSettings = NULL,
+  splitSettings = createDefaultSplitSetting(type = "stratified", testFraction = 0.25,
+    trainFraction = 0.75, splitSeed = 123, nfold = 3),
+  runCovariateSummary = T
+)
+
+ +
+

Arguments

+
targetId
+

The id of the target cohort that will be used for data extraction (e.g., the ATLAS id)

+ + +
outcomeId
+

The id of the outcome that will be used for data extraction (e.g., the ATLAS id)

+ + +
restrictPlpDataSettings
+

The settings specifying the extra restriction settings when extracting the data created using createRestrictPlpDataSettings().

+ + +
populationSettings
+

The population settings specified by createStudyPopulationSettings()

+ + +
covariateSettings
+

The covariate settings, this can be a list or a single 'covariateSetting' object.

+ +
featureEngineeringSettings
+

Either NULL or an object of class featureEngineeringSettings specifying any feature engineering used during model development

+ + +
sampleSettings
+

Either NULL or an object of class sampleSettings with the over/under sampling settings used for model development

+ + +
preprocessSettings
+

Either NULL or an object of class preprocessSettings created using createPreprocessingSettings()

+ + +
modelSettings
+

The model settings such as setLassoLogisticRegression()

+ + +
splitSettings
+

The train/validation/test splitting used by all analyses created using createDefaultSplitSetting()

+ + +
runCovariateSummary
+

Whether to run the covariateSummary

+ +
+
+

Value

+ + +

A list with analysis settings used to develop a single prediction model

+
+
+

Details

This specifies a single analysis for developing as single model

+
+
-
- +

- - + + diff --git a/docs/reference/createPlpJournalDocument.html b/docs/reference/createPlpJournalDocument.html deleted file mode 100644 index 7639d076c..000000000 --- a/docs/reference/createPlpJournalDocument.html +++ /dev/null @@ -1,287 +0,0 @@ - - - - - - - - -createPlpJournalDocument — createPlpJournalDocument • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Creates a template for a prediction journal paper with the characteristics/results filled in

-
- -
createPlpJournalDocument(
-  plpResult = NULL,
-  plpValidation = NULL,
-  plpData = NULL,
-  targetName = "<target population>",
-  outcomeName = "<outcome>",
-  table1 = F,
-  connectionDetails = NULL,
-  includeTrain = FALSE,
-  includeTest = TRUE,
-  includePredictionPicture = TRUE,
-  includeAttritionPlot = TRUE,
-  outputLocation = file.path(getwd(), "plp_journal_document.docx"),
-  save = T
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
plpResult

An object of type plpResult returned by running runPlp()

plpValidation

An object of type validatePlp returned by running externalValidatePlp()

plpData

The plpData

targetName

A string with the target description name

outcomeName

A string with the outcome description name

table1

Whether to include table1 (characteristics)

connectionDetails

The connection required to calcualte the characteristics

includeTrain

Whether to include the train set performance

includeTest

Whether to include the test set performance

includePredictionPicture

Whether to include a picture detailing the prediction problem

includeAttritionPlot

Whether to include the attriction plot

outputLocation

The location to write the document to

save

If false this fucntion returns the document and does not save to outputLocation

- -

Value

- -

A work document containing the selected outputs within the user's directory at location specified in outputLocation

-

Details

- -

The function creates a word document containing the analysis details, data summary and prediction model results.

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/createPlpModelSettings.html b/docs/reference/createPlpModelSettings.html deleted file mode 100644 index 82d054b9e..000000000 --- a/docs/reference/createPlpModelSettings.html +++ /dev/null @@ -1,236 +0,0 @@ - - - - - - - - -create a an object specifying the multiple Plp model settings — createPlpModelSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

create a an object specifying the multiple Plp model settings

-
- -
createPlpModelSettings(modelList, covariateSettingList, populationSettingList)
- -

Arguments

- - - - - - - - - - - - - - -
modelList

A list of model settings

covariateSettingList

A list of covariate settings

populationSettingList

A list of population settings

- -

Value

- -

A list containing a dataframe settingLookupTable containing all the model, covariate and popualtion combination details, -a list models containing all the model settings, a list covariateSettings containing all the covariate settings and a list -populationSettings containing all the population settings.

-

Details

- -

Takes a list of models, covariates, population and returns the cartesian product combining all -settings.

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/createPlpReport.html b/docs/reference/createPlpReport.html deleted file mode 100644 index 6a956a89e..000000000 --- a/docs/reference/createPlpReport.html +++ /dev/null @@ -1,267 +0,0 @@ - - - - - - - - -createPlpReport — createPlpReport • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Creates a word document report of the prediction

-
- -
createPlpReport(
-  plpResult = NULL,
-  plpValidation = NULL,
-  plpData = NULL,
-  targetName = "<target population>",
-  outcomeName = "<outcome>",
-  targetDefinition = NULL,
-  outcomeDefinition = NULL,
-  outputLocation = file.path(getwd(), "plp_report.docx"),
-  save = T
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
plpResult

An object of type plpResult returned by running runPlp()

plpValidation

An object of type validatePlp returned by running externalValidatePlp()

plpData

The plpData

targetName

A string with the target description name

outcomeName

A string with the outcome description name

targetDefinition

The cohort details

outcomeDefinition

The cohort details

outputLocation

The location to write the document to

save

If false the output of the function of the function is the document rather than creating the document in outputLocation

- -

Value

- -

A work document containing the selected outputs within the user's directory at location specified in outputLocation

-

Details

- -

The function creates a word document containing the analysis details, data summary and prediction model results.

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/createPlpResultTables.html b/docs/reference/createPlpResultTables.html index 742a4f777..41c22b897 100644 --- a/docs/reference/createPlpResultTables.html +++ b/docs/reference/createPlpResultTables.html @@ -1,67 +1,12 @@ - - - - - - - -Create the results tables to store PatientLevelPrediction models and results into a database — createPlpResultTables • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create the results tables to store PatientLevelPrediction models and results into a database — createPlpResultTables • PatientLevelPrediction + + - - - - -
-
- -
- -
+
@@ -183,88 +131,85 @@

Create the results tables to store PatientLevelPrediction models and results

This function executes a large set of SQL statements to create tables that can store models and results

-
createPlpResultTables(
-  conn,
-  resultSchema,
-  targetDialect = "postgresql",
-  deleteExistingTables = T,
-  createTables = T,
-  stringAppendToTables = "",
-  tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
-  testFile = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
conn

A connection to a database created by using the -function connect in the -DatabaseConnector package.

resultSchema

The name of the database schema that the result tables will be created.

targetDialect

The database management system being used

deleteExistingTables

If true any existing tables matching the PatientLevelPrediction result tables names will be deleted

createTables

If true the PatientLevelPrediction result tables will be created

stringAppendToTables

A string that appends to the PatientLevelPrediction result tables

tempEmulationSchema

The temp schema used when the database management system is oracle

testFile

(used for testing) The location of an sql file with the table creation code

- -

Value

- -

Returns NULL but creates the required tables into the specified database schema.

-

Details

+
+
createPlpResultTables(
+  connectionDetails,
+  targetDialect = "postgresql",
+  resultSchema,
+  deleteTables = T,
+  createTables = T,
+  tablePrefix = "",
+  tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
+  testFile = NULL
+)
+
+ +
+

Arguments

+
connectionDetails
+

The database connection details

+ + +
targetDialect
+

The database management system being used

+ + +
resultSchema
+

The name of the database schema that the result tables will be created.

+ +
deleteTables
+

If true any existing tables matching the PatientLevelPrediction result tables names will be deleted

+ + +
createTables
+

If true the PatientLevelPrediction result tables will be created

+ + +
tablePrefix
+

A string that appends to the PatientLevelPrediction result tables

+ + +
tempEmulationSchema
+

The temp schema used when the database management system is oracle

+ + +
testFile
+

(used for testing) The location of an sql file with the table creation code

+ +
+
+

Value

+ + +

Returns NULL but creates the required tables into the specified database schema(s).

+
+
+

Details

This function can be used to create (or delete) PatientLevelPrediction result tables

+
+
-
- +
- - + + diff --git a/docs/reference/createPreprocessSettings.html b/docs/reference/createPreprocessSettings.html index 0beec10c9..74d078c2b 100644 --- a/docs/reference/createPreprocessSettings.html +++ b/docs/reference/createPreprocessSettings.html @@ -1,67 +1,12 @@ - - - - - - - -Create the settings for preprocessing the trainData using . — createPreprocessSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create the settings for preprocessing the trainData. — createPreprocessSettings • PatientLevelPrediction - + + - - - -
-
- -
- -
+
-

Create the settings for preprocessing the trainData using .

+

Create the settings for preprocessing the trainData.

-
createPreprocessSettings(
-  minFraction = 0.001,
-  normalize = TRUE,
-  removeRedundancy = TRUE
-)
- -

Arguments

- - - - - - - - - - - - - - -
minFraction

The minimum fraction of target population who must have a covariate for it to be included in the model training

normalize

Whether to normalise the covariates before training (Default: TRUE)

removeRedundancy

Whether to remove redundant features (Default: TRUE)

- -

Value

- -

An object of class preprocessingSettings

-

Details

+
+
createPreprocessSettings(
+  minFraction = 0.001,
+  normalize = TRUE,
+  removeRedundancy = TRUE
+)
+
+
+

Arguments

+
minFraction
+

The minimum fraction of target population who must have a covariate for it to be included in the model training

+ + +
normalize
+

Whether to normalise the covariates before training (Default: TRUE)

+ + +
removeRedundancy
+

Whether to remove redundant features (Default: TRUE)

+ +
+
+

Value

+ + +

An object of class preprocessingSettings

+ + +
+
+

Details

Returns an object of class preprocessingSettings that specifies how to preprocess the training data

+
+
-
- +
- - + + diff --git a/docs/reference/createRandomForestFeatureSelection.html b/docs/reference/createRandomForestFeatureSelection.html index 7cee442c3..3b2803c55 100644 --- a/docs/reference/createRandomForestFeatureSelection.html +++ b/docs/reference/createRandomForestFeatureSelection.html @@ -1,67 +1,12 @@ - - - - - - - -Create the settings for random foreat based feature selection — createRandomForestFeatureSelection • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create the settings for random foreat based feature selection — createRandomForestFeatureSelection • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -183,53 +131,54 @@

Create the settings for random forest based feature selection

Create the settings for random forest based feature selection

-
createRandomForestFeatureSelection(ntrees = 2000, maxDepth = 17)
+
+
createRandomForestFeatureSelection(ntrees = 2000, maxDepth = 17)
+
+ +
+

Arguments

+
ntrees
+

number of tree in forest

+ -

Arguments

- - - - - - - - - - -
ntrees

number of tree in forest

maxDepth

MAx depth of each tree

+
maxDepth
+

Max depth of each tree

-

Value

+
+
+

Value

+ -

An object of class featureEngineeringSettings

-

Details

+

An object of class featureEngineeringSettings

+ +
+
+

Details

Returns an object of class featureEngineeringSettings that specifies the sampling function that will be called and the settings

+
+
-
- +
- - + + diff --git a/docs/reference/createRestrictPlpDataSettings.html b/docs/reference/createRestrictPlpDataSettings.html index 2e1ab7462..923489204 100644 --- a/docs/reference/createRestrictPlpDataSettings.html +++ b/docs/reference/createRestrictPlpDataSettings.html @@ -1,67 +1,12 @@ - - - - - - - -createRestrictPlpDataSettings define extra restriction settings when calling getPlpData — createRestrictPlpDataSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -createRestrictPlpDataSettings define extra restriction settings when calling getPlpData — createRestrictPlpDataSettings • PatientLevelPrediction + + - - - - -
-
- -
- -
+
@@ -183,80 +131,79 @@

createRestrictPlpDataSettings define extra restriction settings when calling

This function creates the settings used to restrict the target cohort when calling getPlpData

-
createRestrictPlpDataSettings(
-  studyStartDate = "",
-  studyEndDate = "",
-  firstExposureOnly = F,
-  washoutPeriod = 0,
-  sampleSize = NULL
-)
+
+
createRestrictPlpDataSettings(
+  studyStartDate = "",
+  studyEndDate = "",
+  firstExposureOnly = F,
+  washoutPeriod = 0,
+  sampleSize = NULL
+)
+
+ +
+

Arguments

+
studyStartDate
+

A calendar date specifying the minimum date that a cohort index +date can appear. Date format is 'yyyymmdd'.

+ -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
studyStartDate

A calendar date specifying the minimum date that a cohort index -date can appear. Date format is 'yyyymmdd'.

studyEndDate

A calendar date specifying the maximum date that a cohort index +

studyEndDate
+

A calendar date specifying the maximum date that a cohort index date can appear. Date format is 'yyyymmdd'. Important: the study end data is also used to truncate risk windows, meaning no outcomes -beyond the study end date will be considered.

firstExposureOnly

Should only the first exposure per subject be included? Note that +beyond the study end date will be considered.

+ + +
firstExposureOnly
+

Should only the first exposure per subject be included? Note that this is typically done in the createStudyPopulation function, -but can already be done here for efficiency reasons.

washoutPeriod

The mininum required continuous observation time prior to index +but can already be done here for efficiency reasons.

+ + +
washoutPeriod
+

The minimum required continuous observation time prior to index date for a person to be included in the at risk cohort. Note that this is typically done in the createStudyPopulation function, -but can already be done here for efficiency reasons.

sampleSize

If not NULL, the number of people to sample from the target cohort

+but can already be done here for efficiency reasons.

+ -

Value

+
sampleSize
+

If not NULL, the number of people to sample from the target cohort

-

A setting object of class restrictPlpDataSettings containing a list getPlpData extra settings

-

Details

+
+
+

Value

+ +

A setting object of class restrictPlpDataSettings containing a list getPlpData extra settings

+
+
+

Details

Users need to specify the extra restrictions to apply when downloading the target cohort

+
+
-
- +
- - + + diff --git a/docs/reference/createSampleSettings.html b/docs/reference/createSampleSettings.html index 049018f9a..f2723ab9e 100644 --- a/docs/reference/createSampleSettings.html +++ b/docs/reference/createSampleSettings.html @@ -1,70 +1,15 @@ - - - - - - - -Create the settings for defining how the trainData from splitData are sampled using -default sample functions. — createSampleSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create the settings for defining how the trainData from splitData are sampled using +default sample functions. — createSampleSettings • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -188,65 +136,65 @@

Create the settings for defining how the trainData from splitData

-
createSampleSettings(
-  type = "none",
-  numberOutcomestoNonOutcomes = 1,
-  sampleSeed = sample(10000, 1)
-)
- -

Arguments

- - - - - - - - - - - - - - -
type

(character) Choice of:

    -
  • 'none' No sampling is applied - this is the default

  • -
  • 'underSample')Undersample the non-outcome class to make the data more ballanced

  • -
  • 'overSample'Oversample the outcome class by adding in each outcome multiple times

  • -
numberOutcomestoNonOutcomes

(numeric) An numeric specifying the require number of non-outcomes per outcome

sampleSeed

(numeric) A seed to use when splitting the data for reproducibility (if not set a random number will be generated)

- -

Value

- -

An object of class sampleSettings

-

Details

+
+
createSampleSettings(
+  type = "none",
+  numberOutcomestoNonOutcomes = 1,
+  sampleSeed = sample(10000, 1)
+)
+
+
+

Arguments

+
type
+

(character) Choice of:

  • 'none' No sampling is applied - this is the default

  • +
  • 'underSample' Undersample the non-outcome class to make the data more balanced

  • +
  • 'overSample' Oversample the outcome class by adding in each outcome multiple times

  • +
+ + +
numberOutcomestoNonOutcomes
+

(numeric) A numeric specifying the required number of non-outcomes per outcome

+ + +
sampleSeed
+

(numeric) A seed to use when splitting the data for reproducibility (if not set a random number will be generated)

+ +
+
+

Value

+ + +

An object of class sampleSettings

+ + +
+
+

Details

Returns an object of class sampleSettings that specifies the sampling function that will be called and the settings

+
+
-
- +
- - + + diff --git a/docs/reference/createSplineSettings.html b/docs/reference/createSplineSettings.html new file mode 100644 index 000000000..9b04d1a1b --- /dev/null +++ b/docs/reference/createSplineSettings.html @@ -0,0 +1,188 @@ + +Create the settings for adding a spline for continuous variables — createSplineSettings • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

Create the settings for adding a spline for continuous variables

+
+ +
+
createSplineSettings(continousCovariateId, knots, analysisId = 683)
+
+ +
+

Arguments

+
continousCovariateId
+

The covariateId to apply splines to

+ + +
knots
+

Either number of knots of vector of split values

+ + +
analysisId
+

The analysisId to use for the spline covariates

+ +
+
+

Value

+ + +

An object of class featureEngineeringSettings

+ + +
+
+

Details

+

Returns an object of class featureEngineeringSettings that specifies the sampling function that will be called and the settings

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/createStratifiedImputationSettings.html b/docs/reference/createStratifiedImputationSettings.html new file mode 100644 index 000000000..753176d9b --- /dev/null +++ b/docs/reference/createStratifiedImputationSettings.html @@ -0,0 +1,184 @@ + +Create the settings for adding a spline for continuous variables — createStratifiedImputationSettings • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

Create the settings for adding a spline for continuous variables

+
+ +
+
createStratifiedImputationSettings(covariateId, ageSplits = NULL)
+
+ +
+

Arguments

+
covariateId
+

The covariateId that needs imputed values

+ + +
ageSplits
+

A vector of age splits in years to create age groups

+ +
+
+

Value

+ + +

An object of class featureEngineeringSettings

+ + +
+
+

Details

+

Returns an object of class featureEngineeringSettings that specifies how to do stratified imputation

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/createStudyPopulation.html b/docs/reference/createStudyPopulation.html index fa9c31d7f..70fb48349 100644 --- a/docs/reference/createStudyPopulation.html +++ b/docs/reference/createStudyPopulation.html @@ -1,67 +1,12 @@ - - - - - - - -Create a study population — createStudyPopulation • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create a study population — createStudyPopulation • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,78 +131,86 @@

Create a study population

Create a study population

-
createStudyPopulation(
-  plpData,
-  outcomeId,
-  populationSettings,
-  population = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
plpData

An object of type plpData as generated using -getplpData.

outcomeId

The ID of the outcome.

populationSettings

An object of class populationSettings created using createPopulationSettings

population

If specified, this population will be used as the starting point instead of the -cohorts in the plpData object.

- -

Value

- -

A data frame specifying the study population. This data frame will have the following columns:

-
rowId

A unique identifier for an exposure

-
subjectId

The person ID of the subject

-
cohortStartdate

The index date

-
outcomeCount

The number of outcomes observed during the risk window

-
timeAtRisk

The number of days in the risk window

-
survivalTime

The number of days until either the outcome or the end of the risk window

- -
- -

Details

+
+
createStudyPopulation(
+  plpData,
+  outcomeId,
+  populationSettings,
+  population = NULL
+)
+
+ +
+

Arguments

+
plpData
+

An object of type plpData as generated using +getplpData.

+ + +
outcomeId
+

The ID of the outcome.

+ +
populationSettings
+

An object of class populationSettings created using createPopulationSettings

+ + +
population
+

If specified, this population will be used as the starting point instead of the +cohorts in the plpData object.

+ +
+
+

Value

+ + +

A data frame specifying the study population. This data frame will have the following columns:

rowId
+

A unique identifier for an exposure

+ +
subjectId
+

The person ID of the subject

+ +
cohortStartdate
+

The index date

+ +
outcomeCount
+

The number of outcomes observed during the risk window

+ +
timeAtRisk
+

The number of days in the risk window

+ +
survivalTime
+

The number of days until either the outcome or the end of the risk window

+ + +
+
+

Details

Create a study population by enforcing certain inclusion and exclusion criteria, defining a risk window, and determining which outcomes fall inside the risk window.

+
+
-
- +
- - + + diff --git a/docs/reference/createStudyPopulationSettings.html b/docs/reference/createStudyPopulationSettings.html index d6e0ed341..e40884cc2 100644 --- a/docs/reference/createStudyPopulationSettings.html +++ b/docs/reference/createStudyPopulationSettings.html @@ -1,67 +1,12 @@ - - - - - - - -create the study population settings — createStudyPopulationSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -create the study population settings — createStudyPopulationSettings • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -183,117 +131,116 @@

create the study population settings

create the study population settings

-
createStudyPopulationSettings(
-  binary = T,
-  includeAllOutcomes = T,
-  firstExposureOnly = FALSE,
-  washoutPeriod = 0,
-  removeSubjectsWithPriorOutcome = TRUE,
-  priorOutcomeLookback = 99999,
-  requireTimeAtRisk = T,
-  minTimeAtRisk = 364,
-  riskWindowStart = 1,
-  startAnchor = "cohort start",
-  riskWindowEnd = 365,
-  endAnchor = "cohort start",
-  restrictTarToCohortEnd = F
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
binary

Forces the outcomeCount to be 0 or 1 (use for binary prediction problems)

includeAllOutcomes

(binary) indicating whether to include people with outcomes who are not observed for the whole at risk period

firstExposureOnly

Should only the first exposure per subject be included? Note that -this is typically done in the createStudyPopulation function,

washoutPeriod

The mininum required continuous observation time prior to index -date for a person to be included in the cohort.

removeSubjectsWithPriorOutcome

Remove subjects that have the outcome prior to the risk window start?

priorOutcomeLookback

How many days should we look back when identifying prior outcomes?

requireTimeAtRisk

Should subject without time at risk be removed?

minTimeAtRisk

The minimum number of days at risk required to be included

riskWindowStart

The start of the risk window (in days) relative to the index date (+ +

+
createStudyPopulationSettings(
+  binary = T,
+  includeAllOutcomes = T,
+  firstExposureOnly = FALSE,
+  washoutPeriod = 0,
+  removeSubjectsWithPriorOutcome = TRUE,
+  priorOutcomeLookback = 99999,
+  requireTimeAtRisk = T,
+  minTimeAtRisk = 364,
+  riskWindowStart = 1,
+  startAnchor = "cohort start",
+  riskWindowEnd = 365,
+  endAnchor = "cohort start",
+  restrictTarToCohortEnd = F
+)
+
+ +
+

Arguments

+
binary
+

Forces the outcomeCount to be 0 or 1 (use for binary prediction problems)

+ + +
includeAllOutcomes
+

(binary) indicating whether to include people with outcomes who are not observed for the whole at risk period

+ + +
firstExposureOnly
+

Should only the first exposure per subject be included? Note that +this is typically done in the createStudyPopulation function,

+ + +
washoutPeriod
+

The minimum required continuous observation time prior to index +date for a person to be included in the cohort.

+ + +
removeSubjectsWithPriorOutcome
+

Remove subjects that have the outcome prior to the risk window start?

+ + +
priorOutcomeLookback
+

How many days should we look back when identifying prior outcomes?

+ + +
requireTimeAtRisk
+

Should subject without time at risk be removed?

+ + +
minTimeAtRisk
+

The minimum number of days at risk required to be included

+ + +
riskWindowStart
+

The start of the risk window (in days) relative to the index date (+ days of exposure if the addExposureDaysToStart parameter is -specified).

startAnchor

The anchor point for the start of the risk window. Can be "cohort start" or "cohort end".

riskWindowEnd

The end of the risk window (in days) relative to the index data (+ +specified).

+ + +
startAnchor
+

The anchor point for the start of the risk window. Can be "cohort start" or "cohort end".

+ + +
riskWindowEnd
+

The end of the risk window (in days) relative to the index data (+ days of exposure if the addExposureDaysToEnd parameter is -specified).

endAnchor

The anchor point for the end of the risk window. Can be "cohort start" or "cohort end".

restrictTarToCohortEnd

If using a survival model and you want the time-at-risk to end at the cohort end date set this to T

- -

Value

- -

A list containing all the settings required for creating the study population

-

Details

+specified).

+ +
endAnchor
+

The anchor point for the end of the risk window. Can be "cohort start" or "cohort end".

+ + +
restrictTarToCohortEnd
+

If using a survival model and you want the time-at-risk to end at the cohort end date set this to T

+ +
+
+

Value

+ + +

A list containing all the settings required for creating the study population

+
+
+

Details

Takes as input the inputs to create study population

+
+
-
- +
- - + + diff --git a/docs/reference/createTempModelLoc.html b/docs/reference/createTempModelLoc.html new file mode 100644 index 000000000..e91a74cf3 --- /dev/null +++ b/docs/reference/createTempModelLoc.html @@ -0,0 +1,162 @@ + +Create a temporary model location — createTempModelLoc • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

Create a temporary model location

+
+ +
+
createTempModelLoc()
+
+ + +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/createUnivariateFeatureSelection.html b/docs/reference/createUnivariateFeatureSelection.html index 8c7eab965..4f4c8fb19 100644 --- a/docs/reference/createUnivariateFeatureSelection.html +++ b/docs/reference/createUnivariateFeatureSelection.html @@ -1,67 +1,12 @@ - - - - - - - -Create the settings for defining any feature selection that will be done — createUnivariateFeatureSelection • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create the settings for defining any feature selection that will be done — createUnivariateFeatureSelection • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,49 +131,50 @@

Create the settings for defining any feature selection that will be done

Create the settings for defining any feature selection that will be done

-
createUnivariateFeatureSelection(k = 100)
+
+
createUnivariateFeatureSelection(k = 100)
+
-

Arguments

- - - - - - -
k

This function returns the K features most associated (univariately) to the outcome

+
+

Arguments

+
k
+

This function returns the K features most associated (univariately) to the outcome

-

Value

+
+
+

Value

+ -

An object of class featureEngineeringSettings

-

Details

+

An object of class featureEngineeringSettings

+ +
+
+

Details

Returns an object of class featureEngineeringSettings that specifies the sampling function that will be called and the settings

+
+
-
- +
- - + + diff --git a/docs/reference/createValidationDesign.html b/docs/reference/createValidationDesign.html new file mode 100644 index 000000000..f3601d2c9 --- /dev/null +++ b/docs/reference/createValidationDesign.html @@ -0,0 +1,200 @@ + +createValidationDesign - Define the validation design for external validation — createValidationDesign • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

createValidationDesign - Define the validation design for external validation

+
+ +
+
createValidationDesign(
+  targetId,
+  outcomeId,
+  populationSettings,
+  restrictPlpDataSettings,
+  plpModelList,
+  recalibrate = NULL,
+  runCovariateSummary = TRUE
+)
+
+ +
+

Arguments

+
targetId
+

The targetId of the target cohort to validate on

+ + +
outcomeId
+

The outcomeId of the outcome cohort to validate on

+ + +
populationSettings
+

A list of population restriction settings created by createPopulationSettings

+ + +
restrictPlpDataSettings
+

A list of plpData restriction settings created by createRestrictPlpDataSettings

+ + +
plpModelList
+

A list of plpModels objects created by runPlp or a path to such objects

+ + +
recalibrate
+

A vector of characters specifying the recalibration method to apply,

+ + +
runCovariateSummary
+

whether to run the covariate summary for the validation data

+ +
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/createValidationSettings.html b/docs/reference/createValidationSettings.html index cd816a609..c21539127 100644 --- a/docs/reference/createValidationSettings.html +++ b/docs/reference/createValidationSettings.html @@ -1,67 +1,12 @@ - - - - - - - -createValidationSettings define optional settings for performing external validation — createValidationSettings • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -createValidationSettings define optional settings for performing external validation — createValidationSettings • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,53 +131,52 @@

createValidationSettings define optional settings for performing external va

This function creates the settings required by externalValidatePlp

-
createValidationSettings(recalibrate = NULL, runCovariateSummary = T)
+
+
createValidationSettings(recalibrate = NULL, runCovariateSummary = T)
+
-

Arguments

- - - - - - - - - - -
recalibrate

A vector of characters specifying the recalibration method to apply

runCovariateSummary

Whether to run the covariate summary for the validation data

+
+

Arguments

+
recalibrate
+

A vector of characters specifying the recalibration method to apply

-

Value

-

A setting object of class validationSettings containing a list of settings for externalValidatePlp

-

Details

+
runCovariateSummary
+

Whether to run the covariate summary for the validation data

+
+
+

Value

+ + +

A setting object of class validationSettings containing a list of settings for externalValidatePlp

+
+
+

Details

Users need to specify whether they want to sample or recalibate when performing external validation

+
+
-
- +

- - + + diff --git a/docs/reference/diagnoseMultiplePlp.html b/docs/reference/diagnoseMultiplePlp.html new file mode 100644 index 000000000..5c24f5c05 --- /dev/null +++ b/docs/reference/diagnoseMultiplePlp.html @@ -0,0 +1,203 @@ + +Run a list of predictions diagnoses — diagnoseMultiplePlp • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

Run a list of predictions diagnoses

+
+ +
+
diagnoseMultiplePlp(
+  databaseDetails = createDatabaseDetails(),
+  modelDesignList = list(createModelDesign(targetId = 1, outcomeId = 2, modelSettings =
+    setLassoLogisticRegression()), createModelDesign(targetId = 1, outcomeId = 3,
+    modelSettings = setLassoLogisticRegression())),
+  cohortDefinitions = NULL,
+  logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName =
+    "diagnosePlp Log"),
+  saveDirectory = getwd()
+)
+
+ +
+

Arguments

+
databaseDetails
+

The database settings created using createDatabaseDetails()

+ + +
modelDesignList
+

A list of model designs created using createModelDesign()

+ + +
cohortDefinitions
+

A list of cohort definitions for the target and outcome cohorts

+ + +
logSettings
+

The setting specifying the logging for the analyses created using createLogSettings()

+ + +
saveDirectory
+

Name of the folder where all the outputs will written to.

+ +
+
+

Value

+ + +

A data frame with the following columns:

analysisIdThe unique identifier +for a set of analysis choices.
targetIdThe ID of the target cohort populations.
outcomeIdThe ID of the outcomeId.
dataLocationThe location where the plpData was saved
the settings idsThe ids for all other settings used for model development.
+
+

Details

+

This function will run all specified prediction design diagnoses as defined using .

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/diagnosePlp.html b/docs/reference/diagnosePlp.html new file mode 100644 index 000000000..f4282a426 --- /dev/null +++ b/docs/reference/diagnosePlp.html @@ -0,0 +1,267 @@ + +diagnostic - Investigates the prediction problem settings - use before training a model — diagnosePlp • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

This function runs a set of prediction diagnoses to help pick a suitable T, O, TAR and determine +whether the prediction problem is worth executing.

+
+ +
+
diagnosePlp(
+  plpData = NULL,
+  outcomeId,
+  analysisId,
+  populationSettings,
+  splitSettings = createDefaultSplitSetting(),
+  sampleSettings = createSampleSettings(),
+  saveDirectory = NULL,
+  featureEngineeringSettings = createFeatureEngineeringSettings(),
+  modelSettings = setLassoLogisticRegression(),
+  logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName =
+    "diagnosePlp Log"),
+  preprocessSettings = createPreprocessSettings()
+)
+
+ +
+

Arguments

+
plpData
+

An object of type plpData - the patient level prediction +data extracted from the CDM. Can also include an initial population as +plpData$popualtion.

+ + +
outcomeId
+

(integer) The ID of the outcome.

+ + +
analysisId
+

(integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.

+ + +
populationSettings
+

An object of type populationSettings created using createStudyPopulationSettings that +specifies how the data class labels are defined and addition any exclusions to apply to the +plpData cohort

+ + +
splitSettings
+

An object of type splitSettings that specifies how to split the data into train/validation/test. +The default settings can be created using createDefaultSplitSetting.

+ + +
sampleSettings
+

An object of type sampleSettings that specifies any under/over sampling to be done. +The default is none.

+ + +
saveDirectory
+

The path to the directory where the results will be saved (if NULL uses working directory)

+ + +
featureEngineeringSettings
+

An object of featureEngineeringSettings specifying any feature engineering to be learned (using the train data)

+ + +
modelSettings
+

An object of class modelSettings created using one of the function:

  • setLassoLogisticRegression() A lasso logistic regression model

  • +
  • setGradientBoostingMachine() A gradient boosting machine

  • +
  • setAdaBoost() An ada boost model

  • +
  • setRandomForest() A random forest model

  • +
  • setDecisionTree() A decision tree model

  • +
  • setKNN() A KNN model

  • +
+ + +
logSettings
+

An object of logSettings created using createLogSettings +specifying how the logging is done

+ + +
preprocessSettings
+

An object of preprocessSettings. This setting specifies the minimum fraction of +target population who must have a covariate for it to be included in the model training +and whether to normalise the covariates before training

+ +
+
+

Value

+ + +

An object containing the model or location where the model is save, the data selection settings, the preprocessing +and training settings as well as various performance measures obtained by the model.

+
distribution
+

list for each O of a data.frame containing: i) Time to observation end distribution, ii) Time from observation start distribution, iii) Time to event distribution and iv) Time from last prior event to index distribution (only for patients in T who have O before index)

+ +
incident
+

list for each O of incidence of O in T during TAR

+ +
characterization
+

list for each O of Characterization of T, TnO, Tn~O

+ +
+
+

Details

+

Users can define set of Ts, Os, databases and population settings. A list of data.frames containing details such as +follow-up time distribution, time-to-event information, characterization details, time from last prior event, +observation time distribution.

+
+ +
+

Examples

+
if (FALSE) {
+#******** EXAMPLE 1 ********* 
+} 
+
+
+
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/diagnostic.html b/docs/reference/diagnostic.html deleted file mode 100644 index b66905ad0..000000000 --- a/docs/reference/diagnostic.html +++ /dev/null @@ -1,287 +0,0 @@ - - - - - - - - -diagnostic - Investigates the prediction problem settings - use before training a model — diagnostic • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

This function runs a set of prediction diagnoses to help pick a suitable T, O, TAR and determine -whether the prediction problem is worth executing.

-
- -
diagnostic(
-  plpData = NULL,
-  cdmDatabaseName = "none",
-  cohortName,
-  outcomeNames,
-  databaseDetails,
-  restrictPlpDataSettings,
-  populationSettings,
-  outputFolder = NULL,
-  minCellCount = 5
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
plpData

The data object to do the diagnostic on - if NULL you need to specify the connection settings below

cdmDatabaseName

The name of the database being diagnosed

cohortName

Name of the target cohort

outcomeNames

Vector of outcome names

databaseDetails

(only used is plpData is NULL) The database details created using createDatabaseDetails

restrictPlpDataSettings

(only used is plpData is NULL) The restrictPlpDataSettings created using createRestrictPlpDataSettings

populationSettings

The population setting details created using createPopulationSettings

outputFolder

Location to save results for shiny app

minCellCount

The minimum count that will be displayed

- -

Value

- -

An object containing the model or location where the model is save, the data selection settings, the preprocessing -and training settings as well as various performance measures obtained by the model.

-
distribution

list for each O of a data.frame containing: i) Time to observation end distribution, ii) Time from observation start distribution, iii) Time to event distribution and iv) Time from last prior event to index distribution (only for patients in T who have O before index)

-
incident

list for each O of incidence of O in T during TAR

-
characterization

list for each O of Characterization of T, TnO, Tn~O

- -

Details

- -

Users can define set of Ts, Os, databases and population settings. A list of data.frames containing details such as -follow-up time distribution, time-to-event information, characteriszation details, time from last prior event, -observation time distribution.

- -

Examples

-
if (FALSE) { -#******** EXAMPLE 1 ********* -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/diagnosticOddsRatio.html b/docs/reference/diagnosticOddsRatio.html index d6c34055f..806f33f02 100644 --- a/docs/reference/diagnosticOddsRatio.html +++ b/docs/reference/diagnosticOddsRatio.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate the diagnostic odds ratio — diagnosticOddsRatio • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the diagnostic odds ratio — diagnosticOddsRatio • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,61 +131,60 @@

Calculate the diagnostic odds ratio

Calculate the diagnostic odds ratio

-
diagnosticOddsRatio(TP, TN, FN, FP)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
TP

Number of true positives

TN

Number of true negatives

FN

Number of false negatives

FP

Number of false positives

- -

Value

- -

diagnosticOddsRatio value

-

Details

+
+
diagnosticOddsRatio(TP, TN, FN, FP)
+
+
+

Arguments

+
TP
+

Number of true positives

+ + +
TN
+

Number of true negatives

+ + +
FN
+

Number of false negatives

+ + +
FP
+

Number of false positives

+ +
+
+

Value

+ + +

diagnosticOddsRatio value

+
+
+

Details

Calculate the diagnostic odds ratio

+
+
-
- +
- - + + diff --git a/docs/reference/drawAttritionDiagramPlp.html b/docs/reference/drawAttritionDiagramPlp.html deleted file mode 100644 index 13bdca5d4..000000000 --- a/docs/reference/drawAttritionDiagramPlp.html +++ /dev/null @@ -1,244 +0,0 @@ - - - - - - - - -Draw the attrition diagram — drawAttritionDiagramPlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

drawAttritionDiagramPlp draws the attition diagram, showing how many people were excluded from -the study population, and for what reasons.

-
- -
drawAttritionDiagramPlp(
-  attrition,
-  targetLabel = "Target Population",
-  outcomeLabel = "Outcome Count",
-  fileName = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
attrition

The table of attrition details return from the population attr(popualtion, 'metaData')$attrition

targetLabel

A label to us for the treated cohort.

outcomeLabel

A label to us for the comparator cohort.

fileName

Name of the file where the plot should be saved, for example 'plot.png'. -See the function ggsave in the ggplot2 package for supported file -formats.

- -

Value

- -

A ggplot object. Use the ggsave function to save to file in a different -format.

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/evaluateExistingModel.html b/docs/reference/evaluateExistingModel.html deleted file mode 100644 index 256889239..000000000 --- a/docs/reference/evaluateExistingModel.html +++ /dev/null @@ -1,347 +0,0 @@ - - - - - - - - -evaluateExistingModel — evaluateExistingModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

This function implements an existing model

- -
- -
evaluateExistingModel(modelTable, covariateTable, interceptTable = NULL,
-  type = "score", covariateSettings, customCovariates = NULL,
-  addExposureDaysToStart = F, riskWindowStart = 1,
-  addExposureDaysToEnd = F, riskWindowEnd = 365, requireTimeAtRisk = T,
-  minTimeAtRisk = 364, includeAllOutcomes = T,
-  removeSubjectsWithPriorOutcome = T, priorOutcomeLookback = 99999,
-  verbosity = "INFO", washoutPeriod = 0, firstExposureOnly = F,
-  binary = T, connectionDetails, cdmDatabaseSchema, cohortDatabaseSchema,
-  cohortTable, cohortId, outcomeDatabaseSchema, outcomeTable, outcomeId,
-  oracleTempSchema = cdmDatabaseSchema, modelName = "existingModel",
-  calibrationPopulation = NULL, covariateSummary = T, cdmVersion = 5)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
modelTable

The model covariates and scores

covariateTable

The mapping from model covariates to standard covariates

interceptTable

The model intercepts

type

Model type (score or logistic)

covariateSettings

The standard covariate settings (specify covariate lookback time)

customCovariates

A table of covariateId, sql (sql creates the custom covariate)

addExposureDaysToStart

riskWindowStart relative to the cohort end date instead of the cohort start date?

riskWindowStart

The day after index to start predicting the outcome

addExposureDaysToEnd

riskWindowEnd relative to the cohort end date instead of the cohort start date?

riskWindowEnd

The day after index to stop predicting the outcome

requireTimeAtRisk

Do you want to ignore people who leave the database some point between the riskWindowStart and riskWindowEnd

minTimeAtRisk

If requireTimeAtRisk is TRUE, how many days must they be observed before leaving to get included (default recommendation is all risk period: riskWindowEnd-riskWindowStart)

includeAllOutcomes

Setting this to TRUE means people with the outcome who leave the data during the risk period are still included, so only non-outcome people who leave during the risk period are removed

removeSubjectsWithPriorOutcome

Remove people from the target population if they have the outcome prior to target cohort start date

priorOutcomeLookback

Lookback for removeSubjectsWithPriorOutcome

verbosity

The study population creation verbosity

washoutPeriod

Remove patients from the population with less than washoutPeriod of days prior observation

firstExposureOnly

If patients are in the target population multiple times, use only the first date

binary

Binary classificsation (T or F)

connectionDetails

The details to connect to the CDM

cdmDatabaseSchema

A string specifying the database containing the cdm

cohortDatabaseSchema

A string specifying the database containing the target cohorts

cohortTable

A string specifying the table containing the target cohorts

cohortId

An iteger specifying the cohort id for the target cohorts

outcomeDatabaseSchema

A string specifying the database containing the outcome cohorts

outcomeTable

A string specifying the table containing the outcome cohorts

outcomeId

An iteger specifying the cohort id for the outcome cohorts

oracleTempSchema

The temp oracle schema

modelName

The name of the model

calibrationPopulation

A data.frame of subjectId, cohortStartDate, indexes used to recalibrate the model on new data

covariateSummary

Whether to calculate the covariateSummary

cdmVersion

The CDM version being used

- -

Value

- -

The performance of the existing model and prediction

- -

Details

- -

Implements an existing model and evaluates its performance

- - -
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/evaluateMultiplePlp.html b/docs/reference/evaluateMultiplePlp.html deleted file mode 100644 index 63876ebea..000000000 --- a/docs/reference/evaluateMultiplePlp.html +++ /dev/null @@ -1,309 +0,0 @@ - - - - - - - - -externally validate the multiple plp models across new datasets — evaluateMultiplePlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

This function loads all the models in a multiple plp analysis folder and -validates the models on new data

-
- -
evaluateMultiplePlp(
-  analysesLocation,
-  outputLocation,
-  connectionDetails,
-  validationSchemaTarget,
-  validationSchemaOutcome,
-  validationSchemaCdm,
-  databaseNames,
-  validationTableTarget,
-  validationTableOutcome,
-  validationIdTarget = NULL,
-  validationIdOutcome = NULL,
-  oracleTempSchema = NULL,
-  verbosity = "INFO",
-  keepPrediction = F,
-  recalibrate = NULL,
-  sampleSize = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
analysesLocation

The location where the multiple plp analyses are

outputLocation

The location to save to validation results

connectionDetails

The connection details for extracting the new data

validationSchemaTarget

A string or list of strings specifying the database containing the target cohorts

validationSchemaOutcome

A string or list of strings specifying the database containing the outcome cohorts

validationSchemaCdm

A string or list of strings specifying the database containing the cdm

databaseNames

A string of lift of strings specifying sharing friendly database names corresponding to validationSchemaCdm

validationTableTarget

A string or list of strings specifying the table containing the target cohorts

validationTableOutcome

A string or list of strings specifying the table containing the outcome cohorts

validationIdTarget

An iteger or list of integers specifying the cohort id for the target cohorts

validationIdOutcome

An iteger or list of integers specifying the cohort id for the outcome cohorts

oracleTempSchema

The temp oracle schema requires read/write

verbosity

Sets the level of the verbosity. If the log level is at or higher in priority than the logger threshold, a message will print. The levels are:

    -
  • DEBUGHighest verbosity showing all debug statements

  • -
  • TRACEShowing information about start and end of steps

  • -
  • INFOShow informative information (Default)

  • -
  • WARNShow warning messages

  • -
  • ERRORShow error messages

  • -
  • FATALBe silent except for fatal errors

  • -
keepPrediction

Whether to keep the predicitons for the new data

recalibrate

A vector of recalibration methods (currently supports 'RecalibrationintheLarge' and/or 'weakRecalibration')

sampleSize

If not NULL, the number of people to sample from the target cohort

- -

Details

- -

Users need to input a location where the results of the multiple plp analyses -are found and the connection and database settings for the new data

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/evaluatePlp.html b/docs/reference/evaluatePlp.html index 1060d323c..7ad688156 100644 --- a/docs/reference/evaluatePlp.html +++ b/docs/reference/evaluatePlp.html @@ -1,67 +1,12 @@ - - - - - - - -evaluatePlp — evaluatePlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -evaluatePlp — evaluatePlp • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,54 +131,53 @@

evaluatePlp

Evaluates the performance of the patient level prediction model

-
evaluatePlp(prediction, typeColumn = "evaluationType")
+
+
evaluatePlp(prediction, typeColumn = "evaluationType")
+
-

Arguments

- - - - - - - - - - -
prediction

The patient level prediction model's prediction

typeColumn

The column name in the prediction object that is used to -stratify the evaluation

+
+

Arguments

+
prediction
+

The patient level prediction model's prediction

-

Value

-

A list containing the performance values

-

Details

+
typeColumn
+

The column name in the prediction object that is used to +stratify the evaluation

+
+
+

Value

+ + +

A list containing the performance values

+
+
+

Details

The function calculates various metrics to measure the performance of the model

+
+
-
- +
- - + + diff --git a/docs/reference/exportPlpDataToCsv.html b/docs/reference/exportPlpDataToCsv.html deleted file mode 100644 index 32eabfb24..000000000 --- a/docs/reference/exportPlpDataToCsv.html +++ /dev/null @@ -1,231 +0,0 @@ - - - - - - - - -Export all data in a plpData object to CSV files — exportPlpDataToCsv • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

Export all data in a plpData object to CSV files

- -
- -
exportPlpDataToCsv(plpData, outputFolder)
- -

Arguments

- - - - - - - - - - -
plpData

An object of type plpData.

outputFolder

The folder on the file system where the CSV files will be created. If the -folder does not yet exist it will be created.

- -

Details

- -

Created a set of CSV files in the output folder with all the data in the plplData object. This -function is intended to be used for research into prediction methods. The following files will be -created:

-
cohort.csv

Listing all persons and their prediction periods. This file -will have these fields: row_id (a unique ID per period), person_id, cohort_start_date, cohort_id, -time (number of days in the window).

outcomes.csv

Listing all outcomes per period. This -file will have these fields: row_id, outcome_id, outcome_count, time_to_event.

-
exclude.csv

Either not exported or a file listing per outcome ID which windows had the -outcome prior to the window and should therefore be removed prior to fitting the model. This object -will have these fields: rowId, outcomeId.

covariates.csv

Listing the baseline covariates -per person in the cohorts. This is done using a sparse representation: covariates with a value of 0 -are omitted to save space. The covariates file will have three columns: rowId, covariateId, and -covariateValue.

covariateRef.csv

A file describing the covariates that have been -extracted.

metaData

Some information on how the plpData object was constructed.

- - -

Examples

-
# NOT RUN {
-exportPlpDataToCsv(plpData, "s:/temp/exportTest")
-# }
-
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/externalValidateDbPlp.html b/docs/reference/externalValidateDbPlp.html index 8c8dd4311..79f46b721 100644 --- a/docs/reference/externalValidateDbPlp.html +++ b/docs/reference/externalValidateDbPlp.html @@ -1,67 +1,12 @@ - - - - - - - -externalValidateDbPlp - Validate a model on new databases — externalValidateDbPlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -externalValidateDbPlp - Validate a model on new databases — externalValidateDbPlp • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,78 +131,77 @@

externalValidateDbPlp - Validate a model on new databases

This function extracts data using a user specified connection and cdm_schema, applied the model and then calcualtes the performance

-
externalValidateDbPlp(
-  plpModel,
-  validationDatabaseDetails = createDatabaseDetails(),
-  validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
-  settings = createValidationSettings(recalibrate = "weakRecalibration"),
-  logSettings = createLogSettings(verbosity = "INFO", logName = "validatePLP"),
-  outputFolder = getwd()
-)
+
+
externalValidateDbPlp(
+  plpModel,
+  validationDatabaseDetails = createDatabaseDetails(),
+  validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
+  settings = createValidationSettings(recalibrate = "weakRecalibration"),
+  logSettings = createLogSettings(verbosity = "INFO", logName = "validatePLP"),
+  outputFolder = getwd()
+)
+
+ +
+

Arguments

+
plpModel
+

The model object returned by runPlp() containing the trained model

+ + +
validationDatabaseDetails
+

A list of objects of class databaseDetails created using createDatabaseDetails

+ + +
validationRestrictPlpDataSettings
+

A list of population restriction settings created by createRestrictPlpDataSettings()

+ + +
settings
+

A settings object of class validationSettings created using createValidationSettings

-

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - -
plpModel

The model object returned by runPlp() containing the trained model

validationDatabaseDetails

A list of objects of class databaseDetails created using createDatabaseDetails

validationRestrictPlpDataSettings

A list of population restriction settings created by createRestrictPlpDataSettings()

settings

A settings object of class validationSettings created using createValidationSettings

logSettings

An object of logSettings created using createLogSettings -specifying how the logging is done

outputFolder

The directory to save the validation results to (subfolders are created per database in validationDatabaseDetails)

-

Value

+
logSettings
+

An object of logSettings created using createLogSettings +specifying how the logging is done

-

A list containing the performance for each validation_schema

-

Details

-

Users need to input a trained model (the output of runPlp()) and new database connections. The function will return a list of length equal to the +

outputFolder
+

The directory to save the validation results to (subfolders are created per database in validationDatabaseDetails)

+ +
+
+

Value

+ + +

A list containing the performance for each validation_schema

+
+
+

Details

+

Users need to input a trained model (the output of runPlp()) and new database connections. The function will return a list of length equal to the number of cdm_schemas input with the performance on the new data

+
+
-
- +
- - + + diff --git a/docs/reference/externalValidatePlp.html b/docs/reference/externalValidatePlp.html deleted file mode 100644 index 51f1df281..000000000 --- a/docs/reference/externalValidatePlp.html +++ /dev/null @@ -1,310 +0,0 @@ - - - - - - - - -externalValidatePlp - Validate a model on new databases — externalValidatePlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

This function extracts data using a user specified connection and cdm_schema, applied the model and then calcualtes the performance

-
- -
externalValidatePlp(
-  plpResult,
-  connectionDetails,
-  validationSchemaTarget,
-  validationSchemaOutcome,
-  validationSchemaCdm,
-  databaseNames,
-  validationTableTarget = "cohort",
-  validationTableOutcome = "cohort",
-  validationIdTarget = NULL,
-  validationIdOutcome = NULL,
-  oracleTempSchema = NULL,
-  verbosity = "INFO",
-  keepPrediction = F,
-  recalibrate = NULL,
-  sampleSize = NULL,
-  outputFolder
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
plpResult

The object returned by runPlp() containing the trained model

connectionDetails

The connection details for extracting the new data

validationSchemaTarget

A string or vector of strings specifying the database containing the target cohorts

validationSchemaOutcome

A string or vector of strings specifying the database containing the outcome cohorts

validationSchemaCdm

A string or vector of strings specifying the database containing the cdm

databaseNames

A string of vector of strings specifying sharing friendly database names corresponding to validationSchemaCdm

validationTableTarget

A string or vector of strings specifying the table containing the target cohorts

validationTableOutcome

A string or vector of strings specifying the table containing the outcome cohorts

validationIdTarget

An iteger specifying the cohort id for the target cohort

validationIdOutcome

An iteger specifying the cohort id for the outcome cohort

oracleTempSchema

The temp oracle schema requires read/write

verbosity

Sets the level of the verbosity. If the log level is at or higher in priority than the logger threshold, a message will print. The levels are:

    -
  • DEBUGHighest verbosity showing all debug statements

  • -
  • TRACEShowing information about start and end of steps

  • -
  • INFOShow informative information (Default)

  • -
  • WARNShow warning messages

  • -
  • ERRORShow error messages

  • -
  • FATALBe silent except for fatal errors

  • -
keepPrediction

Whether to keep the predicitons for the new data

recalibrate

A vector of characters specifying the recalibration method to apply

sampleSize

If not NULL, the number of people to sample from the target cohort

outputFolder

If you want to save the results enter the directory to save here

- -

Value

- -

A list containing the performance for each validation_schema

-

Details

- -

Users need to input a trained model (the output of runPlp()) and new database connections. The function will return a list of length equal to the -number of cdm_schemas input with the performance on the new data

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/extractDatabaseToCsv.html b/docs/reference/extractDatabaseToCsv.html new file mode 100644 index 000000000..8e90243ea --- /dev/null +++ b/docs/reference/extractDatabaseToCsv.html @@ -0,0 +1,204 @@ + +Exports all the results from a database into csv files — extractDatabaseToCsv • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

Exports all the results from a database into csv files

+
+ +
+
extractDatabaseToCsv(
+  conn = NULL,
+  connectionDetails,
+  databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = "main"),
+  csvFolder,
+  minCellCount = 5,
+  sensitiveColumns = getPlpSensitiveColumns(),
+  fileAppend = NULL
+)
+
+ +
+

Arguments

+
conn
+

The connection to the database with the results

+ + +
connectionDetails
+

The connectionDetails for the result database

+ + +
databaseSchemaSettings
+

The result database schema settings

+ + +
csvFolder
+

Location to save the csv files

+ + +
minCellCount
+

The min value to show in cells that are sensitive (values less than this value will be replaced with -1)

+ + +
sensitiveColumns
+

A named list (name of table columns belong to) with a list of columns to apply the minCellCount to.

+ + +
fileAppend
+

If set to a string this will be appended to the start of the csv file names

+ +
+
+

Details

+

Extracts the results from a database into a set of csv files

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/f1Score.html b/docs/reference/f1Score.html index 2f25b115f..0a2fb3974 100644 --- a/docs/reference/f1Score.html +++ b/docs/reference/f1Score.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate the f1Score — f1Score • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the f1Score — f1Score • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,61 +131,60 @@

Calculate the f1Score

Calculate the f1Score

-
f1Score(TP, TN, FN, FP)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
TP

Number of true positives

TN

Number of true negatives

FN

Number of false negatives

FP

Number of false positives

- -

Value

- -

f1Score value

-

Details

+
+
f1Score(TP, TN, FN, FP)
+
+
+

Arguments

+
TP
+

Number of true positives

+ + +
TN
+

Number of true negatives

+ + +
FN
+

Number of false negatives

+ + +
FP
+

Number of false positives

+ +
+
+

Value

+ + +

f1Score value

+
+
+

Details

Calculate the f1Score

+
+
-
- +
- - + + diff --git a/docs/reference/falseDiscoveryRate.html b/docs/reference/falseDiscoveryRate.html index 5b2a7df58..73f4dc46b 100644 --- a/docs/reference/falseDiscoveryRate.html +++ b/docs/reference/falseDiscoveryRate.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate the falseDiscoveryRate — falseDiscoveryRate • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the falseDiscoveryRate — falseDiscoveryRate • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,61 +131,60 @@

Calculate the falseDiscoveryRate

Calculate the falseDiscoveryRate

-
falseDiscoveryRate(TP, TN, FN, FP)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
TP

Number of true positives

TN

Number of true negatives

FN

Number of false negatives

FP

Number of false positives

- -

Value

- -

falseDiscoveryRate value

-

Details

+
+
falseDiscoveryRate(TP, TN, FN, FP)
+
+
+

Arguments

+
TP
+

Number of true positives

+ + +
TN
+

Number of true negatives

+ + +
FN
+

Number of false negatives

+ + +
FP
+

Number of false positives

+ +
+
+

Value

+ + +

falseDiscoveryRate value

+
+
+

Details

Calculate the falseDiscoveryRate

+
+
-
- +
- - + + diff --git a/docs/reference/falseNegativeRate.html b/docs/reference/falseNegativeRate.html index b58d5a5ec..dc984b78f 100644 --- a/docs/reference/falseNegativeRate.html +++ b/docs/reference/falseNegativeRate.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate the falseNegativeRate — falseNegativeRate • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the falseNegativeRate — falseNegativeRate • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,61 +131,60 @@

Calculate the falseNegativeRate

Calculate the falseNegativeRate

-
falseNegativeRate(TP, TN, FN, FP)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
TP

Number of true positives

TN

Number of true negatives

FN

Number of false negatives

FP

Number of false positives

- -

Value

- -

falseNegativeRate value

-

Details

+
+
falseNegativeRate(TP, TN, FN, FP)
+
+
+

Arguments

+
TP
+

Number of true positives

+ + +
TN
+

Number of true negatives

+ + +
FN
+

Number of false negatives

+ + +
FP
+

Number of false positives

+ +
+
+

Value

+ + +

falseNegativeRate value

+
+
+

Details

Calculate the falseNegativeRate

+
+
-
- +
- - + + diff --git a/docs/reference/falseOmissionRate.html b/docs/reference/falseOmissionRate.html index 254724bd1..7cf484107 100644 --- a/docs/reference/falseOmissionRate.html +++ b/docs/reference/falseOmissionRate.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate the falseOmissionRate — falseOmissionRate • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the falseOmissionRate — falseOmissionRate • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,61 +131,60 @@

Calculate the falseOmissionRate

Calculate the falseOmissionRate

-
falseOmissionRate(TP, TN, FN, FP)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
TP

Number of true positives

TN

Number of true negatives

FN

Number of false negatives

FP

Number of false positives

- -

Value

- -

falseOmissionRate value

-

Details

+
+
falseOmissionRate(TP, TN, FN, FP)
+
+
+

Arguments

+
TP
+

Number of true positives

+ + +
TN
+

Number of true negatives

+ + +
FN
+

Number of false negatives

+ + +
FP
+

Number of false positives

+ +
+
+

Value

+ + +

falseOmissionRate value

+
+
+

Details

Calculate the falseOmissionRate

+
+
-
- +
- - + + diff --git a/docs/reference/falsePositiveRate.html b/docs/reference/falsePositiveRate.html index 8e25c27bc..0f378c3c8 100644 --- a/docs/reference/falsePositiveRate.html +++ b/docs/reference/falsePositiveRate.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate the falsePositiveRate — falsePositiveRate • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the falsePositiveRate — falsePositiveRate • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,61 +131,60 @@

Calculate the falsePositiveRate

Calculate the falsePositiveRate

-
falsePositiveRate(TP, TN, FN, FP)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
TP

Number of true positives

TN

Number of true negatives

FN

Number of false negatives

FP

Number of false positives

- -

Value

- -

falsePositiveRate value

-

Details

+
+
falsePositiveRate(TP, TN, FN, FP)
+
+
+

Arguments

+
TP
+

Number of true positives

+ + +
TN
+

Number of true negatives

+ + +
FN
+

Number of false negatives

+ + +
FP
+

Number of false positives

+ +
+
+

Value

+ + +

falsePositiveRate value

+
+
+

Details

Calculate the falsePositiveRate

+
+
-
- +
- - + + diff --git a/docs/reference/fitGLMModel.html b/docs/reference/fitGLMModel.html deleted file mode 100644 index 69c0fd178..000000000 --- a/docs/reference/fitGLMModel.html +++ /dev/null @@ -1,258 +0,0 @@ - - - - - - - - -Fit a predictive model — fitGLMModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Fit a predictive model

-
- -
fitGLMModel(
-  population,
-  plpData,
-  modelType = "logistic",
-  excludeCovariateIds = c(),
-  includeCovariateIds = c(),
-  prior = Cyclops::createPrior("laplace", useCrossValidation = TRUE),
-  control = Cyclops::createControl(cvType = "auto", fold = 3, startingVariance = 0.01,
-    lowerLimit = 0.01, upperLimit = 20, tolerance = 2e-06, cvRepetitions = 1,
-    selectorType = "byPid", noiseLevel = "silent", threads = -1, maxIterations = 3000)
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
population

A population object generated by createStudyPopulation, potentially filtered by other functions.

plpData

An object of type plpData as generated using -getPlpData.

modelType

The type of outcome model that will be used. Possible values are -"logistic", "poisson", or "cox".

excludeCovariateIds

Exclude these covariates from the outcome model.

includeCovariateIds

Include only these covariates in the outcome model.

prior

The prior used to fit the model. See -createPrior for details.

control

The control object used to control the cross-validation used to -determine the hyperparameters of the prior (if applicable). See -createControl for details.

- - -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/fitPlp.html b/docs/reference/fitPlp.html index c8d40ca24..6a394ecec 100644 --- a/docs/reference/fitPlp.html +++ b/docs/reference/fitPlp.html @@ -1,67 +1,12 @@ - - - - - - - -fitPlp — fitPlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -fitPlp — fitPlp • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,77 +131,88 @@

fitPlp

Train various models using a default parameter gird search or user specified parameters

-
fitPlp(trainData, modelSettings, search = "grid", analysisId)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
trainData

An object of type TrainData created using splitData -data extracted from the CDM.

modelSettings

An object of class modelSettings created using one of the function:

    -
  • logisticRegressionModel() A lasso logistic regression model

  • -
  • GBMclassifier() A gradient boosting machine

  • -
  • RFclassifier() A random forest model

  • -
  • GLMclassifier () A generalised linear model

  • -
  • KNNclassifier() A KNN model

  • -
search

The search strategy for the hyper-parameter selection (currently not used)

analysisId

The id of the analysis

- -

Value

- -

An object of class plpModel containing:

-
model

The trained prediction model

-
modelLoc

The path to where the model is saved (if saved)

-
trainAuc

The AUC obtained on the training set

-
trainCalibration

The calibration obtained on the training set

-
modelSettings

A list specifiying the model, preprocessing, outcomeId and cohortId

-
metaData

The model meta data

-
trainingTime

The time taken to train the classifier

- -

Details

+
+
fitPlp(trainData, modelSettings, search = "grid", analysisId, analysisPath)
+
+ +
+

Arguments

+
trainData
+

An object of type TrainData created using splitData +data extracted from the CDM.

+ + +
modelSettings
+

An object of class modelSettings created using one of the function:

  • setLassoLogisticRegression() A lasso logistic regression model

  • +
  • setGradientBoostingMachine() A gradient boosting machine

  • +
  • setRandomForest() A random forest model

  • +
  • setKNN() A KNN model

  • +
+ + +
search
+

The search strategy for the hyper-parameter selection (currently not used)

+ +
analysisId
+

The id of the analysis

+ + +
analysisPath
+

The path of the analysis

+ +
+
+

Value

+ + +

An object of class plpModel containing:

+
model
+

The trained prediction model

+ +
preprocessing
+

The preprocessing required when applying the model

+ +
prediction
+

The cohort data.frame with the predicted risk column added

+ +
modelDesign
+

A list specifiying the modelDesign settings used to fit the model

+ +
trainDetails
+

The model meta data

+ +
covariateImportance
+

The covariate importance for the model

+ +
+
+

Details

The user can define the machine learning model to train (regularised logistic regression, random forest, gradient boosting machine, neural network and )

+
+
-
- +
- - + + diff --git a/docs/reference/getAttritionTable.html b/docs/reference/getAttritionTable.html deleted file mode 100644 index 34c3b36c1..000000000 --- a/docs/reference/getAttritionTable.html +++ /dev/null @@ -1,217 +0,0 @@ - - - - - - - - -Get the attrition table for a population — getAttritionTable • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Get the attrition table for a population

-
- -
getAttritionTable(object)
- -

Arguments

- - - - - - -
object

Either an object of type plpData, a population object generated by functions -like createStudyPopulation, or an object of type outcomeModel.

- -

Value

- -

A data frame specifying the number of people and exposures in the population after specific steps of filtering.

- -
- -
- - -
- - -
-

Site built with pkgdown 1.5.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/getCalibration.html b/docs/reference/getCalibration.html deleted file mode 100644 index 938f1c319..000000000 --- a/docs/reference/getCalibration.html +++ /dev/null @@ -1,236 +0,0 @@ - - - - - - - - -Get a sparse summary of the calibration — getCalibration • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Get a sparse summary of the calibration

-
- -
getCalibration(prediction, numberOfStrata = 10, truncateFraction = 0.01)
- -

Arguments

- - - - - - - - - - - - - - -
prediction

A prediction object as generated using the -predict functions.

numberOfStrata

The number of strata in the plot.

truncateFraction

This fraction of probability values will be ignored when plotting, to -avoid the x-axis scale being dominated by a few outliers.

- -

Value

- -

A dataframe with the calibration summary

-

Details

- -

Generates a sparse summary showing the predicted probabilities and the observed fractions. Predictions are -stratefied into equally sized bins of predicted probabilities.

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/getCalibrationSummary.html b/docs/reference/getCalibrationSummary.html index 03ff5060e..f32e764d0 100644 --- a/docs/reference/getCalibrationSummary.html +++ b/docs/reference/getCalibrationSummary.html @@ -1,67 +1,12 @@ - - - - - - - -Get a sparse summary of the calibration — getCalibrationSummary • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Get a sparse summary of the calibration — getCalibrationSummary • PatientLevelPrediction + + - - - - -
-
- -
- -
+
@@ -183,74 +131,73 @@

Get a sparse summary of the calibration

Get a sparse summary of the calibration

-
getCalibrationSummary(
-  prediction,
-  predictionType,
-  typeColumn = "evaluation",
-  numberOfStrata = 100,
-  truncateFraction = 0.05
-)
+
+
getCalibrationSummary(
+  prediction,
+  predictionType,
+  typeColumn = "evaluation",
+  numberOfStrata = 100,
+  truncateFraction = 0.05
+)
+
+ +
+

Arguments

+
prediction
+

A prediction object as generated using the +predict functions.

+ + +
predictionType
+

The type of prediction (binary or survival)

+ + +
typeColumn
+

A column that is used to stratify the results

-

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
prediction

A prediction object as generated using the -predict functions.

predictionType

The type of prediction (binary or survival)

typeColumn

A column that is used to stratify the results

numberOfStrata

The number of strata in the plot.

truncateFraction

This fraction of probability values will be ignored when plotting, to -avoid the x-axis scale being dominated by a few outliers.

-

Value

+
numberOfStrata
+

The number of strata in the plot.

-

A dataframe with the calibration summary

-

Details

+
truncateFraction
+

This fraction of probability values will be ignored when plotting, to +avoid the x-axis scale being dominated by a few outliers.

+ +
+
+

Value

+ + +

A dataframe with the calibration summary

+
+
+

Details

Generates a sparse summary showing the predicted probabilities and the observed fractions. Predictions are stratefied into equally sized bins of predicted probabilities.

+
+
-
- +
- - + + diff --git a/docs/reference/getCohortCovariateData.html b/docs/reference/getCohortCovariateData.html index 43ae2546e..b5dcda0a1 100644 --- a/docs/reference/getCohortCovariateData.html +++ b/docs/reference/getCohortCovariateData.html @@ -1,67 +1,12 @@ - - - - - - - -Extracts covariates based on cohorts — getCohortCovariateData • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Extracts covariates based on cohorts — getCohortCovariateData • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,92 +131,96 @@

Extracts covariates based on cohorts

Extracts covariates based on cohorts

-
getCohortCovariateData(
-  connection,
-  oracleTempSchema = NULL,
-  cdmDatabaseSchema,
-  cdmVersion = "5",
-  cohortTable = "#cohort_person",
-  rowIdField = "row_id",
-  aggregated,
-  cohortId,
-  covariateSettings
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
connection

The database connection

oracleTempSchema

The temp schema if using oracle

cdmDatabaseSchema

The schema of the OMOP CDM data

cdmVersion

version of the OMOP CDM data

cohortTable

the table name that contains the target population cohort

rowIdField

string representing the unique identifier in the target population cohort

aggregated

whether the covariate should be aggregated

cohortId

cohort id for the target population cohort

covariateSettings

settings for the covariate cohorts and time periods

- -

Value

- -

The models will now be in the package

-

Details

+
+
getCohortCovariateData(
+  connection,
+  oracleTempSchema = NULL,
+  cdmDatabaseSchema,
+  cdmVersion = "5",
+  cohortTable = "#cohort_person",
+  rowIdField = "row_id",
+  aggregated,
+  cohortIds,
+  covariateSettings,
+  ...
+)
+
+ +
+

Arguments

+
connection
+

The database connection

+ + +
oracleTempSchema
+

The temp schema if using oracle

+ + +
cdmDatabaseSchema
+

The schema of the OMOP CDM data

+ + +
cdmVersion
+

version of the OMOP CDM data

+ + +
cohortTable
+

the table name that contains the target population cohort

+ +
rowIdField
+

string representing the unique identifier in the target population cohort

+ + +
aggregated
+

whether the covariate should be aggregated

+ + +
cohortIds
+

cohort id for the target cohort

+ + +
covariateSettings
+

settings for the covariate cohorts and time periods

+ + +
...
+

additional arguments from FeatureExtraction

+ +
+
+

Value

+ + +

The models will now be in the package

+
+
+

Details

The user specifies a cohort and time period and then a covariate is constructed whether they are in the cohort during the time periods relative to target population cohort index

+
+
-
- +
- - + + diff --git a/docs/reference/getCovariateData.html b/docs/reference/getCovariateData.html deleted file mode 100644 index e7ab32589..000000000 --- a/docs/reference/getCovariateData.html +++ /dev/null @@ -1,242 +0,0 @@ - - - - - - - - -Get the covaridate data for a cohort table — getCovariateData • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

This function executes some SQL to extract covaraite data for a cohort table

- -
- -
getCovariateData(connection, cdmDatabaseSchema,
-  oracleTempSchema = cdmDatabaseSchema, cohortTable = "#cohort_person",
-  cdmVersion = 5, covariateSettings)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - -
connection

Can also use an existing connection rather than the connectionDetails

cdmDatabaseSchema

The name of the database schema that contains the OMOP CDM -instance. Requires read permissions to this database. On SQL -Server, this should specifiy both the database and the schema, -so for example 'cdm_instance.dbo'.

oracleTempSchema

For Oracle only: the name of the database schema where you want -all temporary tables to be managed. Requires create/insert -permissions to this database.

cohortTable

The temp table containing the cohort of people

cdmVersion

The version of the CDM (default 5)

covariateSettings

An object of type covariateSettings as created using the -createCovariateSettings function in the -FeatureExtraction package.

- -

Value

- -

Returns the covariates for the people in the temp table

- -

Details

- - - - -
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/getDemographicSummary.html b/docs/reference/getDemographicSummary.html index fc69358f0..a4ffe6cdc 100644 --- a/docs/reference/getDemographicSummary.html +++ b/docs/reference/getDemographicSummary.html @@ -1,67 +1,12 @@ - - - - - - - -Get a calibration per age/gender groups — getDemographicSummary • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Get a calibration per age/gender groups — getDemographicSummary • PatientLevelPrediction + + - - - - -
-
- -
- -
+
@@ -183,57 +131,56 @@

Get a calibration per age/gender groups

Get a calibration per age/gender groups

-
getDemographicSummary(prediction, predictionType, typeColumn = "evaluation")
- -

Arguments

- - - - - - - - - - - - - - -
prediction

A prediction object

predictionType

The type of prediction (binary or survival)

typeColumn

A column that is used to stratify the results

- -

Value

- -

A dataframe with the calibration summary

-

Details

+
+
getDemographicSummary(prediction, predictionType, typeColumn = "evaluation")
+
+ +
+

Arguments

+
prediction
+

A prediction object

+ + +
predictionType
+

The type of prediction (binary or survival)

+ +
typeColumn
+

A column that is used to stratify the results

+ +
+
+

Value

+ + +

A dataframe with the calibration summary

+
+
+

Details

Generates a data.frame with the calibration per each 5 year age group and gender group

+
+
-
- +
- - + + diff --git a/docs/reference/getModelDetails.html b/docs/reference/getModelDetails.html deleted file mode 100644 index 6d2e4a0cc..000000000 --- a/docs/reference/getModelDetails.html +++ /dev/null @@ -1,215 +0,0 @@ - - - - - - - - -Get the predictive model details — getModelDetails • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

getModelDetails shows the full model, so showing the betas of all variables included in the -model, along with the variable names

- -
- -
getModelDetails(predictiveModel, plpData)
- -

Arguments

- - - - - - - - - - -
predictiveModel

An object of type predictiveModel as generated using he -fitPlp function.

plpData

An object of type plpData as generated using -getPlpData.

- -

Details

- -

Shows the coefficients and names of the covariates with non-zero coefficients.

- - -
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/getPlpData.html b/docs/reference/getPlpData.html index 9464df6ad..88c691ae6 100644 --- a/docs/reference/getPlpData.html +++ b/docs/reference/getPlpData.html @@ -1,68 +1,13 @@ - - - - - - - -Get the patient level prediction data from the server — getPlpData • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Get the patient level prediction data from the server — getPlpData • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -185,44 +133,55 @@

Get the patient level prediction data from the server

extract the data needed to perform the analysis.

-
getPlpData(databaseDetails, covariateSettings, restrictPlpDataSettings)
+
+
getPlpData(databaseDetails, covariateSettings, restrictPlpDataSettings)
+
+ +
+

Arguments

+
databaseDetails
+

The cdm database details created using createDatabaseDetails()

+ -

Arguments

- - - - - - - - - - - - - - -
databaseDetails

The cdm database details created using createDatabaseDetails()

covariateSettings

An object of type covariateSettings as created using the +

covariateSettings
+

An object of type covariateSettings as created using the createCovariateSettings function in the -FeatureExtraction package.

restrictPlpDataSettings

Extra settings to apply to the target population while extracting data. Created using createRestrictPlpDataSettings().

+FeatureExtraction package.

+ -

Value

+
restrictPlpDataSettings
+

Extra settings to apply to the target population while extracting data. Created using createRestrictPlpDataSettings().

-

Returns an object of type plpData, containing information on the cohorts, their +

+
+

Value

+ + +

Returns an object of type plpData, containing information on the cohorts, their outcomes, and baseline covariates. Information about multiple outcomes can be captured at once for -efficiency reasons. This object is a list with the following components:

-
outcomes

A data frame listing the outcomes per person, including the time to event, and +efficiency reasons. This object is a list with the following components:

outcomes
+

A data frame listing the outcomes per person, including the time to event, and the outcome id. Outcomes are not yet filtered based on risk window, since this is done at -a later stage.

cohorts

A data frame listing the persons in each cohort, listing their +a later stage.

+
cohorts
+

A data frame listing the persons in each cohort, listing their exposure status as well as the time to the end of the observation period and time to the end of the -cohort (usually the end of the exposure era).

covariates

An ffdf object listing the +cohort (usually the end of the exposure era).

+
covariates
+

An ffdf object listing the baseline covariates per person in the two cohorts. This is done using a sparse representation: -covariates with a value of 0 are omitted to save space.

covariateRef

An ffdf object describing the covariates that have been extracted.

-
metaData

A list of objects with information on how the cohortMethodData object was -constructed.

-

The generic () and summary() functions have been implemented for this object.

-

Details

- +covariates with a value of 0 are omitted to save space.

+
covariateRef
+

An ffdf object describing the covariates that have been extracted.

+ +
metaData
+

A list of objects with information on how the cohortMethodData object was +constructed.

+ +

The generic () and summary() functions have been implemented for this object.

+
+
+

Details

Based on the arguments, the at risk cohort data is retrieved, as well as outcomes occurring in these subjects. The at risk cohort is identified through user-defined cohorts in a cohort table either inside the CDM instance or in a separate schema. @@ -232,32 +191,29 @@

Details If you wish to exclude concepts from covariates you will need to manually add the concept_ids and descendants to the excludedCovariateConceptIds of the covariateSettings argument.

+

+
-
- +

- - + + diff --git a/docs/reference/getPlpTable.html b/docs/reference/getPlpTable.html deleted file mode 100644 index 92a51f7b7..000000000 --- a/docs/reference/getPlpTable.html +++ /dev/null @@ -1,259 +0,0 @@ - - - - - - - - -Create a dataframe with the summary details of the population cohort for publications — getPlpTable • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Create a dataframe with the summary details of the population cohort for publications

-
- -
getPlpTable(
-  cdmDatabaseSchema,
-  oracleTempSchema,
-  covariateSettings,
-  longTermStartDays = -365,
-  population,
-  connectionDetails,
-  cohortTable = "#temp_person"
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
cdmDatabaseSchema

The schema containing the OMOP CDM data

oracleTempSchema

The oracle schema if needed

covariateSettings

The covariateSettings if different from default

longTermStartDays

How far to look back when looking for the variables in the data

population

The population you want the summary table for

connectionDetails

The connection details used to connect to the CDM database

cohortTable

The name of the temp table that will store the popualtion cohort

- -

Details

- -

This function is used to create a summary table for population to be inserted into publications

- -

Examples

-
if (FALSE) { -getTable1 (plpData, population, connectionDetails) -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/getPredictionDistribution.html b/docs/reference/getPredictionDistribution.html index e2a3887d6..f6d9eac2f 100644 --- a/docs/reference/getPredictionDistribution.html +++ b/docs/reference/getPredictionDistribution.html @@ -1,67 +1,12 @@ - - - - - - - -Calculates the prediction distribution — getPredictionDistribution • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculates the prediction distribution — getPredictionDistribution • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,62 +131,61 @@

Calculates the prediction distribution

Calculates the prediction distribution

-
getPredictionDistribution(
-  prediction,
-  predictionType,
-  typeColumn = "evaluation"
-)
- -

Arguments

- - - - - - - - - - - - - - -
prediction

A prediction object

predictionType

The type of prediction (binary or survival)

typeColumn

A column that is used to stratify the results

- -

Value

- -

The 0.00, 0.1, 0.25, 0.5, 0.75, 0.9, 1.00 quantile pf the prediction, -the mean and standard deviation per class

-

Details

+
+
getPredictionDistribution(
+  prediction,
+  predictionType,
+  typeColumn = "evaluation"
+)
+
+ +
+

Arguments

+
prediction
+

A prediction object

+ + +
predictionType
+

The type of prediction (binary or survival)

+ + +
typeColumn
+

A column that is used to stratify the results

+
+
+

Value

+ + +

The 0.00, 0.1, 0.25, 0.5, 0.75, 0.9, 1.00 quantile pf the prediction, +the mean and standard deviation per class

+
+
+

Details

Calculates the quantiles from a predition object

+
+
-
- +
- - + + diff --git a/docs/reference/getPredictionDistribution_binary.html b/docs/reference/getPredictionDistribution_binary.html index 9d4ccbde5..25312a70d 100644 --- a/docs/reference/getPredictionDistribution_binary.html +++ b/docs/reference/getPredictionDistribution_binary.html @@ -1,67 +1,12 @@ - - - - - - - -Calculates the prediction distribution — getPredictionDistribution_binary • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculates the prediction distribution — getPredictionDistribution_binary • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,58 +131,57 @@

Calculates the prediction distribution

Calculates the prediction distribution

-
getPredictionDistribution_binary(prediction, evalColumn, ...)
- -

Arguments

- - - - - - - - - - - - - - -
prediction

A prediction object

evalColumn

A column that is used to stratify the results

...

Other inputs

- -

Value

- -

The 0.00, 0.1, 0.25, 0.5, 0.75, 0.9, 1.00 quantile pf the prediction, -the mean and standard deviation per class

-

Details

+
+
getPredictionDistribution_binary(prediction, evalColumn, ...)
+
+ +
+

Arguments

+
prediction
+

A prediction object

+ + +
evalColumn
+

A column that is used to stratify the results

+ + +
...
+

Other inputs

+
+
+

Value

+ + +

The 0.00, 0.1, 0.25, 0.5, 0.75, 0.9, 1.00 quantile pf the prediction, +the mean and standard deviation per class

+
+
+

Details

Calculates the quantiles from a predition object

+
+
-
- +
- - + + diff --git a/docs/reference/getThresholdSummary.html b/docs/reference/getThresholdSummary.html index 9a1f6cc64..743440c53 100644 --- a/docs/reference/getThresholdSummary.html +++ b/docs/reference/getThresholdSummary.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate all measures for sparse ROC — getThresholdSummary • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate all measures for sparse ROC — getThresholdSummary • PatientLevelPrediction + + - - - - -
-
- -
- -
+
@@ -183,58 +131,57 @@

Calculate all measures for sparse ROC

Calculate all measures for sparse ROC

-
getThresholdSummary(prediction, predictionType, typeColumn = "evaluation")
- -

Arguments

- - - - - - - - - - - - - - -
prediction

A prediction object

predictionType

The type of prediction (binary or survival)

typeColumn

A column that is used to stratify the results

- -

Value

- -

A data.frame with all the measures

-

Details

+
+
getThresholdSummary(prediction, predictionType, typeColumn = "evaluation")
+
+ +
+

Arguments

+
prediction
+

A prediction object

+ + +
predictionType
+

The type of prediction (binary or survival)

+ +
typeColumn
+

A column that is used to stratify the results

+ +
+
+

Value

+ + +

A data.frame with all the measures

+
+
+

Details

Calculates the TP, FP, TN, FN, TPR, FPR, accuracy, PPF, FOR and Fmeasure from a prediction object

+
+
-
- +
- - + + diff --git a/docs/reference/getThresholdSummary_binary.html b/docs/reference/getThresholdSummary_binary.html index b45c0a63c..dd31368b0 100644 --- a/docs/reference/getThresholdSummary_binary.html +++ b/docs/reference/getThresholdSummary_binary.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate all measures for sparse ROC when prediction is bianry classification — getThresholdSummary_binary • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate all measures for sparse ROC when prediction is bianry classification — getThresholdSummary_binary • PatientLevelPrediction + + - - - - -
-
- -
- -
+
@@ -183,58 +131,57 @@

Calculate all measures for sparse ROC when prediction is bianry classificati

Calculate all measures for sparse ROC when prediction is bianry classification

-
getThresholdSummary_binary(prediction, evalColumn, ...)
- -

Arguments

- - - - - - - - - - - - - - -
prediction

A prediction object

evalColumn

A column that is used to stratify the results

...

Other inputs

- -

Value

- -

A data.frame with all the measures

-

Details

+
+
getThresholdSummary_binary(prediction, evalColumn, ...)
+
+ +
+

Arguments

+
prediction
+

A prediction object

+ + +
evalColumn
+

A column that is used to stratify the results

+ +
...
+

Other inputs

+ +
+
+

Value

+ + +

A data.frame with all the measures

+
+
+

Details

Calculates the TP, FP, TN, FN, TPR, FPR, accuracy, PPF, FOR and Fmeasure from a prediction object

+
+
-
- +
- - + + diff --git a/docs/reference/grepCovariateNames.html b/docs/reference/grepCovariateNames.html deleted file mode 100644 index 8438e0454..000000000 --- a/docs/reference/grepCovariateNames.html +++ /dev/null @@ -1,223 +0,0 @@ - - - - - - - - -Extract covariate names — grepCovariateNames • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

Extracts covariate names using a regular-expression.

- -
- -
grepCovariateNames(pattern, object)
- -

Arguments

- - - - - - - - - - -
pattern

A regular expression with which to name covariate names

object

An R object of type plpData or covariateData.

- -

Value

- -

Returns a data.frame containing information about covariates that match a regular -expression. This data.frame has the following columns:

-
covariateId

Numerical identifier for use in model fitting using these covariates

-
covariateName

Text identifier

analysisId

Analysis identifier

conceptId

OMOP -common data model concept identifier, or 0

- - -

Details

- -

This function extracts covariate names that match a regular-expression for a -plpData or covariateData object.

- - -
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/ici.html b/docs/reference/ici.html index 449fcbe26..aefd353ec 100644 --- a/docs/reference/ici.html +++ b/docs/reference/ici.html @@ -1,70 +1,15 @@ - - - - - - - -Calculate the Integrated Calibration Information from Austin and Steyerberg -https://onlinelibrary.wiley.com/doi/full/10.1002/sim.8281 — ici • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the Integrated Calibration Information from Austin and Steyerberg +https://onlinelibrary.wiley.com/doi/full/10.1002/sim.8281 — ici • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -188,49 +136,48 @@

Calculate the Integrated Calibration Information from Austin and Steyerberg https://onlinelibrary.wiley.com/doi/full/10.1002/sim.8281

-
ici(prediction)
- -

Arguments

- - - - - - -
prediction

the prediction object found in the plpResult object

+
+
ici(prediction)
+
-

Value

+
+

Arguments

+
prediction
+

the prediction object found in the plpResult object

-

Integrated Calibration Information

-

Details

+
+
+

Value

+ +

Integrated Calibration Information

+
+
+

Details

Calculate the Integrated Calibration Information

+
+
-
- +

- - + + diff --git a/docs/reference/index.html b/docs/reference/index.html index 5c485e21f..652594c5b 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -1,66 +1,12 @@ - - - - - - - -Function reference • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Function reference • PatientLevelPrediction + + - - - - -
-
- -
- -
+
- - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
-

Extracting data from the OMOP CDM database

-

Functions for getting the necessary data from the database in Common Data Model and saving/loading.

+ - - - - - - - - - - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - - - + - - - - - - - - - - - + - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + - - - - - - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - - - - - - - - - - - - - + + - - - - - - - - - + + + + + + + + + + + + + + + + + - - - - - - - + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+

Extracting data from the OMOP CDM database

+

Functions for getting the necessary data from the database in Common Data Model and saving/loading.

+

createDatabaseDetails()

Create a setting that holds the details about the cdmDatabase connection for data extraction

+

createRestrictPlpDataSettings()

createRestrictPlpDataSettings define extra restriction settings when calling getPlpData

+

getPlpData()

Get the patient level prediction data from the server

+

savePlpData()

Save the cohort data to folder

+

loadPlpData()

Load the cohort data from a folder

-

Settings for designing a prediction models

-

Design settings required when developing a model.

+
+

getCohortCovariateData()

+

Extracts covariates based on cohorts

+

Settings for designing a prediction models

+

Design settings required when developing a model.

+

createStudyPopulationSettings()

create the study population settings

+

createDefaultSplitSetting()

Create the settings for defining how the plpData are split into test/validation/train sets using -default splitting functions (either random stratified by outcome, time or subject splitting).

+default splitting functions (either random stratified by outcome, time or subject splitting)

createSampleSettings()

Create the settings for defining how the trainData from splitData are sampled using default sample functions.

+

createFeatureEngineeringSettings()

Create the settings for defining any feature engineering that will be done

+

createPreprocessSettings()

Create the settings for preprocessing the trainData using .

-

Execution settings when developing a model

-

Execution settings required when developing a model.

+

Create the settings for preprocessing the trainData.

+

Optional design settings

+

Settings for optional steps that can be used in the PLP pipeline

+
+

createCohortCovariateSettings()

+

Extracts covariates based on cohorts

+

createRandomForestFeatureSelection()

+

Create the settings for random foreat based feature selection

+

createUnivariateFeatureSelection()

+

Create the settings for defining any feature selection that will be done

+

createSplineSettings()

+

Create the settings for adding a spline for continuous variables

+

createStratifiedImputationSettings()

+

Create the settings for adding a spline for continuous variables

+

External validation

+

+
+

createValidationDesign()

+

createValidationDesign - Define the validation design for external validation

+

validateExternal()

+

externalValidatePlp - Validate model performance on new data

+

createValidationSettings()

+

createValidationSettings define optional settings for performing external validation

+

recalibratePlp()

+

recalibratePlp

+

recalibratePlpRefit()

+

recalibratePlpRefit

+

Execution settings when developing a model

+

Execution settings required when developing a model.

+

createLogSettings()

Create the settings for logging the progression of the analysis

+

createExecuteSettings()

Creates list of settings specifying what parts of runPlp to execute

-

Binary Classification Models

-

Functions for setting binary classifiers and their hyper-parameter search.

+
+

createDefaultExecuteSettings()

+

Creates default list of settings specifying what parts of runPlp to execute

+

Binary Classification Models

+

Functions for setting binary classifiers and their hyper-parameter search.

+

setAdaBoost()

Create setting for AdaBoost with python DecisionTreeClassifier base estimator

+

setDecisionTree()

Create setting for the scikit-learn 1.0.1 DecisionTree with python

+

setGradientBoostingMachine()

Create setting for gradient boosting machine model using gbm_xgboost implementation

+

setKNN()

Create setting for knn model

+

setLassoLogisticRegression()

Create setting for lasso logistic regression

+

setMLP()

Create setting for neural network model with python

+

setNaiveBayes()

Create setting for naive bayes model with python

+

setRandomForest()

Create setting for random forest model with python (very fast)

+

setSVM()

Create setting for the python sklearn SVM (SVC function)

-

Survival Models

-

Functions for setting survival models and their hyper-parameter search.

+
+

setIterativeHardThresholding()

+

Create setting for lasso logistic regression

+

setLightGBM()

+

Create setting for gradient boosting machine model using lightGBM (https://github.com/microsoft/LightGBM/tree/master/R-package).

+

Survival Models

+

Functions for setting survival models and their hyper-parameter search.

+

setCoxModel()

Create setting for lasso Cox model

-

Single Patient-Level Prediction Model

-

Functions for training/evaluating/applying a single patient-level-prediction model

+
+

Single Patient-Level Prediction Model

+

Functions for training/evaluating/applying a single patient-level-prediction model

+

runPlp()

runPlp - Develop and internally evaluate a model using specified settings

+

externalValidateDbPlp()

externalValidateDbPlp - Validate a model on new databases

+

savePlpModel()

Saves the plp model

+

loadPlpModel()

loads the plp model

+

savePlpResult()

Saves the result from runPlp into the location directory

+

loadPlpResult()

Loads the evalaution dataframe

-

Multiple Patient-Level Prediction Models

-

Functions for training mutliple patient-level-prediction model in an efficient way.

+
+

diagnosePlp()

+

diagnostic - Investigates the prediction problem settings - use before training a model

+

Multiple Patient-Level Prediction Models

+

Functions for training mutliple patient-level-prediction model in an efficient way.

+

createModelDesign()

Specify settings for deceloping a single model

+

runMultiplePlp()

Run a list of predictions analyses

+

validateMultiplePlp()

externally validate the multiple plp models across new datasets

+

savePlpAnalysesJson()

Save the modelDesignList to a json file

+

loadPlpAnalysesJson()

Load the multiple prediction json settings from a file

-

Saving results into database

-

Functions for saving the prediction model and performances into a database.

+
+

diagnoseMultiplePlp()

+

Run a list of predictions diagnoses

+

Individual pipeline functions

+

Functions for running parts of the PLP workflow

+
+

createStudyPopulation()

+

Create a study population

+

splitData()

+

Split the plpData into test/train sets using a splitting settings of class splitSettings

+

preprocessData()

+

A function that wraps around FeatureExtraction::tidyCovariateData to normalise the data +and remove rare or redundant features

+

fitPlp()

+

fitPlp

+

predictPlp()

+

predictPlp

+

evaluatePlp()

+

evaluatePlp

+

covariateSummary()

+

covariateSummary

+

Saving results into database

+

Functions for saving the prediction model and performances into a database.

+
+

insertResultsToSqlite()

+

Create sqlite database with the results

createPlpResultTables()

Create the results tables to store PatientLevelPrediction models and results into a database

-

populatePlpResultTables()

+
+

addMultipleRunPlpToDatabase()

Populate the PatientLevelPrediction results tables

-

Shiny Viewers

-

Functions for viewing results via a shiny app

+
+

addRunPlpToDatabase()

+

Function to add the run plp (development or validation) to database

+

createDatabaseSchemaSettings()

+

Create the PatientLevelPrediction database result schema settings

+

createDatabaseList()

+

Create a list with the database details and database meta data entries

+

addDiagnosePlpToDatabase()

+

Insert a diagnostic result into a PLP result schema database

+

addMultipleDiagnosePlpToDatabase()

+

Insert mutliple diagnosePlp results saved to a directory into a PLP result schema database

+

extractDatabaseToCsv()

+

Exports all the results from a database into csv files

+

insertCsvToDatabase()

+

Function to insert results into a database from csvs

+

insertModelDesignInDatabase()

+

Insert a model design into a PLP result schema database

+

migrateDataModel()

+

Migrate Data model

+

Shiny Viewers

+

Functions for viewing results via a shiny app

+

viewPlp()

viewPlp - Interactively view the performance and model settings

+

viewMultiplePlp()

open a local shiny app for viewing the result of a multiple PLP analyses

+

viewDatabaseResultPlp()

open a local shiny app for viewing the result of a PLP analyses from a database

-

Plotting

-

Functions for various performance plots

+
+

Plotting

+

Functions for various performance plots

+

plotPlp()

Plot all the PatientLevelPrediction plots

+

plotSparseRoc()

Plot the ROC curve using the sparse thresholdSummary data frame

+

plotSmoothCalibration()

Plot the smooth calibration as detailed in Calster et al. "A calibration heirarchy for risk models was defined: from utopia to empirical data" (2016)

+

plotSparseCalibration()

Plot the calibration

+

plotSparseCalibration2()

Plot the conventional calibration

+

plotDemographicSummary()

Plot the Observed vs. expected incidence, by age and gender

+

plotF1Measure()

Plot the F1 measure efficiency frontier using the sparse thresholdSummary data frame

+

plotGeneralizability()

Plot the train/test generalizability diagnostic

+

plotPrecisionRecall()

Plot the precision-recall curve using the sparse thresholdSummary data frame

+

plotPredictedPDF()

Plot the Predicted probability density function, showing prediction overlap between true and false cases

+

plotPreferencePDF()

Plot the preference score probability density function, showing prediction overlap between true and false cases #'

+

plotPredictionDistribution()

Plot the side-by-side boxplots of prediction distribution, by class#'

+

plotVariableScatterplot()

Plot the variable importance scatterplot

-

Learning Curves

-

Functions for creating and plotting learning curves

+
+

outcomeSurvivalPlot()

+

Plot the outcome incidence over time

+

Learning Curves

+

Functions for creating and plotting learning curves

+

createLearningCurve()

createLearningCurve

+

plotLearningCurve()

plotLearningCurve

-

Simulation

-

Functions for simulating cohort method data objects.

+
+

Simulation

+

Functions for simulating cohort method data objects.

+

simulatePlpData()

Generate simulated data

- +
+

plpDataSimulationProfile

+

A simulation profile

+

Data manipulation functions

+

Functions for manipulating data

+
+

toSparseM()

+

Convert the plpData in COO format into a sparse R matrix

+

MapIds()

+

Map covariate and row Ids so they start from 1

+

Helper/utility functions

+

+
+

listAppend()

+

join two lists

+

listCartesian()

+

Cartesian product

+

createTempModelLoc()

+

Create a temporary model location

+

configurePython()

+

Sets up a virtual environment to use for PLP (can be conda or python)

+

setPythonEnvironment()

+

Use the virtual environment created using configurePython()

+

Evaluation measures

+

+
+

accuracy()

+

Calculate the accuracy

+

averagePrecision()

+

Calculate the average precision

+

brierScore()

+

brierScore

+

calibrationLine()

+

calibrationLine

+

computeAuc()

+

Compute the area under the ROC curve

+

f1Score()

+

Calculate the f1Score

+

falseDiscoveryRate()

+

Calculate the falseDiscoveryRate

+

falseNegativeRate()

+

Calculate the falseNegativeRate

+

falseOmissionRate()

+

Calculate the falseOmissionRate

+

falsePositiveRate()

+

Calculate the falsePositiveRate

+

ici()

+

Calculate the Integrated Calibration Information from Austin and Steyerberg +https://onlinelibrary.wiley.com/doi/full/10.1002/sim.8281

+

modelBasedConcordance()

+

Calculate the model-based concordance, which is a calculation of the expected discrimination performance of a model under the assumption the model predicts the "TRUE" outcome +as detailed in van Klaveren et al. https://pubmed.ncbi.nlm.nih.gov/27251001/

+

negativeLikelihoodRatio()

+

Calculate the negativeLikelihoodRatio

+

negativePredictiveValue()

+

Calculate the negativePredictiveValue

+

positiveLikelihoodRatio()

+

Calculate the positiveLikelihoodRatio

+

positivePredictiveValue()

+

Calculate the positivePredictiveValue

+

sensitivity()

+

Calculate the sensitivity

+

specificity()

+

Calculate the specificity

+

computeGridPerformance()

+

Computes grid performance with a specified performance function

+

diagnosticOddsRatio()

+

Calculate the diagnostic odds ratio

+

getCalibrationSummary()

+

Get a sparse summary of the calibration

+

getDemographicSummary()

+

Get a calibration per age/gender groups

+

getThresholdSummary()

+

Calculate all measures for sparse ROC

+

getThresholdSummary_binary()

+

Calculate all measures for sparse ROC when prediction is bianry classification

+

getPredictionDistribution()

+

Calculates the prediction distribution

+

getPredictionDistribution_binary()

+

Calculates the prediction distribution

+

Saving/loading models as json

+

Functions for saving or loading models as json

+
+

sklearnFromJson()

+

Loads sklearn python model from json

+

sklearnToJson()

+

Saves sklearn python model object to json in path

+

Load/save for sharing

+

Functions for loading/saving objects for sharing

+
+

savePlpShareable()

+

Save the plp result as json files and csv files for transparent sharing

+

loadPlpShareable()

+

Loads the plp result saved as json/csv files for transparent sharing

+

loadPrediction()

+

Loads the prediciton dataframe to csv

+

savePrediction()

+

Saves the prediction dataframe to RDS

+

Feature importance

+

+
+

pfi()

+

pfi

+

Other functions

+

+
+

predictCyclops()

+

Create predictive probabilities

+
-
- +
- - + + diff --git a/docs/reference/insertCsvToDatabase.html b/docs/reference/insertCsvToDatabase.html new file mode 100644 index 000000000..80fd8e6cf --- /dev/null +++ b/docs/reference/insertCsvToDatabase.html @@ -0,0 +1,200 @@ + +Function to insert results into a database from csvs — insertCsvToDatabase • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

This function converts a folder with csv results into plp objects and loads them into a plp result database

+
+ +
+
insertCsvToDatabase(
+  csvFolder,
+  connectionDetails,
+  databaseSchemaSettings,
+  modelSaveLocation,
+  csvTableAppend = ""
+)
+
+ +
+

Arguments

+
csvFolder
+

The location to the csv folder with the plp results

+ + +
connectionDetails
+

A connection details for the plp results database that the csv results will be inserted into

+ + +
databaseSchemaSettings
+

A object created by createDatabaseSchemaSettings with all the settings specifying the result tables to insert the csv results into

+ + +
modelSaveLocation
+

The location to save any models from the csv folder - this should be the same location you picked when inserting other models into the database

+ + +
csvTableAppend
+

A string that appends the csv file names

+ +
+
+

Value

+ + +

Returns a data.frame indicating whether the results were inported into the database

+
+
+

Details

+

The user needs to have plp csv results in a single folder and an existing plp result database

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/insertDbPopulation.html b/docs/reference/insertDbPopulation.html deleted file mode 100644 index c733a5724..000000000 --- a/docs/reference/insertDbPopulation.html +++ /dev/null @@ -1,245 +0,0 @@ - - - - - - - - -Insert a population into a database — insertDbPopulation • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

Insert a population into a database

- -
- -
insertDbPopulation(population, cohortIds = 1, connectionDetails,
-  cohortDatabaseSchema, cohortTable = "cohort", createTable = FALSE,
-  dropTableIfExists = TRUE, cdmVersion = "5")
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
population

Either an object of type plpData or a population object generated by functions -like createStudyPopulation.

cohortIds

The IDs to be used for the treated and comparator cohort, respectively.

connectionDetails

An R object of type
connectionDetails created using the -function createConnectionDetails in the -DatabaseConnector package.

cohortDatabaseSchema

The name of the database schema where the data will be written. -Requires write permissions to this database. On SQL -Server, this should specifiy both the database and the schema, -so for example 'cdm_instance.dbo'.

cohortTable

The name of the table in the database schema where the data will be written.

createTable

Should a new table be created? If not, the data will be inserted into an existing -table.

dropTableIfExists

If createTable = TRUE and the table already exists it will be overwritten.

cdmVersion

Define the OMOP CDM version used: currently support "4" and "5".

- -

Details

- -

Inserts a population table into a database. The table in the database will have the same structure as the -'cohort' table in the Common Data Model.

- - -
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/insertModelDesignInDatabase.html b/docs/reference/insertModelDesignInDatabase.html new file mode 100644 index 000000000..a69c6aa60 --- /dev/null +++ b/docs/reference/insertModelDesignInDatabase.html @@ -0,0 +1,197 @@ + +Insert a model design into a PLP result schema database — insertModelDesignInDatabase • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

This function inserts a model design and all the settings into the result schema

+
+ +
+
insertModelDesignInDatabase(
+  object,
+  conn,
+  databaseSchemaSettings,
+  cohortDefinitions
+)
+
+ +
+

Arguments

+
object
+

An object of class modelDesign, runPlp or externalValidatePlp

+ + +
conn
+

A connection to a database created by using the +function connect in the +DatabaseConnector package.

+ + +
databaseSchemaSettings
+

A object created by createDatabaseSchemaSettings with all the settings specifying the result tables

+ + +
cohortDefinitions
+

A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet()

+ +
+
+

Value

+ + +

Returns NULL but uploads the model design into the database schema specified in databaseSchemaSettings

+
+
+

Details

+

This function can be used to upload a model design into a database

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/insertResultsToSqlite.html b/docs/reference/insertResultsToSqlite.html new file mode 100644 index 000000000..1fe355a82 --- /dev/null +++ b/docs/reference/insertResultsToSqlite.html @@ -0,0 +1,195 @@ + +Create sqlite database with the results — insertResultsToSqlite • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

This function create an sqlite database with the PLP result schema and inserts all results

+
+ +
+
insertResultsToSqlite(
+  resultLocation,
+  cohortDefinitions,
+  databaseList = NULL,
+  sqliteLocation = file.path(resultLocation, "sqlite")
+)
+
+ +
+

Arguments

+
resultLocation
+

(string) location of directory where the main package results were saved

+ + +
cohortDefinitions
+

A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet()

+ + +
databaseList
+

A list created by createDatabaseList to specify the databases

+ + +
sqliteLocation
+

(string) location of directory where the sqlite database will be saved

+ +
+
+

Value

+ + +

Returns the location of the sqlite database file

+
+
+

Details

+

This function can be used upload PatientLevelPrediction results into an sqlite database

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/interpretInstallCode.html b/docs/reference/interpretInstallCode.html deleted file mode 100644 index a8fcd6e16..000000000 --- a/docs/reference/interpretInstallCode.html +++ /dev/null @@ -1,222 +0,0 @@ - - - - - - - - -Tells you the package issue — interpretInstallCode • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Tells you the package issue

-
- -
interpretInstallCode(response)
- -

Arguments

- - - - - - -
response

The response code from checkPlpInstallation()

- -

Details

- -

This function prints any issues found during the checkPlpInstallation() call

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/launchDiagnosticsExplorer.html b/docs/reference/launchDiagnosticsExplorer.html deleted file mode 100644 index 17135c991..000000000 --- a/docs/reference/launchDiagnosticsExplorer.html +++ /dev/null @@ -1,235 +0,0 @@ - - - - - - - - -Launch the Diagnostics Explorer Shiny app — launchDiagnosticsExplorer • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Launch the Diagnostics Explorer Shiny app

-
- -
launchDiagnosticsExplorer(dataFolder, launch.browser = FALSE)
- -

Arguments

- - - - - - - - - - -
dataFolder

A folder where the exported zip files with the results are stored. -Zip files containing results from multiple databases can be placed in the same -folder.

launch.browser

Should the app be launched in your default browser, or in a Shiny window. -Note: copying to clipboard will not work in a Shiny window.

- -

Details

- -

Launches a Shiny app that allows the user to explore the diagnostics

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/listAppend.html b/docs/reference/listAppend.html index d98a5a463..2f370ad63 100644 --- a/docs/reference/listAppend.html +++ b/docs/reference/listAppend.html @@ -1,67 +1,12 @@ - - - - - - - -join two lists — listAppend • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -join two lists — listAppend • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,50 +131,46 @@

join two lists

join two lists

-
listAppend(a, b)
+
+
listAppend(a, b)
+
+ +
+

Arguments

+
a
+

A list

-

Arguments

- - - - - - - - - - -
a

A list

b

Another list

-

Details

+
b
+

Another list

+
+
+

Details

This function joins two lists

+
+
-
- +
- - + + diff --git a/docs/reference/listCartesian.html b/docs/reference/listCartesian.html new file mode 100644 index 000000000..5c5c573b7 --- /dev/null +++ b/docs/reference/listCartesian.html @@ -0,0 +1,174 @@ + +Cartesian product — listCartesian • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

Computes the Cartesian product of all the combinations of elements in a list

+
+ +
+
listCartesian(allList)
+
+ +
+

Arguments

+
allList
+

a list of lists

+ +
+
+

Value

+ + +

A list with all possible combinations from the input list of lists

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/loadEnsemblePlpModel.html b/docs/reference/loadEnsemblePlpModel.html deleted file mode 100644 index 117c7a11c..000000000 --- a/docs/reference/loadEnsemblePlpModel.html +++ /dev/null @@ -1,222 +0,0 @@ - - - - - - - - -loads the Ensmeble plp model and return a model list — loadEnsemblePlpModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

loads the Ensmeble plp model and return a model list

-
- -
loadEnsemblePlpModel(dirPath)
- -

Arguments

- - - - - - -
dirPath

The location of the model

- -

Details

- -

Loads a plp model list that was saved using savePlpModel()

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/loadEnsemblePlpResult.html b/docs/reference/loadEnsemblePlpResult.html deleted file mode 100644 index 231ea5eb7..000000000 --- a/docs/reference/loadEnsemblePlpResult.html +++ /dev/null @@ -1,222 +0,0 @@ - - - - - - - - -loads the Ensemble plp results — loadEnsemblePlpResult • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

loads the Ensemble plp results

-
- -
loadEnsemblePlpResult(dirPath)
- -

Arguments

- - - - - - -
dirPath

The location of the model

- -

Details

- -

Loads a plp model list that was saved using saveEnsemblePlpResults()

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/loadPlpAnalysesJson.html b/docs/reference/loadPlpAnalysesJson.html index 513a9a062..4d1a0a0cc 100644 --- a/docs/reference/loadPlpAnalysesJson.html +++ b/docs/reference/loadPlpAnalysesJson.html @@ -1,67 +1,12 @@ - - - - - - - -Load the multiple prediction json settings from a file — loadPlpAnalysesJson • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Load the multiple prediction json settings from a file — loadPlpAnalysesJson • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -183,53 +131,51 @@

Load the multiple prediction json settings from a file

Load the multiple prediction json settings from a file

-
loadPlpAnalysesJson(jsonFileLocation)
- -

Arguments

- - - - - - -
jsonFileLocation

The location of the file 'predictionAnalysisList.json' with the modelDesignList

+
+
loadPlpAnalysesJson(jsonFileLocation)
+
-

Details

+
+

Arguments

+
jsonFileLocation
+

The location of the file 'predictionAnalysisList.json' with the modelDesignList

+
+
+

Details

This function interprets a json with the multiple prediction settings and creates a list that can be combined with connection settings to run a multiple prediction study

+
-

Examples

-
if (FALSE) { -modelDesignList <- loadPlpAnalysesJson('location of json settings')$analysis -} - -
+
+

Examples

+
if (FALSE) {
+modelDesignList <- loadPlpAnalysesJson('location of json settings')$analysis
+}
+
+
+
+
-
- +
- - + + diff --git a/docs/reference/loadPlpData.html b/docs/reference/loadPlpData.html index a8da4e124..ab34cac18 100644 --- a/docs/reference/loadPlpData.html +++ b/docs/reference/loadPlpData.html @@ -1,68 +1,13 @@ - - - - - - - -Load the cohort data from a folder — loadPlpData • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Load the cohort data from a folder — loadPlpData • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -185,57 +133,59 @@

Load the cohort data from a folder

system.

-
loadPlpData(file, readOnly = TRUE)
+
+
loadPlpData(file, readOnly = TRUE)
+
-

Arguments

- - - - - - - - - - -
file

The name of the folder containing the data.

readOnly

If true, the data is opened read only.

+
+

Arguments

+
file
+

The name of the folder containing the data.

-

Value

-

An object of class plpData.

-

Details

+
readOnly
+

If true, the data is opened read only.

-

The data will be written to a set of files in the folder specified by the user.

+
+
+

Value

+ -

Examples

-
# todo +

An object of class plpData.

+
+
+

Details

+

The data will be written to a set of files in the folder specified by the user.

+
-
+
+

Examples

+
# todo
+
+
+
+
+
-
- +
- - + + diff --git a/docs/reference/loadPlpFromCsv.html b/docs/reference/loadPlpFromCsv.html deleted file mode 100644 index c51564912..000000000 --- a/docs/reference/loadPlpFromCsv.html +++ /dev/null @@ -1,222 +0,0 @@ - - - - - - - - -Loads parts of the plp result saved as csv files for transparent sharing — loadPlpFromCsv • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Loads parts of the plp result saved as csv files for transparent sharing

-
- -
loadPlpFromCsv(dirPath)
- -

Arguments

- - - - - - -
dirPath

The directory with the results as csv files

- -

Details

- -

Load the main results from csv files into a runPlp object

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/loadPlpModel.html b/docs/reference/loadPlpModel.html index e78445f92..3248a457e 100644 --- a/docs/reference/loadPlpModel.html +++ b/docs/reference/loadPlpModel.html @@ -1,67 +1,12 @@ - - - - - - - -loads the plp model — loadPlpModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -loads the plp model — loadPlpModel • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,46 +131,42 @@

loads the plp model

loads the plp model

-
loadPlpModel(dirPath)
- -

Arguments

- - - - - - -
dirPath

The location of the model

+
+
loadPlpModel(dirPath)
+
-

Details

+
+

Arguments

+
dirPath
+

The location of the model

-

Loads a plp model that was saved using savePlpModel()

+
+
+

Details

+

Loads a plp model that was saved using savePlpModel()

+
+
-
- +
- - + + diff --git a/docs/reference/loadPlpResult.html b/docs/reference/loadPlpResult.html index 961c710ec..25327ba94 100644 --- a/docs/reference/loadPlpResult.html +++ b/docs/reference/loadPlpResult.html @@ -1,67 +1,12 @@ - - - - - - - -Loads the evalaution dataframe — loadPlpResult • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Loads the evalaution dataframe — loadPlpResult • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,46 +131,42 @@

Loads the evalaution dataframe

Loads the evalaution dataframe

-
loadPlpResult(dirPath)
- -

Arguments

- - - - - - -
dirPath

The directory where the evaluation was saved

+
+
loadPlpResult(dirPath)
+
-

Details

+
+

Arguments

+
dirPath
+

The directory where the evaluation was saved

+
+
+

Details

Loads the evaluation

+
+
-
- +
- - + + diff --git a/docs/reference/loadPlpShareable.html b/docs/reference/loadPlpShareable.html index 774b07d56..a69479f3d 100644 --- a/docs/reference/loadPlpShareable.html +++ b/docs/reference/loadPlpShareable.html @@ -1,67 +1,12 @@ - - - - - - - -Loads the plp result saved as json/csv files for transparent sharing — loadPlpShareable • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Loads the plp result saved as json/csv files for transparent sharing — loadPlpShareable • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,46 +131,42 @@

Loads the plp result saved as json/csv files for transparent sharing

Loads the plp result saved as json/csv files for transparent sharing

-
loadPlpShareable(loadDirectory)
- -

Arguments

- - - - - - -
loadDirectory

The directory with the results as json/csv files

+
+
loadPlpShareable(loadDirectory)
+
-

Details

+
+

Arguments

+
loadDirectory
+

The directory with the results as json/csv files

+
+
+

Details

Load the main results from json/csv files into a runPlp object

+
+
-
- +
- - + + diff --git a/docs/reference/loadPrediction.html b/docs/reference/loadPrediction.html index f2ec47d45..a4715f9c5 100644 --- a/docs/reference/loadPrediction.html +++ b/docs/reference/loadPrediction.html @@ -1,67 +1,12 @@ - - - - - - - -Loads the prediciton dataframe to csv — loadPrediction • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Loads the prediciton dataframe to csv — loadPrediction • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,46 +131,42 @@

Loads the prediciton dataframe to csv

Loads the prediciton dataframe to csv

-
loadPrediction(fileLocation)
- -

Arguments

- - - - - - -
fileLocation

The location with the saved prediction

+
+
loadPrediction(fileLocation)
+
-

Details

+
+

Arguments

+
fileLocation
+

The location with the saved prediction

+
+
+

Details

Loads the prediciton RDS file

+
+
-
- +

- - + + diff --git a/docs/reference/loadPredictionAnalysisList.html b/docs/reference/loadPredictionAnalysisList.html deleted file mode 100644 index 0ae126bb7..000000000 --- a/docs/reference/loadPredictionAnalysisList.html +++ /dev/null @@ -1,240 +0,0 @@ - - - - - - - - -Load the multiple prediction json settings from a file — loadPredictionAnalysisList • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Load the multiple prediction json settings from a file

-
- -
loadPredictionAnalysisList(predictionAnalysisListFile)
- -

Arguments

- - - - - - -
predictionAnalysisListFile

The prediciton specification json extracted from atlas.

- -

Details

- -

This function interprets a json with the multiple prediction settings and creates a list -that can be combined with connection settings to run a multiple prediction study

- -

Examples

-
if (FALSE) { -predictionAnalysisList <- loadPredictionAnalysisList('./predictionStudyAnalyses.json') -predictionAnalysisList$connectionDetails = connectionDetails -predictionAnalysisList$cdmDatabaseSchema = cdmDatabaseSchema -predictionAnalysisList$cdmDatabaseName = cdmDatabaseName -predictionAnalysisList$oracleTempSchema = oracleTempSchema -predictionAnalysisList$cohortDatabaseSchema = cohortDatabaseSchema -predictionAnalysisList$cohortTable = cohortTable -predictionAnalysisList$outcomeDatabaseSchema = outcomeDatabaseSchema -predictionAnalysisList$outcomeTable = outcomeTable -predictionAnalysisList$cdmVersion = cdmVersion -predictionAnalysisList$outputFolder = outputFolder -result <- do.call(runPlpAnalyses, predictionAnalysisList) -} - -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/migrateDataModel.html b/docs/reference/migrateDataModel.html new file mode 100644 index 000000000..5810fedca --- /dev/null +++ b/docs/reference/migrateDataModel.html @@ -0,0 +1,180 @@ + +Migrate Data model — migrateDataModel • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

Migrate data from current state to next state

+

It is strongly advised that you have a backup of all data (either sqlite files, a backup database (in the case you +are using a postgres backend) or have kept the csv/zip files from your data generation.

+
+ +
+
migrateDataModel(connectionDetails, databaseSchema, tablePrefix = "")
+
+ +
+

Arguments

+
connectionDetails
+

DatabaseConnector connection details object

+ + +
databaseSchema
+

String schema where database schema lives

+ + +
tablePrefix
+

(Optional) Use if a table prefix is used before table names (e.g. "cd_")

+ +
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/modelBasedConcordance.html b/docs/reference/modelBasedConcordance.html index 2665220c4..b18d502df 100644 --- a/docs/reference/modelBasedConcordance.html +++ b/docs/reference/modelBasedConcordance.html @@ -1,70 +1,14 @@ - - - - - - - -Calculate the model-based concordance, which is a calculation of the expected discrimination performance of a model under the assumption the model predicts the "TRUE" outcome -as detailed in van Klaveren et al. https://pubmed.ncbi.nlm.nih.gov/27251001/ — modelBasedConcordance • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the model-based concordance, which is a calculation of the expected discrimination performance of a model under the assumption the model predicts the "TRUE" outcome +as detailed in van Klaveren et al. https://pubmed.ncbi.nlm.nih.gov/27251001/ — modelBasedConcordance • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -188,49 +135,48 @@

Calculate the model-based concordance, which is a calculation of the expecte as detailed in van Klaveren et al. https://pubmed.ncbi.nlm.nih.gov/27251001/

-
modelBasedConcordance(prediction)
- -

Arguments

- - - - - - -
prediction

the prediction object found in the plpResult object

+
+
modelBasedConcordance(prediction)
+
-

Value

+
+

Arguments

+
prediction
+

the prediction object found in the plpResult object

-

model-based concordance value

-

Details

+
+
+

Value

+ +

model-based concordance value

+
+
+

Details

Calculate the model-based concordance

+
+
-
- +
- - + + diff --git a/docs/reference/negativeLikelihoodRatio.html b/docs/reference/negativeLikelihoodRatio.html index 9082c2e50..a1fa5b4f2 100644 --- a/docs/reference/negativeLikelihoodRatio.html +++ b/docs/reference/negativeLikelihoodRatio.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate the negativeLikelihoodRatio — negativeLikelihoodRatio • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the negativeLikelihoodRatio — negativeLikelihoodRatio • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,61 +131,60 @@

Calculate the negativeLikelihoodRatio

Calculate the negativeLikelihoodRatio

-
negativeLikelihoodRatio(TP, TN, FN, FP)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
TP

Number of true positives

TN

Number of true negatives

FN

Number of false negatives

FP

Number of false positives

- -

Value

- -

negativeLikelihoodRatio value

-

Details

+
+
negativeLikelihoodRatio(TP, TN, FN, FP)
+
+
+

Arguments

+
TP
+

Number of true positives

+ + +
TN
+

Number of true negatives

+ + +
FN
+

Number of false negatives

+ + +
FP
+

Number of false positives

+ +
+
+

Value

+ + +

negativeLikelihoodRatio value

+
+
+

Details

Calculate the negativeLikelihoodRatio

+
+
-
- +
- - + + diff --git a/docs/reference/negativePredictiveValue.html b/docs/reference/negativePredictiveValue.html index 5fb3da3af..816870bb3 100644 --- a/docs/reference/negativePredictiveValue.html +++ b/docs/reference/negativePredictiveValue.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate the negativePredictiveValue — negativePredictiveValue • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the negativePredictiveValue — negativePredictiveValue • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,61 +131,60 @@

Calculate the negativePredictiveValue

Calculate the negativePredictiveValue

-
negativePredictiveValue(TP, TN, FN, FP)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
TP

Number of true positives

TN

Number of true negatives

FN

Number of false negatives

FP

Number of false positives

- -

Value

- -

negativePredictiveValue value

-

Details

+
+
negativePredictiveValue(TP, TN, FN, FP)
+
+
+

Arguments

+
TP
+

Number of true positives

+ + +
TN
+

Number of true negatives

+ + +
FN
+

Number of false negatives

+ + +
FP
+

Number of false positives

+ +
+
+

Value

+ + +

negativePredictiveValue value

+
+
+

Details

Calculate the negativePredictiveValue

+
+
-
- +
- - + + diff --git a/docs/reference/outcomeSurvivalPlot.html b/docs/reference/outcomeSurvivalPlot.html index c3fa6bfb5..fc364c7e5 100644 --- a/docs/reference/outcomeSurvivalPlot.html +++ b/docs/reference/outcomeSurvivalPlot.html @@ -1,67 +1,12 @@ - - - - - - - -Plot the outcome incidence over time — outcomeSurvivalPlot • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Plot the outcome incidence over time — outcomeSurvivalPlot • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,79 +131,78 @@

Plot the outcome incidence over time

Plot the outcome incidence over time

-
outcomeSurvivalPlot(
-  plpData,
-  outcomeId,
-  populationSettings = createStudyPopulationSettings(binary = T, includeAllOutcomes =
-    T, firstExposureOnly = FALSE, washoutPeriod = 0, removeSubjectsWithPriorOutcome =
-    TRUE, priorOutcomeLookback = 99999, requireTimeAtRisk = F, riskWindowStart = 1,
-    startAnchor = "cohort start", riskWindowEnd = 3650, endAnchor = "cohort start"),
-  riskTable = T,
-  confInt = T,
-  yLabel = "Fraction of those who are outcome free in target population"
-)
+
+
outcomeSurvivalPlot(
+  plpData,
+  outcomeId,
+  populationSettings = createStudyPopulationSettings(binary = T, includeAllOutcomes = T,
+    firstExposureOnly = FALSE, washoutPeriod = 0, removeSubjectsWithPriorOutcome = TRUE,
+    priorOutcomeLookback = 99999, requireTimeAtRisk = F, riskWindowStart = 1, startAnchor
+    = "cohort start", riskWindowEnd = 3650, endAnchor = "cohort start"),
+  riskTable = T,
+  confInt = T,
+  yLabel = "Fraction of those who are outcome free in target population"
+)
+
+ +
+

Arguments

+
plpData
+

The plpData object returned by running getPlpData()

+ + +
outcomeId
+

The cohort id corresponding to the outcome

+ + +
populationSettings
+

The population settings created using createStudyPopulationSettings

+ + +
riskTable
+

(binary) Whether to include a table at the bottom of the plot showing the number of people at risk over time

-

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - -
plpData

The plpData object returned by running getPlpData()

outcomeId

The cohort id corresponding to the outcome

populationSettings

The population settings created using createStudyPopulationSettings

riskTable

(binary) Whether to include a table at the bottom of the plot showing the number of people at risk over time

confInt

(binary) Whether to include a confidence interval

yLabel

(string) The label for the y-axis

-

Value

+
confInt
+

(binary) Whether to include a confidence interval

-

TRUE if it ran

-

Details

+
yLabel
+

(string) The label for the y-axis

+ +
+
+

Value

+ + +

TRUE if it ran

+
+
+

Details

This creates a survival plot that can be used to pick a suitable time-at-risk period

+
+
-
- +
- - + + diff --git a/docs/reference/personSplitter.html b/docs/reference/personSplitter.html deleted file mode 100644 index 8b337e3dc..000000000 --- a/docs/reference/personSplitter.html +++ /dev/null @@ -1,244 +0,0 @@ - - - - - - - - -Split data into random subsets stratified by class — personSplitter • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Split data into random subsets stratified by class

-
- -
personSplitter(population, test = 0.3, train = NULL, nfold = 3, seed = NULL)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
population

An object created using createStudyPopulation().

test

A real number between 0 and 1 indicating the test set fraction of the data

train

A real number between 0 and 1 indicating the train set fraction of the data. -If not set train is equal to 1 - test

nfold

An integer >= 1 specifying the number of folds used in cross validation

seed

If set a fixed seed is used, otherwise a random split is performed

- -

Value

- -

A dataframe containing the columns: rowId and index

-

Details

- -

Returns a dataframe of rowIds and indexes with a -1 index indicating the rowId belongs to the test -set and a positive integer index value indicating the rowId's cross valiation fold within the train -set.

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/pfi.html b/docs/reference/pfi.html index 61a9d5dc6..8a039c50a 100644 --- a/docs/reference/pfi.html +++ b/docs/reference/pfi.html @@ -1,67 +1,12 @@ - - - - - - - -pfi — pfi • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -pfi — pfi • PatientLevelPrediction + + - - - - -
-
- -
- -
+
@@ -183,87 +131,86 @@

pfi

Calculate the permutation feature importance for a PLP model.

-
pfi(
-  plpResult,
-  population,
-  plpData,
-  repeats = 1,
-  covariates = NULL,
-  cores = NULL,
-  log = NULL,
-  logthreshold = "INFO"
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
plpResult

An object of type runPlp

population

The population created using createStudyPopulation() who will have their risks predicted

plpData

An object of type plpData - the patient level prediction -data extracted from the CDM.

repeats

The number of times to permute each covariate

covariates

A vector of covariates to calculate the pfi for. If NULL it uses all covariates included in the model.

cores

Number of cores to use when running this (it runs in parallel)

log

A location to save the log for running pfi

logthreshold

The log threshold (e.g., INFO, TRACE, ...)

- -

Value

- -

A dataframe with the covariateIds and the pfi (change in AUC caused by permuting the covariate) value

-

Details

+
+
pfi(
+  plpResult,
+  population,
+  plpData,
+  repeats = 1,
+  covariates = NULL,
+  cores = NULL,
+  log = NULL,
+  logthreshold = "INFO"
+)
+
+ +
+

Arguments

+
plpResult
+

An object of type runPlp

+ + +
population
+

The population created using createStudyPopulation() who will have their risks predicted

+ + +
plpData
+

An object of type plpData - the patient level prediction +data extracted from the CDM.

+ +
repeats
+

The number of times to permute each covariate

+ + +
covariates
+

A vector of covariates to calculate the pfi for. If NULL it uses all covariates included in the model.

+ + +
cores
+

Number of cores to use when running this (it runs in parallel)

+ + +
log
+

A location to save the log for running pfi

+ + +
logthreshold
+

The log threshold (e.g., INFO, TRACE, ...)

+ +
+
+

Value

+ + +

A dataframe with the covariateIds and the pfi (change in AUC caused by permuting the covariate) value

+
+
+

Details

The function permutes the each covariate/features <repeats> times and calculates the mean AUC change caused by the permutation.

+
+
-
- +
- - + + diff --git a/docs/reference/plotDemographicSummary.html b/docs/reference/plotDemographicSummary.html index 08466abab..8756040c8 100644 --- a/docs/reference/plotDemographicSummary.html +++ b/docs/reference/plotDemographicSummary.html @@ -1,67 +1,12 @@ - - - - - - - -Plot the Observed vs. expected incidence, by age and gender — plotDemographicSummary • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Plot the Observed vs. expected incidence, by age and gender — plotDemographicSummary • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,70 +131,69 @@

Plot the Observed vs. expected incidence, by age and gender

Plot the Observed vs. expected incidence, by age and gender

-
plotDemographicSummary(
-  plpResult,
-  typeColumn = "evaluation",
-  saveLocation = NULL,
-  fileName = "roc.png"
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
plpResult

A plp result object as generated using the runPlp function.

typeColumn

The name of the column specifying the evaluation type

saveLocation

Directory to save plot (if NULL plot is not saved)

fileName

Name of the file to save to plot, for example +

+
plotDemographicSummary(
+  plpResult,
+  typeColumn = "evaluation",
+  saveLocation = NULL,
+  fileName = "roc.png"
+)
+
+ +
+

Arguments

+
plpResult
+

A plp result object as generated using the runPlp function.

+ + +
typeColumn
+

The name of the column specifying the evaluation type

+ + +
saveLocation
+

Directory to save plot (if NULL plot is not saved)

+ + +
fileName
+

Name of the file to save to plot, for example 'plot.png'. See the function ggsave in the ggplot2 package for -supported file formats.

+supported file formats.

-

Value

+
+
+

Value

+ -

A ggplot object. Use the ggsave function to save to file in a different +

A ggplot object. Use the ggsave function to save to file in a different format.

-

Details

- +
+
+

Details

Create a plot showing the Observed vs. expected incidence, by age and gender #'

+
+

-
- +
- - + + diff --git a/docs/reference/plotF1Measure.html b/docs/reference/plotF1Measure.html index bbd38b250..aec0a7eae 100644 --- a/docs/reference/plotF1Measure.html +++ b/docs/reference/plotF1Measure.html @@ -1,67 +1,12 @@ - - - - - - - -Plot the F1 measure efficiency frontier using the sparse thresholdSummary data frame — plotF1Measure • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Plot the F1 measure efficiency frontier using the sparse thresholdSummary data frame — plotF1Measure • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,69 +131,68 @@

Plot the F1 measure efficiency frontier using the sparse thresholdSummary da

Plot the F1 measure efficiency frontier using the sparse thresholdSummary data frame

-
plotF1Measure(
-  plpResult,
-  typeColumn = "evaluation",
-  saveLocation = NULL,
-  fileName = "roc.png"
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
plpResult

A plp result object as generated using the runPlp function.

typeColumn

The name of the column specifying the evaluation type

saveLocation

Directory to save plot (if NULL plot is not saved)

fileName

Name of the file to save to plot, for example +

+
plotF1Measure(
+  plpResult,
+  typeColumn = "evaluation",
+  saveLocation = NULL,
+  fileName = "roc.png"
+)
+
+ +
+

Arguments

+
plpResult
+

A plp result object as generated using the runPlp function.

+ + +
typeColumn
+

The name of the column specifying the evaluation type

+ + +
saveLocation
+

Directory to save plot (if NULL plot is not saved)

+ + +
fileName
+

Name of the file to save to plot, for example 'plot.png'. See the function ggsave in the ggplot2 package for -supported file formats.

+supported file formats.

-

Value

+
+
+

Value

+ -

A ggplot object. Use the ggsave function to save to file in a different +

A ggplot object. Use the ggsave function to save to file in a different format.

-

Details

- +
+
+

Details

Create a plot showing the F1 measure efficiency frontier using the sparse thresholdSummary data frame

+
+
-
- +
- - + + diff --git a/docs/reference/plotGeneralizability.html b/docs/reference/plotGeneralizability.html index 806b1e2da..f91a77613 100644 --- a/docs/reference/plotGeneralizability.html +++ b/docs/reference/plotGeneralizability.html @@ -1,67 +1,12 @@ - - - - - - - -Plot the train/test generalizability diagnostic — plotGeneralizability • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Plot the train/test generalizability diagnostic — plotGeneralizability • PatientLevelPrediction + + - - - - -
-
- -
- -
+
@@ -183,66 +131,65 @@

Plot the train/test generalizability diagnostic

Plot the train/test generalizability diagnostic

-
plotGeneralizability(
-  covariateSummary,
-  saveLocation = NULL,
-  fileName = "Generalizability.png"
-)
- -

Arguments

- - - - - - - - - - - - - - -
covariateSummary

A prediction object as generated using the -runPlp function.

saveLocation

Directory to save plot (if NULL plot is not saved)

fileName

Name of the file to save to plot, for example +

+
plotGeneralizability(
+  covariateSummary,
+  saveLocation = NULL,
+  fileName = "Generalizability.png"
+)
+
+ +
+

Arguments

+
covariateSummary
+

A prediction object as generated using the +runPlp function.

+ + +
saveLocation
+

Directory to save plot (if NULL plot is not saved)

+ + +
fileName
+

Name of the file to save to plot, for example 'plot.png'. See the function ggsave in the ggplot2 package for -supported file formats.

+supported file formats.

-

Value

+
+
+

Value

+ -

A ggplot object. Use the ggsave function to save to file in a different +

A ggplot object. Use the ggsave function to save to file in a different format.

-

Details

- +
+
+

Details

Create a plot showing the train/test generalizability diagnostic #'

+
+
-
- +
- - + + diff --git a/docs/reference/plotLearningCurve.html b/docs/reference/plotLearningCurve.html index a3a514041..803ab6e4f 100644 --- a/docs/reference/plotLearningCurve.html +++ b/docs/reference/plotLearningCurve.html @@ -1,68 +1,13 @@ - - - - - - - -plotLearningCurve — plotLearningCurve • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -plotLearningCurve — plotLearningCurve • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -185,96 +133,94 @@

plotLearningCurve

from createLearningCurve.

-
plotLearningCurve(
-  learningCurve,
-  metric = "AUROC",
-  abscissa = "events",
-  plotTitle = "Learning Curve",
-  plotSubtitle = NULL,
-  fileName = NULL
-)
+
+
plotLearningCurve(
+  learningCurve,
+  metric = "AUROC",
+  abscissa = "events",
+  plotTitle = "Learning Curve",
+  plotSubtitle = NULL,
+  fileName = NULL
+)
+
+ +
+

Arguments

+
learningCurve
+

An object returned by createLearningCurve +function.

+ -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - -
learningCurve

An object returned by createLearningCurve -function.

metric

Specifies the metric to be plotted:

    -
  • 'AUROC' - use the area under the Receiver Operating +

    metric
    +

    Specifies the metric to be plotted:

    • 'AUROC' - use the area under the Receiver Operating Characteristic curve

    • 'AUPRC' - use the area under the Precision-Recall curve

    • 'sBrier' - use the scaled Brier score

    • -
abscissa

Specify the abscissa metric to be plotted:

    -
  • 'events' - use number of events

  • +
+ + +
abscissa
+

Specify the abscissa metric to be plotted:

  • 'events' - use number of events

  • 'observations' - use number of observations

  • -
plotTitle

Title of the learning curve plot.

plotSubtitle

Subtitle of the learning curve plot.

fileName

Filename of plot to be saved, for example 'plot.png'. + + + +

plotTitle
+

Title of the learning curve plot.

+ + +
plotSubtitle
+

Subtitle of the learning curve plot.

+ + +
fileName
+

Filename of plot to be saved, for example 'plot.png'. See the function ggsave in the ggplot2 package for supported file -formats.

+formats.

-

Value

+
+
+

Value

+ -

A ggplot object. Use the ggsave function to save to +

A ggplot object. Use the ggsave function to save to file in a different format.

+
-

Examples

-
if (FALSE) { -# create learning curve object -learningCurve <- createLearningCurve(population, - plpData, - modelSettings) -# plot the learning curve -plotLearningCurve(learningCurve) -} - -
+
+

Examples

+
if (FALSE) {
+# create learning curve object
+learningCurve <- createLearningCurve(population,
+                                     plpData,
+                                     modelSettings)
+# plot the learning curve
+plotLearningCurve(learningCurve)
+}
+
+
+
+
-
- +
- - + + diff --git a/docs/reference/plotPlp.html b/docs/reference/plotPlp.html index 199ff8253..0ae53ad83 100644 --- a/docs/reference/plotPlp.html +++ b/docs/reference/plotPlp.html @@ -1,67 +1,12 @@ - - - - - - - -Plot all the PatientLevelPrediction plots — plotPlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Plot all the PatientLevelPrediction plots — plotPlp • PatientLevelPrediction + + - - - - -
-
- -
- -
+
@@ -183,58 +131,57 @@

Plot all the PatientLevelPrediction plots

Plot all the PatientLevelPrediction plots

-
plotPlp(plpResult, saveLocation = NULL, typeColumn = "evaluation")
- -

Arguments

- - - - - - - - - - - - - - -
plpResult

Object returned by the runPlp() function

saveLocation

Name of the directory where the plots should be saved (NULL means no saving)

typeColumn

The name of the column specifying the evaluation type -(to stratify the plots)

- -

Value

- -

TRUE if it ran

-

Details

+
+
plotPlp(plpResult, saveLocation = NULL, typeColumn = "evaluation")
+
+ +
+

Arguments

+
plpResult
+

Object returned by the runPlp() function

+ + +
saveLocation
+

Name of the directory where the plots should be saved (NULL means no saving)

+ +
typeColumn
+

The name of the column specifying the evaluation type +(to stratify the plots)

+ +
+
+

Value

+ + +

TRUE if it ran

+
+
+

Details

Create a directory with all the plots

+
+
-
- +
- - + + diff --git a/docs/reference/plotPrecisionRecall.html b/docs/reference/plotPrecisionRecall.html index fba15177f..9296f567e 100644 --- a/docs/reference/plotPrecisionRecall.html +++ b/docs/reference/plotPrecisionRecall.html @@ -1,67 +1,12 @@ - - - - - - - -Plot the precision-recall curve using the sparse thresholdSummary data frame — plotPrecisionRecall • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Plot the precision-recall curve using the sparse thresholdSummary data frame — plotPrecisionRecall • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,69 +131,68 @@

Plot the precision-recall curve using the sparse thresholdSummary data frame

Plot the precision-recall curve using the sparse thresholdSummary data frame

-
plotPrecisionRecall(
-  plpResult,
-  typeColumn = "evaluation",
-  saveLocation = NULL,
-  fileName = "roc.png"
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
plpResult

A plp result object as generated using the runPlp function.

typeColumn

The name of the column specifying the evaluation type

saveLocation

Directory to save plot (if NULL plot is not saved)

fileName

Name of the file to save to plot, for example +

+
plotPrecisionRecall(
+  plpResult,
+  typeColumn = "evaluation",
+  saveLocation = NULL,
+  fileName = "roc.png"
+)
+
+ +
+

Arguments

+
plpResult
+

A plp result object as generated using the runPlp function.

+ + +
typeColumn
+

The name of the column specifying the evaluation type

+ + +
saveLocation
+

Directory to save plot (if NULL plot is not saved)

+ + +
fileName
+

Name of the file to save to plot, for example 'plot.png'. See the function ggsave in the ggplot2 package for -supported file formats.

+supported file formats.

-

Value

+
+
+

Value

+ -

A ggplot object. Use the ggsave function to save to file in a different +

A ggplot object. Use the ggsave function to save to file in a different format.

-

Details

- +
+
+

Details

Create a plot showing the precision-recall curve using the sparse thresholdSummary data frame

+
+
-
- +
- - + + diff --git a/docs/reference/plotPredictedPDF.html b/docs/reference/plotPredictedPDF.html index 9e51a26f6..f95ad0bc3 100644 --- a/docs/reference/plotPredictedPDF.html +++ b/docs/reference/plotPredictedPDF.html @@ -1,67 +1,12 @@ - - - - - - - -Plot the Predicted probability density function, showing prediction overlap between true and false cases — plotPredictedPDF • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Plot the Predicted probability density function, showing prediction overlap between true and false cases — plotPredictedPDF • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,69 +131,68 @@

Plot the Predicted probability density function, showing prediction overlap

Plot the Predicted probability density function, showing prediction overlap between true and false cases

-
plotPredictedPDF(
-  plpResult,
-  typeColumn = "evaluation",
-  saveLocation = NULL,
-  fileName = "PredictedPDF.png"
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
plpResult

A plp result object as generated using the runPlp function.

typeColumn

The name of the column specifying the evaluation type

saveLocation

Directory to save plot (if NULL plot is not saved)

fileName

Name of the file to save to plot, for example +

+
plotPredictedPDF(
+  plpResult,
+  typeColumn = "evaluation",
+  saveLocation = NULL,
+  fileName = "PredictedPDF.png"
+)
+
+ +
+

Arguments

+
plpResult
+

A plp result object as generated using the runPlp function.

+ + +
typeColumn
+

The name of the column specifying the evaluation type

+ + +
saveLocation
+

Directory to save plot (if NULL plot is not saved)

+ + +
fileName
+

Name of the file to save to plot, for example 'plot.png'. See the function ggsave in the ggplot2 package for -supported file formats.

+supported file formats.

-

Value

+
+
+

Value

+ -

A ggplot object. Use the ggsave function to save to file in a different +

A ggplot object. Use the ggsave function to save to file in a different format.

-

Details

- +
+
+

Details

Create a plot showing the predicted probability density function, showing prediction overlap between true and false cases

+
+
-
- +
- - + + diff --git a/docs/reference/plotPredictionDistribution.html b/docs/reference/plotPredictionDistribution.html index a2e751e3e..f39445b5c 100644 --- a/docs/reference/plotPredictionDistribution.html +++ b/docs/reference/plotPredictionDistribution.html @@ -1,67 +1,12 @@ - - - - - - - -Plot the side-by-side boxplots of prediction distribution, by class#' — plotPredictionDistribution • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Plot the side-by-side boxplots of prediction distribution, by class#' — plotPredictionDistribution • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,70 +131,69 @@

Plot the side-by-side boxplots of prediction distribution, by class#'

Plot the side-by-side boxplots of prediction distribution, by class#'

-
plotPredictionDistribution(
-  plpResult,
-  typeColumn = "evaluation",
-  saveLocation = NULL,
-  fileName = "PredictionDistribution.png"
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
plpResult

A plp result object as generated using the runPlp function.

typeColumn

The name of the column specifying the evaluation type

saveLocation

Directory to save plot (if NULL plot is not saved)

fileName

Name of the file to save to plot, for example +

+
plotPredictionDistribution(
+  plpResult,
+  typeColumn = "evaluation",
+  saveLocation = NULL,
+  fileName = "PredictionDistribution.png"
+)
+
+ +
+

Arguments

+
plpResult
+

A plp result object as generated using the runPlp function.

+ + +
typeColumn
+

The name of the column specifying the evaluation type

+ + +
saveLocation
+

Directory to save plot (if NULL plot is not saved)

+ + +
fileName
+

Name of the file to save to plot, for example 'plot.png'. See the function ggsave in the ggplot2 package for -supported file formats.

+supported file formats.

-

Value

+
+
+

Value

+ -

A ggplot object. Use the ggsave function to save to file in a different +

A ggplot object. Use the ggsave function to save to file in a different format.

-

Details

- +
+
+

Details

Create a plot showing the side-by-side boxplots of prediction distribution, by class #'

+
+
-
- +
- - + + diff --git a/docs/reference/plotPreferencePDF.html b/docs/reference/plotPreferencePDF.html index 2b7105906..8b26d68a7 100644 --- a/docs/reference/plotPreferencePDF.html +++ b/docs/reference/plotPreferencePDF.html @@ -1,70 +1,15 @@ - - - - - - - -Plot the preference score probability density function, showing prediction overlap between true and false cases -#' — plotPreferencePDF • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Plot the preference score probability density function, showing prediction overlap between true and false cases +#' — plotPreferencePDF • PatientLevelPrediction + + - - - - -
-
- -
- -
+
@@ -188,70 +136,69 @@

Plot the preference score probability density function, showing prediction o #'

-
plotPreferencePDF(
-  plpResult,
-  typeColumn = "evaluation",
-  saveLocation = NULL,
-  fileName = "plotPreferencePDF.png"
-)
+
+
plotPreferencePDF(
+  plpResult,
+  typeColumn = "evaluation",
+  saveLocation = NULL,
+  fileName = "plotPreferencePDF.png"
+)
+
+ +
+

Arguments

+
plpResult
+

A plp result object as generated using the runPlp function.

+ + +
typeColumn
+

The name of the column specifying the evaluation type

+ + +
saveLocation
+

Directory to save plot (if NULL plot is not saved)

-

Arguments

- - - - - - - - - - - - - - - - - - -
plpResult

A plp result object as generated using the runPlp function.

typeColumn

The name of the column specifying the evaluation type

saveLocation

Directory to save plot (if NULL plot is not saved)

fileName

Name of the file to save to plot, for example + +

fileName
+

Name of the file to save to plot, for example 'plot.png'. See the function ggsave in the ggplot2 package for -supported file formats.

+supported file formats.

-

Value

+
+
+

Value

+ -

A ggplot object. Use the ggsave function to save to file in a different +

A ggplot object. Use the ggsave function to save to file in a different format.

-

Details

- +
+
+

Details

Create a plot showing the preference score probability density function, showing prediction overlap between true and false cases #'

+
+
-
- +
- - + + diff --git a/docs/reference/plotRoc.html b/docs/reference/plotRoc.html deleted file mode 100644 index 247e2f260..000000000 --- a/docs/reference/plotRoc.html +++ /dev/null @@ -1,233 +0,0 @@ - - - - - - - - -Plot the ROC curve — plotRoc • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Plot the ROC curve

-
- -
plotRoc(prediction, fileName = NULL)
- -

Arguments

- - - - - - - - - - -
prediction

A prediction object as generated using the -predictProbabilities function.

fileName

Name of the file where the plot should be saved, for example -'plot.png'. See the function ggsave in the ggplot2 package for -supported file formats.

- -

Value

- -

A ggplot object. Use the ggsave function to save to file in a different -format.

-

Details

- -

Create a plot showing the Receiver Operator Characteristics (ROC) curve.

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/plotSmoothCalibration.html b/docs/reference/plotSmoothCalibration.html index 05215025e..bb6bd7cd4 100644 --- a/docs/reference/plotSmoothCalibration.html +++ b/docs/reference/plotSmoothCalibration.html @@ -1,70 +1,14 @@ - - - - - - - -Plot the smooth calibration as detailed in Calster et al. "A calibration heirarchy for risk models -was defined: from utopia to empirical data" (2016) — plotSmoothCalibration • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Plot the smooth calibration as detailed in Calster et al. "A calibration heirarchy for risk models +was defined: from utopia to empirical data" (2016) — plotSmoothCalibration • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -188,102 +135,101 @@

Plot the smooth calibration as detailed in Calster et al. "A calibration hei was defined: from utopia to empirical data" (2016)

-
plotSmoothCalibration(
-  plpResult,
-  smooth = "loess",
-  span = 0.75,
-  nKnots = 5,
-  scatter = FALSE,
-  bins = 20,
-  sample = TRUE,
-  typeColumn = "evaluation",
-  saveLocation = NULL,
-  fileName = "smoothCalibration.pdf"
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
plpResult

The result of running runPlp function. An object containing the +

+
plotSmoothCalibration(
+  plpResult,
+  smooth = "loess",
+  span = 0.75,
+  nKnots = 5,
+  scatter = FALSE,
+  bins = 20,
+  sample = TRUE,
+  typeColumn = "evaluation",
+  saveLocation = NULL,
+  fileName = "smoothCalibration.pdf"
+)
+
+ +
+

Arguments

+
plpResult
+

The result of running runPlp function. An object containing the model or location where the model is save, the data selection settings, the preprocessing and training settings as well as various performance measures -obtained by the model.

smooth

options: 'loess' or 'rcs'

span

This specifies the width of span used for loess. This will allow for faster -computing and lower memory usage.

nKnots

The number of knots to be used by the rcs evaluation. Default is 5

scatter

plot the decile calibrations as points on the graph. Default is False

bins

The number of bins for the histogram. Default is 20.

sample

If using loess then by default 20,000 patients will be sampled to save time

typeColumn

The name of the column specifying the evaluation type

saveLocation

Directory to save plot (if NULL plot is not saved)

fileName

Name of the file to save to plot, for example -'plot.png'. See the function ggsave in the ggplot2 package for -supported file formats.

+obtained by the model.

+ + +
smooth
+

options: 'loess' or 'rcs'

+ + +
span
+

This specifies the width of span used for loess. This will allow for faster +computing and lower memory usage.

+ + +
nKnots
+

The number of knots to be used by the rcs evaluation. Default is 5

+ -

Value

+
scatter
+

plot the decile calibrations as points on the graph. Default is False

-

A ggplot object.

-

Details

+
bins
+

The number of bins for the histogram. Default is 20.

+ + +
sample
+

If using loess then by default 20,000 patients will be sampled to save time

+ + +
typeColumn
+

The name of the column specifying the evaluation type

+ + +
saveLocation
+

Directory to save plot (if NULL plot is not saved)

+ + +
fileName
+

Name of the file to save to plot, for example +'plot.png'. See the function ggsave in the ggplot2 package for +supported file formats.

+ +
+
+

Value

+ + +

A ggplot object.

+
+
+

Details

Create a plot showing the smoothed calibration #'

+
+
-
- +
- - + + diff --git a/docs/reference/plotSparseCalibration.html b/docs/reference/plotSparseCalibration.html index fbe0f0372..cb1bc73d3 100644 --- a/docs/reference/plotSparseCalibration.html +++ b/docs/reference/plotSparseCalibration.html @@ -1,67 +1,12 @@ - - - - - - - -Plot the calibration — plotSparseCalibration • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Plot the calibration — plotSparseCalibration • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,70 +131,69 @@

Plot the calibration

Plot the calibration

-
plotSparseCalibration(
-  plpResult,
-  typeColumn = "evaluation",
-  saveLocation = NULL,
-  fileName = "roc.png"
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
plpResult

A plp result object as generated using the runPlp function.

typeColumn

The name of the column specifying the evaluation type

saveLocation

Directory to save plot (if NULL plot is not saved)

fileName

Name of the file to save to plot, for example +

+
plotSparseCalibration(
+  plpResult,
+  typeColumn = "evaluation",
+  saveLocation = NULL,
+  fileName = "roc.png"
+)
+
+ +
+

Arguments

+
plpResult
+

A plp result object as generated using the runPlp function.

+ + +
typeColumn
+

The name of the column specifying the evaluation type

+ + +
saveLocation
+

Directory to save plot (if NULL plot is not saved)

+ + +
fileName
+

Name of the file to save to plot, for example 'plot.png'. See the function ggsave in the ggplot2 package for -supported file formats.

+supported file formats.

-

Value

+
+
+

Value

+ -

A ggplot object. Use the ggsave function to save to file in a different +

A ggplot object. Use the ggsave function to save to file in a different format.

-

Details

- +
+
+

Details

Create a plot showing the calibration #'

+
+
-
- +
- - + + diff --git a/docs/reference/plotSparseCalibration2.html b/docs/reference/plotSparseCalibration2.html index 128e137a7..34929c48a 100644 --- a/docs/reference/plotSparseCalibration2.html +++ b/docs/reference/plotSparseCalibration2.html @@ -1,67 +1,12 @@ - - - - - - - -Plot the conventional calibration — plotSparseCalibration2 • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Plot the conventional calibration — plotSparseCalibration2 • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,70 +131,69 @@

Plot the conventional calibration

Plot the conventional calibration

-
plotSparseCalibration2(
-  plpResult,
-  typeColumn = "evaluation",
-  saveLocation = NULL,
-  fileName = "roc.png"
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
plpResult

A plp result object as generated using the runPlp function.

typeColumn

The name of the column specifying the evaluation type

saveLocation

Directory to save plot (if NULL plot is not saved)

fileName

Name of the file to save to plot, for example +

+
plotSparseCalibration2(
+  plpResult,
+  typeColumn = "evaluation",
+  saveLocation = NULL,
+  fileName = "roc.png"
+)
+
+ +
+

Arguments

+
plpResult
+

A plp result object as generated using the runPlp function.

+ + +
typeColumn
+

The name of the column specifying the evaluation type

+ + +
saveLocation
+

Directory to save plot (if NULL plot is not saved)

+ + +
fileName
+

Name of the file to save to plot, for example 'plot.png'. See the function ggsave in the ggplot2 package for -supported file formats.

+supported file formats.

-

Value

+
+
+

Value

+ -

A ggplot object. Use the ggsave function to save to file in a different +

A ggplot object. Use the ggsave function to save to file in a different format.

-

Details

- +
+
+

Details

Create a plot showing the calibration #'

+
+
-
- +
- - + + diff --git a/docs/reference/plotSparseRoc.html b/docs/reference/plotSparseRoc.html index 1f1f410e8..37ef3c135 100644 --- a/docs/reference/plotSparseRoc.html +++ b/docs/reference/plotSparseRoc.html @@ -1,67 +1,12 @@ - - - - - - - -Plot the ROC curve using the sparse thresholdSummary data frame — plotSparseRoc • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Plot the ROC curve using the sparse thresholdSummary data frame — plotSparseRoc • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,69 +131,68 @@

Plot the ROC curve using the sparse thresholdSummary data frame

Plot the ROC curve using the sparse thresholdSummary data frame

-
plotSparseRoc(
-  plpResult,
-  typeColumn = "evaluation",
-  saveLocation = NULL,
-  fileName = "roc.png"
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
plpResult

A plp result object as generated using the runPlp function.

typeColumn

The name of the column specifying the evaluation type

saveLocation

Directory to save plot (if NULL plot is not saved)

fileName

Name of the file to save to plot, for example +

+
plotSparseRoc(
+  plpResult,
+  typeColumn = "evaluation",
+  saveLocation = NULL,
+  fileName = "roc.png"
+)
+
+ +
+

Arguments

+
plpResult
+

A plp result object as generated using the runPlp function.

+ + +
typeColumn
+

The name of the column specifying the evaluation type

+ + +
saveLocation
+

Directory to save plot (if NULL plot is not saved)

+ + +
fileName
+

Name of the file to save to plot, for example 'plot.png'. See the function ggsave in the ggplot2 package for -supported file formats.

+supported file formats.

-

Value

+
+
+

Value

+ -

A ggplot object. Use the ggsave function to save to file in a different +

A ggplot object. Use the ggsave function to save to file in a different format.

-

Details

- +
+
+

Details

Create a plot showing the Receiver Operator Characteristics (ROC) curve.

+
+
-
- +
- - + + diff --git a/docs/reference/plotVariableScatterplot.html b/docs/reference/plotVariableScatterplot.html index a2934945a..752c4e840 100644 --- a/docs/reference/plotVariableScatterplot.html +++ b/docs/reference/plotVariableScatterplot.html @@ -1,67 +1,12 @@ - - - - - - - -Plot the variable importance scatterplot — plotVariableScatterplot • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Plot the variable importance scatterplot — plotVariableScatterplot • PatientLevelPrediction + + - - - - -
-
- -
- -
+
@@ -183,66 +131,65 @@

Plot the variable importance scatterplot

Plot the variable importance scatterplot

-
plotVariableScatterplot(
-  covariateSummary,
-  saveLocation = NULL,
-  fileName = "VariableScatterplot.png"
-)
- -

Arguments

- - - - - - - - - - - - - - -
covariateSummary

A prediction object as generated using the -runPlp function.

saveLocation

Directory to save plot (if NULL plot is not saved)

fileName

Name of the file to save to plot, for example +

+
plotVariableScatterplot(
+  covariateSummary,
+  saveLocation = NULL,
+  fileName = "VariableScatterplot.png"
+)
+
+ +
+

Arguments

+
covariateSummary
+

A prediction object as generated using the +runPlp function.

+ + +
saveLocation
+

Directory to save plot (if NULL plot is not saved)

+ + +
fileName
+

Name of the file to save to plot, for example 'plot.png'. See the function ggsave in the ggplot2 package for -supported file formats.

+supported file formats.

-

Value

+
+
+

Value

+ -

A ggplot object. Use the ggsave function to save to file in a different +

A ggplot object. Use the ggsave function to save to file in a different format.

-

Details

- +
+
+

Details

Create a plot showing the variable importance scatterplot #'

+
+
-
- +
- - + + diff --git a/docs/reference/plpDataSimulationProfile.html b/docs/reference/plpDataSimulationProfile.html index fe6c92245..cdf677259 100644 --- a/docs/reference/plpDataSimulationProfile.html +++ b/docs/reference/plpDataSimulationProfile.html @@ -1,67 +1,12 @@ - - - - - - - -A simulation profile — plpDataSimulationProfile • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -A simulation profile — plpDataSimulationProfile • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,47 +131,54 @@

A simulation profile

A simulation profile

-
data(plpDataSimulationProfile)
+
+
data(plpDataSimulationProfile)
+
+
+

Format

+

A data frame containing the following elements:

covariatePrevalence
+

prevalence of all covariates

-

Format

+
outcomeModels
+

regression model parameters to simulate outcomes

-

A data frame containing the following elements:

-
covariatePrevalence

prevalence of all covariates

-
outcomeModels

regression model parameters to simulate outcomes

-
metaData

settings used to simulate the profile

-
covariateRef

covariateIds and covariateNames

-
timePrevalence

time window

-
exclusionPrevalence

prevalence of exclusion of covariates

+
metaData
+

settings used to simulate the profile

-
+
covariateRef
+

covariateIds and covariateNames

+
timePrevalence
+

time window

+ +
exclusionPrevalence
+

prevalence of exclusion of covariates

+ + +
+
-
- +
- - + + diff --git a/docs/reference/populatePlpResultTables.html b/docs/reference/populatePlpResultTables.html deleted file mode 100644 index 09b0689e3..000000000 --- a/docs/reference/populatePlpResultTables.html +++ /dev/null @@ -1,356 +0,0 @@ - - - - - - - - -Populate the PatientLevelPrediction results tables — populatePlpResultTables • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

This function formats and uploads results that have been generated via an ATLAS prediction package into a database

-
- -
populatePlpResultTables(
-  conn,
-  resultSchema,
-  stringAppendToTables = "",
-  targetDialect = "postgresql",
-  tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
-  packageName,
-  studyJsonList,
-  studyName = "",
-  studyDescription = "",
-  researcherName = "",
-  researcherEmail = "",
-  researcherOrg = "",
-  databaseName = NULL,
-  databaseAcronym = NULL,
-  databaseVersion = 1,
-  databaseDescription = NULL,
-  databaseType = NULL,
-  valDatabases = list(ccae = list(name = "CCAE", description = "", version = 1, type =
-    "US Claims")),
-  resultLocation = NULL,
-  resultPattern = "",
-  validationLocation = file.path(resultLocation, "Validation"),
-  addInternalValidation = T,
-  addExternalValidation = T,
-  gsubVal = NULL,
-  removePattern = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
conn

A connection to a database created by using the -function connect in the -DatabaseConnector package.

resultSchema

(string) The name of the database schema that the result tables will be created.

stringAppendToTables

(string) A string that appends to the PatientLevelPrediction result tables

targetDialect

(string) The database management system being used

tempEmulationSchema

(string) The temp schema used when the database management system is oracle

packageName

(string) The name of the ATLAS R package used to generate the results (this is used to extract cohort jsons)

studyJsonList

(list) A list of lists per cohort with the cohort_name, cohort_id and cohort_json

studyName

(string) A reference study name

studyDescription

(string) A description of the study

researcherName

(string) Name of the researcher who developed the study

researcherEmail

(string) Email of the researcher who developed the study

researcherOrg

(string) Organisation of the researcher who developed the study

databaseName

(string) name of the database used to develop the model/s

databaseAcronym

(string) acronym of the database used to develop the model/s

databaseVersion

(int) Version of the database used to develop the model/s

databaseDescription

(string) Description of the database used to develop the model/s

databaseType

(string) Type of the database used to develop the model/s (e.g., claims)

valDatabases

(list) A named list with details of the external validation databases. Needs to contain: name, description, version, type.

resultLocation

(string) location of directory where the main package results were saved

resultPattern

(string) A string to match to select models of interest

validationLocation

(string) location of directory where the validation package results were saved

addInternalValidation

(boolean) Whether the internval validation results should be uploaded

addExternalValidation

(boolean) Whether the externval validation results should be uploaded

gsubVal

(string) Remove patterns from the result name

removePattern

(string) Restrict to result names with this pattern

- -

Value

- -

Returns NULL but uploads all the results in resultLocation to the PatientLevelPrediction result tables in resultSchema

-

Details

- -

This function can be used upload PatientLevelPrediction results into a database

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/positiveLikelihoodRatio.html b/docs/reference/positiveLikelihoodRatio.html index 0c19aa720..15d6fb14f 100644 --- a/docs/reference/positiveLikelihoodRatio.html +++ b/docs/reference/positiveLikelihoodRatio.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate the positiveLikelihoodRatio — positiveLikelihoodRatio • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the positiveLikelihoodRatio — positiveLikelihoodRatio • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,61 +131,60 @@

Calculate the positiveLikelihoodRatio

Calculate the positiveLikelihoodRatio

-
positiveLikelihoodRatio(TP, TN, FN, FP)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
TP

Number of true positives

TN

Number of true negatives

FN

Number of false negatives

FP

Number of false positives

- -

Value

- -

positiveLikelihoodRatio value

-

Details

+
+
positiveLikelihoodRatio(TP, TN, FN, FP)
+
+
+

Arguments

+
TP
+

Number of true positives

+ + +
TN
+

Number of true negatives

+ + +
FN
+

Number of false negatives

+ + +
FP
+

Number of false positives

+ +
+
+

Value

+ + +

positiveLikelihoodRatio value

+
+
+

Details

Calculate the positiveLikelihoodRatio

+
+
-
- +
- - + + diff --git a/docs/reference/positivePredictiveValue.html b/docs/reference/positivePredictiveValue.html index 940084bea..8e32622a2 100644 --- a/docs/reference/positivePredictiveValue.html +++ b/docs/reference/positivePredictiveValue.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate the positivePredictiveValue — positivePredictiveValue • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the positivePredictiveValue — positivePredictiveValue • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,61 +131,60 @@

Calculate the positivePredictiveValue

Calculate the positivePredictiveValue

-
positivePredictiveValue(TP, TN, FN, FP)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
TP

Number of true positives

TN

Number of true negatives

FN

Number of false negatives

FP

Number of false positives

- -

Value

- -

positivePredictiveValue value

-

Details

+
+
positivePredictiveValue(TP, TN, FN, FP)
+
+
+

Arguments

+
TP
+

Number of true positives

+ + +
TN
+

Number of true negatives

+ + +
FN
+

Number of false negatives

+ + +
FP
+

Number of false positives

+ +
+
+

Value

+ + +

positivePredictiveValue value

+
+
+

Details

Calculate the positivePredictiveValue

+
+
-
- +
- - + + diff --git a/docs/reference/predictAndromeda.html b/docs/reference/predictAndromeda.html deleted file mode 100644 index 96470b65b..000000000 --- a/docs/reference/predictAndromeda.html +++ /dev/null @@ -1,253 +0,0 @@ - - - - - - - - -Generated predictions from a regression model — predictAndromeda • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Generated predictions from a regression model

-
- -
predictAndromeda(
-  coefficients,
-  population,
-  covariateData,
-  modelType = "logistic"
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
coefficients

A names numeric vector where the names are the covariateIds, except for the -first value which is expected to be the intercept.

population

A data frame containing the population to do the prediction for

covariateData

An andromeda object containing the covariateData with predefined columns -(see below).

modelType

Current supported types are "logistic", "poisson", "cox" or "survival".

- -

Details

- -

These columns are expected in the outcome object:

- - - -
rowId(integer)Row ID is used to link multiple covariates (x) to a single outcome (y)
time(real)For models that use time (e.g. Poisson or Cox regression) this contains time
(e.g. number of days)
-

These columns are expected in the covariates object:

- - - -
rowId(integer)Row ID is used to link multiple covariates (x) to a single outcome -(y)
covariateId(integer)A numeric identifier of a covariate
covariateValue(real)The value of the specified covariate
- - - -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/predictCyclops.html b/docs/reference/predictCyclops.html index 3a21d5630..8482299c3 100644 --- a/docs/reference/predictCyclops.html +++ b/docs/reference/predictCyclops.html @@ -1,67 +1,12 @@ - - - - - - - -Create predictive probabilities — predictCyclops • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create predictive probabilities — predictCyclops • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,59 +131,58 @@

Create predictive probabilities

Create predictive probabilities

-
predictCyclops(plpModel, data, cohort)
- -

Arguments

- - - - - - - - - - - - - - -
plpModel

An object of type predictiveModel as generated using -fitPlp.

data

The new plpData containing the covariateData for the new population

cohort

The cohort to calculate the prediction for

- -

Value

- -

The value column in the result data.frame is: logistic: probabilities of the outcome, poisson: -Poisson rate (per day) of the outome, survival: hazard rate (per day) of the outcome.

-

Details

+
+
predictCyclops(plpModel, data, cohort)
+
+ +
+

Arguments

+
plpModel
+

An object of type predictiveModel as generated using +fitPlp.

+ + +
data
+

The new plpData containing the covariateData for the new population

+ + +
cohort
+

The cohort to calculate the prediction for

+
+
+

Value

+ + +

The value column in the result data.frame is: logistic: probabilities of the outcome, poisson: +Poisson rate (per day) of the outome, survival: hazard rate (per day) of the outcome.

+
+
+

Details

Generates predictions for the population specified in plpData given the model.

+
+
-
- +
- - + + diff --git a/docs/reference/predictFfdf.html b/docs/reference/predictFfdf.html deleted file mode 100644 index 2affbc06c..000000000 --- a/docs/reference/predictFfdf.html +++ /dev/null @@ -1,235 +0,0 @@ - - - - - - - - -Generated predictions from a regression model — predictFfdf • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

Generated predictions from a regression model

- -
- -
predictFfdf(coefficients, population, covariates, modelType = "logistic")
- -

Arguments

- - - - - - - - - - - - - - - - - - -
coefficients

A names numeric vector where the names are the covariateIds, except for the -first value which is expected to be the intercept.

population

A data frame containing the population to do the prediction for

covariates

A data frame or ffdf object containing the covariates with predefined columns -(see below).

modelType

Current supported types are "logistic", "poisson", "cox" or "survival".

- -

Details

- -

These columns are expected in the outcome object:

- - - - - -
rowId
(integer)Row ID is used to link multiple covariates (x) to a single outcome (y)
time
(real)For models that use time (e.g. Poisson or Cox regression) this contains time
(e.g. number of days)
-

These columns are expected in the covariates object:

- - - - - - -
rowId
(integer)Row ID is used to link multiple covariates (x) to a single outcome(y)
covariateId
(integer)A numeric identifier of a covariate
covariateValue(real)
The value of the specified covariate
- - -
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/predictPlp.html b/docs/reference/predictPlp.html index dc81d5d0a..1765cf0ec 100644 --- a/docs/reference/predictPlp.html +++ b/docs/reference/predictPlp.html @@ -1,67 +1,12 @@ - - - - - - - -predictPlp — predictPlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -predictPlp — predictPlp • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,62 +131,61 @@

predictPlp

Predict the risk of the outcome using the input plpModel for the input plpData

-
predictPlp(plpModel, plpData, population, timepoint)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
plpModel

An object of type plpModel - a patient level prediction model

plpData

An object of type plpData - the patient level prediction -data extracted from the CDM.

population

The population created using createStudyPopulation() who will have their risks predicted or a cohort without the outcome known

timepoint

The timepoint to predict risk (survival models only)

- -

Value

- -

A dataframe containing the prediction for each person in the population with an attribute metaData containing prediction details.

-

Details

+
+
predictPlp(plpModel, plpData, population, timepoint)
+
+
+

Arguments

+
plpModel
+

An object of type plpModel - a patient level prediction model

+ + +
plpData
+

An object of type plpData - the patient level prediction +data extracted from the CDM.

+ + +
population
+

The population created using createStudyPopulation() who will have their risks predicted or a cohort without the outcome known

+ + +
timepoint
+

The timepoint to predict risk (survival models only)

+ +
+
+

Value

+ + +

A dataframe containing the prediction for each person in the population with an attribute metaData containing prediction details.

+
+
+

Details

The function applied the trained model on the plpData to make predictions

+
+
-
- +
- - + + diff --git a/docs/reference/predictProbabilities.html b/docs/reference/predictProbabilities.html deleted file mode 100644 index 6fbd0a8c7..000000000 --- a/docs/reference/predictProbabilities.html +++ /dev/null @@ -1,235 +0,0 @@ - - - - - - - - -Create predictive probabilities — predictProbabilities • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Create predictive probabilities

-
- -
predictProbabilities(predictiveModel, population, covariateData)
- -

Arguments

- - - - - - - - - - - - - - -
predictiveModel

An object of type predictiveModel as generated using -fitPlp.

population

The population to calculate the prediction for

covariateData

The covariateData containing the covariates for the population

- -

Value

- -

The value column in the result data.frame is: logistic: probabilities of the outcome, poisson: -Poisson rate (per day) of the outome, survival: hazard rate (per day) of the outcome.

-

Details

- -

Generates predictions for the population specified in plpData given the model.

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/preprocessData.html b/docs/reference/preprocessData.html index c6e79957d..c8c042505 100644 --- a/docs/reference/preprocessData.html +++ b/docs/reference/preprocessData.html @@ -1,70 +1,15 @@ - - - - - - - -A function that wraps around FeatureExtraction::tidyCovariateData to normalise the data -and remove rare or redundant features — preprocessData • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -A function that wraps around FeatureExtraction::tidyCovariateData to normalise the data +and remove rare or redundant features — preprocessData • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -188,54 +136,53 @@

A function that wraps around FeatureExtraction::tidyCovariateData to normali and remove rare or redundant features

-
preprocessData(covariateData, preprocessSettings)
+
+
preprocessData(covariateData, preprocessSettings)
+
-

Arguments

- - - - - - - - - - -
covariateData

The covariate part of the training data created by splitData after being sampled and having -any required feature engineering

preprocessSettings

The settings for the preprocessing created by createPreprocessSettings

+
+

Arguments

+
covariateData
+

The covariate part of the training data created by splitData after being sampled and having +any required feature engineering

-

Value

-

The data processed

-

Details

+
preprocessSettings
+

The settings for the preprocessing created by createPreprocessSettings

+
+
+

Value

+ + +

The data processed

+
+
+

Details

Returns an object of class covariateData that has been processed

+
+
-
- +
- - + + diff --git a/docs/reference/randomSplitter.html b/docs/reference/randomSplitter.html deleted file mode 100644 index 3f91292a3..000000000 --- a/docs/reference/randomSplitter.html +++ /dev/null @@ -1,244 +0,0 @@ - - - - - - - - -Split data into random subsets stratified by class — randomSplitter • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Split data into random subsets stratified by class

-
- -
randomSplitter(population, test = 0.3, train = NULL, nfold = 3, seed = NULL)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
population

An object created using createStudyPopulation().

test

A real number between 0 and 1 indicating the test set fraction of the data

train

A real number between 0 and 1 indicating the train set fraction of the data. -If not set train is equal to 1 - test

nfold

An integer >= 1 specifying the number of folds used in cross validation

seed

If set a fixed seed is used, otherwise a random split is performed

- -

Value

- -

A dataframe containing the columns: rowId and index

-

Details

- -

Returns a dataframe of rowIds and indexes with a -1 index indicating the rowId belongs to the test -set and a positive integer index value indicating the rowId's cross valiation fold within the train -set.

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/recalibratePlp.html b/docs/reference/recalibratePlp.html index d789c3085..d9fb6af46 100644 --- a/docs/reference/recalibratePlp.html +++ b/docs/reference/recalibratePlp.html @@ -1,67 +1,12 @@ - - - - - - - -recalibratePlp — recalibratePlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -recalibratePlp — recalibratePlp • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,67 +131,66 @@

recalibratePlp

Train various models using a default parameter gird search or user specified parameters

-
recalibratePlp(
-  prediction,
-  analysisId,
-  typeColumn = "evaluationType",
-  method = c("recalibrationInTheLarge", "weakRecalibration")
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
prediction

A prediction dataframe

analysisId

The model analysisId

typeColumn

The column name where the strata types are specified

method

Method used to recalibrate ('recalibrationInTheLarge' or 'weakRecalibration' )

- -

Value

- -

An object of class runPlp that is recalibrated on the new data

-

Details

+
+
recalibratePlp(
+  prediction,
+  analysisId,
+  typeColumn = "evaluationType",
+  method = c("recalibrationInTheLarge", "weakRecalibration")
+)
+
+
+

Arguments

+
prediction
+

A prediction dataframe

+ + +
analysisId
+

The model analysisId

+ + +
typeColumn
+

The column name where the strata types are specified

+ + +
method
+

Method used to recalibrate ('recalibrationInTheLarge' or 'weakRecalibration' )

+ +
+
+

Value

+ + +

An object of class runPlp that is recalibrated on the new data

+
+
+

Details

The user can define the machine learning model to train (regularised logistic regression, random forest, gradient boosting machine, neural network and )

+
+
-
- +
- - + + diff --git a/docs/reference/recalibratePlpRefit.html b/docs/reference/recalibratePlpRefit.html index 2af353ae8..f9338fe9b 100644 --- a/docs/reference/recalibratePlpRefit.html +++ b/docs/reference/recalibratePlpRefit.html @@ -1,67 +1,12 @@ - - - - - - - -recalibratePlpRefit — recalibratePlpRefit • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -recalibratePlpRefit — recalibratePlpRefit • PatientLevelPrediction + + - - - - -
-
- -
- -
+
@@ -183,59 +131,58 @@

recalibratePlpRefit

Train various models using a default parameter gird search or user specified parameters

-
recalibratePlpRefit(plpModel, newPopulation, newData)
- -

Arguments

- - - - - - - - - - - - - - -
plpModel

The trained plpModel (runPlp$model)

newPopulation

The population created using createStudyPopulation() who will have their risks predicted

newData

An object of type plpData - the patient level prediction -data extracted from the CDM.

- -

Value

- -

An object of class runPlp that is recalibrated on the new data

-

Details

+
+
recalibratePlpRefit(plpModel, newPopulation, newData)
+
+ +
+

Arguments

+
plpModel
+

The trained plpModel (runPlp$model)

+ + +
newPopulation
+

The population created using createStudyPopulation() who will have their risks predicted

+ +
newData
+

An object of type plpData - the patient level prediction +data extracted from the CDM.

+ +
+
+

Value

+ + +

An object of class runPlp that is recalibrated on the new data

+
+
+

Details

The user can define the machine learning model to train (regularised logistic regression, random forest, gradient boosting machine, neural network and )

+
+
-
- +
- - + + diff --git a/docs/reference/registerParallelBackend.html b/docs/reference/registerParallelBackend.html deleted file mode 100644 index 4d84ea497..000000000 --- a/docs/reference/registerParallelBackend.html +++ /dev/null @@ -1,228 +0,0 @@ - - - - - - - - -registerParallelBackend — registerParallelBackend • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Registers a parallel backend for multi core processing. The -number of cores will be detected automatically, unless specified otherwise.

-
- -
registerParallelBackend(cores = NULL, logical = TRUE)
- -

Arguments

- - - - - - - - - - -
cores

the number of cores to use for multi core processing

logical

whether to consider logical or physical cores

- - -

Examples

-
if (FALSE) { -# detect logical cores automatically -registerParallelBackend() - -# use four physical cores -numCores <- 4 -registerParallelBackend(numCores, logical = FALSE) -}
-
- -
- - -
- - -
-

Site built with pkgdown 1.5.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/registerSequentialBackend.html b/docs/reference/registerSequentialBackend.html deleted file mode 100644 index 946d45c40..000000000 --- a/docs/reference/registerSequentialBackend.html +++ /dev/null @@ -1,212 +0,0 @@ - - - - - - - - -registerSequentialBackend — registerSequentialBackend • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

registerSequentialBackend registers a sequential backend for -single core processing.

-
- -
registerSequentialBackend()
- - - -

Examples

-
if (FALSE) { -# register a sequential backend -registerSequentialBackend() -}
-
- -
- - -
- - -
-

Site built with pkgdown 1.5.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/runEnsembleModel.html b/docs/reference/runEnsembleModel.html deleted file mode 100644 index 5085dccb2..000000000 --- a/docs/reference/runEnsembleModel.html +++ /dev/null @@ -1,333 +0,0 @@ - - - - - - - - -ensemble - Create an ensembling model using different models — runEnsembleModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

#'

-
- -
runEnsembleModel(
-  population,
-  dataList,
-  modelList,
-  testSplit = "time",
-  testFraction = 0.2,
-  stackerUseCV = TRUE,
-  splitSeed = NULL,
-  nfold = 3,
-  saveDirectory = NULL,
-  saveEnsemble = F,
-  savePlpData = F,
-  savePlpResult = F,
-  savePlpPlots = F,
-  saveEvaluation = F,
-  analysisId = NULL,
-  verbosity = "INFO",
-  ensembleStrategy = "mean",
-  cores = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
population

The population created using createStudyPopulation() who will be used to -develop the model

dataList

An list of object of type plpData - the patient level prediction -data extracted from the CDM.

modelList

An list of type of base model created using one of the function in final -ensembling model, the base model can be any model implemented in this -package.

testSplit

Either 'person' or 'time' specifying the type of evaluation used. 'time' -find the date where testFraction of patients had an index after the date -and assigns patients with an index prior to this date into the training -set and post the date into the test set 'person' splits the data into test -(1-testFraction of the data) and train (validationFraction of the data) -sets. The split is stratified by the class label.

testFraction

The fraction of the data to be used as the test set in the patient split -evaluation.

stackerUseCV

When doing stacking you can either use the train CV predictions to train the stacker (TRUE) or leave 20 percent of the data to train the stacker

splitSeed

The seed used to split the test/train set when using a person type -testSplit

nfold

The number of folds used in the cross validation (default 3)

saveDirectory

The path to the directory where the results will be saved (if NULL uses working directory)

saveEnsemble

Binary indicating whether to save the ensemble

savePlpData

Binary indicating whether to save the plpData object (default is F)

savePlpResult

Binary indicating whether to save the object returned by runPlp (default is F)

savePlpPlots

Binary indicating whether to save the performance plots as pdf files (default is F)

saveEvaluation

Binary indicating whether to save the oerformance as csv files (default is T)

analysisId

The analysis ID

verbosity

Sets the level of the verbosity. If the log level is at or higher in -priority than the logger threshold, a message will print. The levels are:

    -
  • DEBUGHighest verbosity showing all debug statements

  • -
  • TRACEShowing information about start and end of steps

  • -
  • INFOShow informative information (Default)

  • -
  • WARNShow warning messages

  • -
  • ERRORShow error messages

  • -
  • FATALBe silent except for fatal errors

  • -
ensembleStrategy

The strategy used for ensembling the outputs from different models, it can -be 'mean', 'product', 'weighted' and 'stacked' 'mean' the average -probability from differnt models 'product' the product rule 'weighted' the -weighted average probability from different models using train AUC as -weights. 'stacked' the stakced ensemble trains a logistics regression on -different models.

cores

The number of cores to use when training the ensemble

- -

Details

- -

This function applied a list of models and combines them into an ensemble model

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/runMultiplePlp.html b/docs/reference/runMultiplePlp.html index 2ed6463a2..5e36a1c9f 100644 --- a/docs/reference/runMultiplePlp.html +++ b/docs/reference/runMultiplePlp.html @@ -1,67 +1,12 @@ - - - - - - - -Run a list of predictions analyses — runMultiplePlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Run a list of predictions analyses — runMultiplePlp • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,95 +131,83 @@

Run a list of predictions analyses

Run a list of predictions analyses

-
runMultiplePlp(
-  databaseDetails = createDatabaseDetails(),
-  modelDesignList = list(createModelDesign(targetId = 1, outcomeId = 2, modelSettings =
-    setLassoLogisticRegression()), createModelDesign(targetId = 1, outcomeId = 3,
-    modelSettings = setLassoLogisticRegression())),
-  onlyFetchData = F,
-  splitSettings = createDefaultSplitSetting(type = "stratified", testFraction = 0.25,
-    trainFraction = 0.75, splitSeed = 123, nfold = 3),
-  cohortDefinitions = NULL,
-  logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName =
-    "runPlp Log"),
-  saveDirectory = getwd()
-)
+
+
runMultiplePlp(
+  databaseDetails = createDatabaseDetails(),
+  modelDesignList = list(createModelDesign(targetId = 1, outcomeId = 2, modelSettings =
+    setLassoLogisticRegression()), createModelDesign(targetId = 1, outcomeId = 3,
+    modelSettings = setLassoLogisticRegression())),
+  onlyFetchData = F,
+  cohortDefinitions = NULL,
+  logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName =
+    "runPlp Log"),
+  saveDirectory = getwd(),
+  sqliteLocation = file.path(saveDirectory, "sqlite")
+)
+
-

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
databaseDetails

The database settings created using createDatabaseDetails()

modelDesignList

A list of model designs created using createModelDesign()

onlyFetchData

Only fetches and saves the data object to the output folder without running the analysis.

splitSettings

The train/validation/test splitting used by all analyses created using createDefaultSplitSetting()

cohortDefinitions

A list of cohort definitions for the target and outcome cohorts

logSettings

The setting spexcifying the logging for the analyses created using createLogSettings()

saveDirectory

Name of the folder where all the outputs will written to.

+
+

Arguments

+
databaseDetails
+

The database settings created using createDatabaseDetails()

-

Value

-

A data frame with the following columns:

- - - - - - -
analysisIdThe unique identifier -for a set of analysis choices.
cohortIdThe ID of the target cohort populations.
outcomeIdThe ID of the outcomeId.
dataLocationThe location where the plpData was saved
evaluationFolderThe name of file containing the evaluation saved as a csv
the settings idsThe ids for all other settings used for model development.
+
modelDesignList
+

A list of model designs created using createModelDesign()

-

Details

+
onlyFetchData
+

Only fetches and saves the data object to the output folder without running the analysis.

+ +
cohortDefinitions
+

A list of cohort definitions for the target and outcome cohorts

+ + +
logSettings
+

The setting specifying the logging for the analyses created using createLogSettings()

+ + +
saveDirectory
+

Name of the folder where all the outputs will written to.

+ + +
sqliteLocation
+

(optional) The location of the sqlite database with the results

+ +
+
+

Value

+ + +

A data frame with the following columns:

analysisIdThe unique identifier +for a set of analysis choices.
targetIdThe ID of the target cohort populations.
outcomeIdThe ID of the outcomeId.
dataLocationThe location where the plpData was saved
the settings idsThe ids for all other settings used for model development.
+
+

Details

This function will run all specified predictions as defined using .

+
+
-
- +
- - + + diff --git a/docs/reference/runPlp.html b/docs/reference/runPlp.html index d3a6a999e..2eb237596 100644 --- a/docs/reference/runPlp.html +++ b/docs/reference/runPlp.html @@ -1,72 +1,17 @@ - - - - - - - -runPlp - Develop and internally evaluate a model using specified settings — runPlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -runPlp - Develop and internally evaluate a model using specified settings — runPlp • PatientLevelPrediction - - - - - - - - - - - + + - - -
-
- -
- -
+
@@ -193,145 +141,139 @@

runPlp - Develop and internally evaluate a model using specified settings

-
runPlp(
-  plpData,
-  outcomeId = plpData$metaData$call$outcomeIds[1],
-  analysisId = paste(Sys.Date(), plpData$metaData$call$outcomeIds[1], sep = "-"),
-  analysisName = "Study details",
-  populationSettings = createStudyPopulationSettings(),
-  splitSettings = createDefaultSplitSetting(type = "stratified", testFraction = 0.25,
-    trainFraction = 0.75, splitSeed = 123, nfold = 3),
-  sampleSettings = createSampleSettings(type = "none"),
-  featureEngineeringSettings = createFeatureEngineeringSettings(type = "none"),
-  preprocessSettings = createPreprocessSettings(minFraction = 0.001, normalize = T),
-  modelSettings = setLassoLogisticRegression(),
-  logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName =
-    "runPlp Log"),
-  executeSettings = createDefaultExecuteSettings(),
-  saveDirectory = getwd()
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
plpData

An object of type plpData - the patient level prediction -data extracted from the CDM.

outcomeId

(integer) The ID of the outcome.

analysisId

(integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.

analysisName

(character) Name for the analysis

populationSettings

An object of type populationSettings created using createStudyPopulationSettings that +

+
runPlp(
+  plpData,
+  outcomeId = plpData$metaData$call$outcomeIds[1],
+  analysisId = paste(Sys.Date(), plpData$metaData$call$outcomeIds[1], sep = "-"),
+  analysisName = "Study details",
+  populationSettings = createStudyPopulationSettings(),
+  splitSettings = createDefaultSplitSetting(type = "stratified", testFraction = 0.25,
+    trainFraction = 0.75, splitSeed = 123, nfold = 3),
+  sampleSettings = createSampleSettings(type = "none"),
+  featureEngineeringSettings = createFeatureEngineeringSettings(type = "none"),
+  preprocessSettings = createPreprocessSettings(minFraction = 0.001, normalize = T),
+  modelSettings = setLassoLogisticRegression(),
+  logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName =
+    "runPlp Log"),
+  executeSettings = createDefaultExecuteSettings(),
+  saveDirectory = getwd()
+)
+
+ +
+

Arguments

+
plpData
+

An object of type plpData - the patient level prediction +data extracted from the CDM. Can also include an initial population as +plpData$popualtion.

+ + +
outcomeId
+

(integer) The ID of the outcome.

+ + +
analysisId
+

(integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.

+ + +
analysisName
+

(character) Name for the analysis

+ + +
populationSettings
+

An object of type populationSettings created using createStudyPopulationSettings that specifies how the data class labels are defined and addition any exclusions to apply to the -plpData cohort

splitSettings

An object of type splitSettings that specifies how to split the data into train/validation/test. -The default settings can be created using createDefaultSplitSetting.

sampleSettings

An object of type sampleSettings that specifies any under/over sampling to be done. -The default is none.

featureEngineeringSettings

An object of featureEngineeringSettings specifying any feature engineering to be learned (using the train data)

preprocessSettings

An object of preprocessSettings. This setting specifies the minimum fraction of +plpData cohort

+ + +
splitSettings
+

An object of type splitSettings that specifies how to split the data into train/validation/test. +The default settings can be created using createDefaultSplitSetting.

+ + +
sampleSettings
+

An object of type sampleSettings that specifies any under/over sampling to be done. +The default is none.

+ + +
featureEngineeringSettings
+

An object of featureEngineeringSettings specifying any feature engineering to be learned (using the train data)

+ + +
preprocessSettings
+

An object of preprocessSettings. This setting specifies the minimum fraction of target population who must have a covariate for it to be included in the model training -and whether to normalise the covariates before training

modelSettings

An object of class modelSettings created using one of the function:

    -
  • setLassoLogisticRegression() A lasso logistic regression model

  • +and whether to normalise the covariates before training

    + + +
    modelSettings
    +

    An object of class modelSettings created using one of the function:

    • setLassoLogisticRegression() A lasso logistic regression model

    • setGradientBoostingMachine() A gradient boosting machine

    • setAdaBoost() An ada boost model

    • setRandomForest() A random forest model

    • setDecisionTree() A decision tree model

    • -
    • setCovNN()) A convolutional neural network model

    • -
    • setCIReNN() A recurrent neural network model

    • -
    • setMLP() A neural network model

    • -
    • setDeepNN() A deep neural network model

    • setKNN() A KNN model

    • -
logSettings

An object of logSettings created using createLogSettings -specifying how the logging is done

executeSettings

An object of executeSettings specifying which parts of the analysis to run

saveDirectory

The path to the directory where the results will be saved (if NULL uses working directory)

- -

Value

- -

An object containing the following:

-

    -
  • inputSettingsA list containing all the settings used to develop the model

  • -
  • model The developed model of class plpModel

  • +
+ + +
logSettings
+

An object of logSettings created using createLogSettings +specifying how the logging is done

+ + +
executeSettings
+

An object of executeSettings specifying which parts of the analysis to run

+ + +
saveDirectory
+

The path to the directory where the results will be saved (if NULL uses working directory)

+ +
+
+

Value

+ + +

An object containing the following:

+

+
  • model The developed model of class plpModel

  • executionSummary A list containing the hardward details, R package details and execution time

  • performanceEvaluation Various internal performance metrics in sparse format

  • prediction The plpData cohort table with the predicted risks added as a column (named value)

  • -
  • covariateSummary) A characterization of the features for patients with and without the outcome during the time at risk

  • +
  • covariateSummary A characterization of the features for patients with and without the outcome during the time at risk

  • analysisRef A list with details about the analysis

  • -
- -

Details

- +
+
+

Details

This function takes as input the plpData extracted from an OMOP CDM database and follows the specified settings to develop and internally validate a model for the specified outcomeId.

+
-

Examples

-

+    
+

Examples

+ +
+
-
- +
- - + + diff --git a/docs/reference/runPlpAnalyses.html b/docs/reference/runPlpAnalyses.html deleted file mode 100644 index 4ab0d4ed9..000000000 --- a/docs/reference/runPlpAnalyses.html +++ /dev/null @@ -1,383 +0,0 @@ - - - - - - - - -Run a list of predictions — runPlpAnalyses • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Run a list of predictions

-
- -
runPlpAnalyses(
-  connectionDetails,
-  cdmDatabaseSchema,
-  cdmDatabaseName,
-  oracleTempSchema = cdmDatabaseSchema,
-  cohortDatabaseSchema = cdmDatabaseSchema,
-  cohortTable = "cohort",
-  outcomeDatabaseSchema = cdmDatabaseSchema,
-  outcomeTable = "cohort",
-  cdmVersion = 5,
-  onlyFetchData = FALSE,
-  outputFolder = "./PlpOutput",
-  modelAnalysisList,
-  cohortIds,
-  cohortNames,
-  outcomeIds,
-  outcomeNames,
-  washoutPeriod = 0,
-  maxSampleSize = NULL,
-  minCovariateFraction = 0,
-  normalizeData = T,
-  testSplit = "person",
-  testFraction = 0.25,
-  splitSeed = NULL,
-  nfold = 3,
-  verbosity = "INFO",
-  settings = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
connectionDetails

An R object of type connectionDetails created using the -function createConnectionDetails in the -DatabaseConnector package.

cdmDatabaseSchema

The name of the database schema that contains the OMOP CDM -instance. Requires read permissions to this database. On SQL -Server, this should specifiy both the database and the schema, -so for example 'cdm_instance.dbo'.

cdmDatabaseName

A string with a shareable name of the database (this will be shown to OHDSI researchers if the results get transported)

oracleTempSchema

For Oracle only: the name of the database schema where you -want all temporary tables to be managed. Requires -create/insert permissions to this database.

cohortDatabaseSchema

The name of the database schema that is the location where the -target cohorts are available. Requires read -permissions to this database.

cohortTable

The tablename that contains the target cohorts. Expectation is cohortTable -has format of COHORT table: COHORT_DEFINITION_ID, SUBJECT_ID, -COHORT_START_DATE, COHORT_END_DATE.

outcomeDatabaseSchema

The name of the database schema that is the location where the -data used to define the outcome cohorts is available. Requires read permissions to -this database.

outcomeTable

The tablename that contains the outcome cohorts. Expectation is -outcomeTable has format of COHORT table: COHORT_DEFINITION_ID, -SUBJECT_ID, COHORT_START_DATE, COHORT_END_DATE.

cdmVersion

Define the OMOP CDM version used: currently support "4" and -"5".

onlyFetchData

Only fetches and saves the data object to the output folder without running the analysis.

outputFolder

Name of the folder where all the outputs will written to.

modelAnalysisList

A list of objects of type modelSettings as created using -the createPlpModelSettings function.

cohortIds

A vector of cohortIds that specify all the target cohorts

cohortNames

A vector of cohortNames corresponding to the cohortIds

outcomeIds

A vector of outcomeIds that specify all the outcome cohorts

outcomeNames

A vector of outcomeNames corresponding to the outcomeIds

washoutPeriod

Minimum number of prior observation days

maxSampleSize

Max number of target people to sample from to develop models

minCovariateFraction

Any covariate with an incidence less than this value if ignored

normalizeData

Whether to normalize the covariates

testSplit

How to split into test/train (time or person)

testFraction

Fraction of data to use as test set

splitSeed

The seed used for the randomization into test/train

nfold

Number of folds used to do cross validation

verbosity

The logging level

settings

Specify the T, O, population, covariate and model settings

- -

Value

- -

A data frame with the following columns:

- - - - - - - -
analysisIdThe unique identifier -for a set of analysis choices.
cohortIdThe ID of the target cohort populations.
outcomeIdThe ID of the outcomeId.
plpDataFolderThe location where the plpData was saved
studyPopFileThe -name of the file containing the study population
evaluationFolderThe name of file containing the evaluation saved as a csv
modelFolderThe name of the file containing the developed model.
- - -

Details

- -

Run a list of predictions for the target cohorts and outcomes of interest. This function will run all -specified predictions, meaning that the total number of outcome -models is `length(cohortIds) * length(outcomeIds) * length(modelAnalysisList)`.

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/saveEnsemblePlpModel.html b/docs/reference/saveEnsemblePlpModel.html deleted file mode 100644 index 18b08bff5..000000000 --- a/docs/reference/saveEnsemblePlpModel.html +++ /dev/null @@ -1,226 +0,0 @@ - - - - - - - - -saves the Ensmeble plp model — saveEnsemblePlpModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

saves the Ensmeble plp model

-
- -
saveEnsemblePlpModel(ensembleModel, dirPath)
- -

Arguments

- - - - - - - - - - -
ensembleModel

The ensemble model to save

dirPath

The location to save the model

- -

Details

- -

Saves a plp ensemble model

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/saveEnsemblePlpResult.html b/docs/reference/saveEnsemblePlpResult.html deleted file mode 100644 index 323846315..000000000 --- a/docs/reference/saveEnsemblePlpResult.html +++ /dev/null @@ -1,226 +0,0 @@ - - - - - - - - -saves the Ensemble plp results — saveEnsemblePlpResult • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

saves the Ensemble plp results

-
- -
saveEnsemblePlpResult(ensembleResult, dirPath)
- -

Arguments

- - - - - - - - - - -
ensembleResult

The ensemble result

dirPath

The location to save the ensemble results

- -

Details

- -

Saves a plp ensemble results

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/savePlpAnalysesJson.html b/docs/reference/savePlpAnalysesJson.html index d5d6cc2aa..675b05e57 100644 --- a/docs/reference/savePlpAnalysesJson.html +++ b/docs/reference/savePlpAnalysesJson.html @@ -1,67 +1,12 @@ - - - - - - - -Save the modelDesignList to a json file — savePlpAnalysesJson • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Save the modelDesignList to a json file — savePlpAnalysesJson • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -183,67 +131,70 @@

Save the modelDesignList to a json file

Save the modelDesignList to a json file

-
savePlpAnalysesJson(
-  modelDesignList = list(createModelDesign(targetId = 1, outcomeId = 2, modelSettings =
-    setLassoLogisticRegression()), createModelDesign(targetId = 1, outcomeId = 3,
-    modelSettings = setLassoLogisticRegression())),
-  saveDirectory = NULL
-)
- -

Arguments

- - - - - - - - - - -
modelDesignList

A list of modelDesigns created using createModelDesign()

saveDirectory

The directory to save the modelDesignList settings

- -

Details

+
+
savePlpAnalysesJson(
+  modelDesignList = list(createModelDesign(targetId = 1, outcomeId = 2, modelSettings =
+    setLassoLogisticRegression()), createModelDesign(targetId = 1, outcomeId = 3,
+    modelSettings = setLassoLogisticRegression())),
+  cohortDefinitions = NULL,
+  saveDirectory = NULL
+)
+
+ +
+

Arguments

+
modelDesignList
+

A list of modelDesigns created using createModelDesign()

+ + +
cohortDefinitions
+

A list of the cohortDefinitions (generally extracted from ATLAS)

+ +
saveDirectory
+

The directory to save the modelDesignList settings

+ +
+
+

Details

This function creates a json file with the modelDesignList saved

+
-

Examples

-
if (FALSE) { -savePlpAnalysesJson( -modelDesignList = list( -createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()), -createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression()) -), -saveDirectory = 'C:/bestModels' -) -} - -
+
+

Examples

+
if (FALSE) {
+savePlpAnalysesJson(
+modelDesignList = list(
+createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()), 
+createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
+),
+saveDirectory = 'C:/bestModels'
+)
+}
+
+
+
+
-
- +
- - + + diff --git a/docs/reference/savePlpData.html b/docs/reference/savePlpData.html index e31608d82..54a2af0ae 100644 --- a/docs/reference/savePlpData.html +++ b/docs/reference/savePlpData.html @@ -1,67 +1,12 @@ - - - - - - - -Save the cohort data to folder — savePlpData • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Save the cohort data to folder — savePlpData • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,64 +131,63 @@

Save the cohort data to folder

savePlpData saves an object of type plpData to folder.

-
savePlpData(plpData, file, envir = NULL, overwrite = F)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
plpData

An object of type plpData as generated using -getPlpData.

file

The name of the folder where the data will be written. The folder should -not yet exist.

envir

The environment for to evaluate variables when saving

overwrite

Whether to force overwrite an existing file

- -

Details

+
+
savePlpData(plpData, file, envir = NULL, overwrite = F)
+
+ +
+

Arguments

+
plpData
+

An object of type plpData as generated using +getPlpData.

+ + +
file
+

The name of the folder where the data will be written. The folder should +not yet exist.

+ + +
envir
+

The environment for to evaluate variables when saving

-

The data will be written to a set of files in the folder specified by the user.

-

Examples

-
# todo +
overwrite
+

Whether to force overwrite an existing file

-
+
+
+

Details

+

The data will be written to a set of files in the folder specified by the user.

+
+ +
+

Examples

+
# todo
+
+
+
+
+
-
- +
- - + + diff --git a/docs/reference/savePlpModel.html b/docs/reference/savePlpModel.html index 16ae57e6c..786674942 100644 --- a/docs/reference/savePlpModel.html +++ b/docs/reference/savePlpModel.html @@ -1,67 +1,12 @@ - - - - - - - -Saves the plp model — savePlpModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Saves the plp model — savePlpModel • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,50 +131,46 @@

Saves the plp model

Saves the plp model

-
savePlpModel(plpModel, dirPath)
+
+
savePlpModel(plpModel, dirPath)
+
+ +
+

Arguments

+
plpModel
+

A trained classifier returned by running runPlp()$model

-

Arguments

- - - - - - - - - - -
plpModel

A trained classifier returned by running runPlp()$model

dirPath

A location to save the model to

-

Details

+
dirPath
+

A location to save the model to

+
+
+

Details

Saves the plp model to a user specificed folder

+
+
-
- +
- - + + diff --git a/docs/reference/savePlpResult.html b/docs/reference/savePlpResult.html index 55990afa7..52064bba2 100644 --- a/docs/reference/savePlpResult.html +++ b/docs/reference/savePlpResult.html @@ -1,67 +1,12 @@ - - - - - - - -Saves the result from runPlp into the location directory — savePlpResult • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Saves the result from runPlp into the location directory — savePlpResult • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,50 +131,46 @@

Saves the result from runPlp into the location directory

Saves the result from runPlp into the location directory

-
savePlpResult(result, dirPath)
+
+
savePlpResult(result, dirPath)
+
+ +
+

Arguments

+
result
+

The result of running runPlp()

-

Arguments

- - - - - - - - - - -
result

The result of running runPlp()

dirPath

The directory to save the csv

-

Details

+
dirPath
+

The directory to save the csv

+
+
+

Details

Saves the result from runPlp into the location directory

+
+
-
- +
- - + + diff --git a/docs/reference/savePlpShareable.html b/docs/reference/savePlpShareable.html index 90e5e99e2..828518125 100644 --- a/docs/reference/savePlpShareable.html +++ b/docs/reference/savePlpShareable.html @@ -1,67 +1,12 @@ - - - - - - - -Save the plp result as json files and csv files for transparent sharing — savePlpShareable • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Save the plp result as json files and csv files for transparent sharing — savePlpShareable • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -183,54 +131,50 @@

Save the plp result as json files and csv files for transparent sharing

Save the plp result as json files and csv files for transparent sharing

-
savePlpShareable(result, saveDirectory, minCellCount = 10)
- -

Arguments

- - - - - - - - - - - - - - -
result

An object of class runPlp with development or validation results

saveDirectory

The directory the save the results as csv files

minCellCount

Minimum cell count for the covariateSummary and certain evaluation results

- -

Details

+
+
savePlpShareable(result, saveDirectory, minCellCount = 10)
+
+ +
+

Arguments

+
result
+

An object of class runPlp with development or validation results

+ + +
saveDirectory
+

The directory the save the results as csv files

+ +
minCellCount
+

Minimum cell count for the covariateSummary and certain evaluation results

+ +
+
+

Details

Saves the main results json/csv files (these files can be read by the shiny app)

+
+
-
- +
- - + + diff --git a/docs/reference/savePlpToCsv.html b/docs/reference/savePlpToCsv.html deleted file mode 100644 index 367a94073..000000000 --- a/docs/reference/savePlpToCsv.html +++ /dev/null @@ -1,226 +0,0 @@ - - - - - - - - -Save parts of the plp result as a csv for transparent sharing — savePlpToCsv • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Save parts of the plp result as a csv for transparent sharing

-
- -
savePlpToCsv(result, dirPath)
- -

Arguments

- - - - - - - - - - -
result

An object of class runPlp with development or validation results

dirPath

The directory the save the results as csv files

- -

Details

- -

Saves the main results as a csv (these files can be read by the shiny app)

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/savePrediction.html b/docs/reference/savePrediction.html index c6b104576..6ff5a6ff3 100644 --- a/docs/reference/savePrediction.html +++ b/docs/reference/savePrediction.html @@ -1,67 +1,12 @@ - - - - - - - -Saves the prediction dataframe to RDS — savePrediction • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Saves the prediction dataframe to RDS — savePrediction • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -183,54 +131,50 @@

Saves the prediction dataframe to RDS

Saves the prediction dataframe to RDS

-
savePrediction(prediction, dirPath, fileName = "prediction.rds")
- -

Arguments

- - - - - - - - - - - - - - -
prediction

The prediciton data.frame

dirPath

The directory to save the prediction RDS

fileName

The name of the RDS file that will be saved in dirPath

- -

Details

+
+
savePrediction(prediction, dirPath, fileName = "prediction.rds")
+
+ +
+

Arguments

+
prediction
+

The prediciton data.frame

+ + +
dirPath
+

The directory to save the prediction RDS

+ +
fileName
+

The name of the RDS file that will be saved in dirPath

+ +
+
+

Details

Saves the prediction data frame returned by predict.R to an RDS file and returns the fileLocation where the prediction is saved

+
+
-
- +
- - + + diff --git a/docs/reference/savePredictionAnalysisList.html b/docs/reference/savePredictionAnalysisList.html deleted file mode 100644 index 05ec5e435..000000000 --- a/docs/reference/savePredictionAnalysisList.html +++ /dev/null @@ -1,295 +0,0 @@ - - - - - - - - -Saves a json prediction settings given R settings — savePredictionAnalysisList • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Saves a json prediction settings given R settings

-
- -
savePredictionAnalysisList(
-  workFolder = "inst/settings",
-  cohortIds,
-  outcomeIds,
-  cohortSettingCsv = file.path(workFolder, "CohortsToCreate.csv"),
-  covariateSettingList,
-  populationSettingList,
-  modelSettingList,
-  maxSampleSize = NULL,
-  washoutPeriod = 0,
-  minCovariateFraction = 0,
-  normalizeData = T,
-  testSplit = "person",
-  testFraction = 0.25,
-  splitSeed = 1,
-  nfold = 3
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
workFolder

Location to save json specification

cohortIds

Vector of target population cohort ids

outcomeIds

Vector of outcome cohort ids

cohortSettingCsv

The location to the csv containing the cohort details

covariateSettingList

A list of covariate settings

populationSettingList

A list of population settings

modelSettingList

A list of model settings

maxSampleSize

If not NULL then max number of target population to sample for model training

washoutPeriod

Minimum prior observation for each person in target pop to be included

minCovariateFraction

Minimum covariate fraction to include

normalizeData

Whether to normalise data

testSplit

Split by person or time

testFraction

Fractiuon of data to use for test set

splitSeed

Seed used in test split

nfold

Number of folds used when training model

- -

Details

- -

This function interprets a json with the multiple prediction settings and creates a list -that can be combined with connection settings to run a multiple prediction study

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/sensitivity.html b/docs/reference/sensitivity.html index 8d96e3d1d..512fa41d5 100644 --- a/docs/reference/sensitivity.html +++ b/docs/reference/sensitivity.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate the sensitivity — sensitivity • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the sensitivity — sensitivity • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,61 +131,60 @@

Calculate the sensitivity

Calculate the sensitivity

-
sensitivity(TP, TN, FN, FP)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
TP

Number of true positives

TN

Number of true negatives

FN

Number of false negatives

FP

Number of false positives

- -

Value

- -

sensitivity value

-

Details

+
+
sensitivity(TP, TN, FN, FP)
+
+
+

Arguments

+
TP
+

Number of true positives

+ + +
TN
+

Number of true negatives

+ + +
FN
+

Number of false negatives

+ + +
FP
+

Number of false positives

+ +
+
+

Value

+ + +

sensitivity value

+
+
+

Details

Calculate the sensitivity

+
+
-
- +
- - + + diff --git a/docs/reference/setAdaBoost.html b/docs/reference/setAdaBoost.html index ddc12d5d6..2ad4c4aea 100644 --- a/docs/reference/setAdaBoost.html +++ b/docs/reference/setAdaBoost.html @@ -1,67 +1,12 @@ - - - - - - - -Create setting for AdaBoost with python DecisionTreeClassifier base estimator — setAdaBoost • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create setting for AdaBoost with python DecisionTreeClassifier base estimator — setAdaBoost • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,68 +131,65 @@

Create setting for AdaBoost with python DecisionTreeClassifier base estimato

Create setting for AdaBoost with python DecisionTreeClassifier base estimator

-
setAdaBoost(
-  nEstimators = list(10, 50, 200),
-  learningRate = list(1, 0.5, 0.1),
-  algorithm = list("SAMME.R"),
-  seed = sample(1e+06, 1)
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
nEstimators

(list) The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early.

learningRate

(list) Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution of each classifier. There is a trade-off between the learningRate and nEstimators parameters -There is a trade-off between learningRate and nEstimators.

algorithm

(list) If ‘SAMME.R’ then use the SAMME.R real boosting algorithm. base_estimator must support calculation of class probabilities. If ‘SAMME’ then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations.

seed

A seed for the model

- - -

Examples

-
if (FALSE) { -model.adaBoost <- setAdaBoost(nEstimators = list(10,50,200), learningRate = list(1, 0.5, 0.1), - algorithm = list('SAMME.R'), seed = sample(1000000,1) - ) -} -
+
+
setAdaBoost(
+  nEstimators = list(10, 50, 200),
+  learningRate = list(1, 0.5, 0.1),
+  algorithm = list("SAMME.R"),
+  seed = sample(1e+06, 1)
+)
+
+ +
+

Arguments

+
nEstimators
+

(list) The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early.

+ + +
learningRate
+

(list) Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution of each classifier. There is a trade-off between the learningRate and nEstimators parameters +There is a trade-off between learningRate and nEstimators.

+ + +
algorithm
+

(list) If ‘SAMME.R’ then use the SAMME.R real boosting algorithm. base_estimator must support calculation of class probabilities. If ‘SAMME’ then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations.

+ + +
seed
+

A seed for the model

+ +
+ +
+

Examples

+
if (FALSE) {
+model.adaBoost <- setAdaBoost(nEstimators = list(10,50,200), learningRate = list(1, 0.5, 0.1),
+                              algorithm = list('SAMME.R'), seed = sample(1000000,1)
+                              )
+}
+
+
+
-
- +
- - + + diff --git a/docs/reference/setCIReNN.html b/docs/reference/setCIReNN.html deleted file mode 100644 index f62ecff7e..000000000 --- a/docs/reference/setCIReNN.html +++ /dev/null @@ -1,346 +0,0 @@ - - - - - - - - -Create setting for CIReNN model — setCIReNN • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Create setting for CIReNN model

-
- -
setCIReNN(
-  numberOfRNNLayer = c(1),
-  units = c(128, 64),
-  recurrentDropout = c(0.2),
-  layerDropout = c(0.2),
-  lr = c(1e-04),
-  decay = c(1e-05),
-  outcomeWeight = c(0),
-  batchSize = c(100),
-  epochs = c(100),
-  earlyStoppingMinDelta = c(1e-04),
-  earlyStoppingPatience = c(10),
-  bayes = T,
-  useDeepEnsemble = F,
-  numberOfEnsembleNetwork = 5,
-  useVae = T,
-  vaeDataSamplingProportion = 0.1,
-  vaeValidationSplit = 0.2,
-  vaeBatchSize = 100L,
-  vaeLatentDim = 10L,
-  vaeIntermediateDim = 256L,
-  vaeEpoch = 100L,
-  vaeEpislonStd = 1,
-  useGPU = FALSE,
-  maxGPUs = 2,
-  seed = 1234
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
numberOfRNNLayer

The number of RNN layer, only 1, 2, or 3 layers available now. eg. 1, c(1,2), c(1,2,3)

units

The number of units of RNN layer - as a list of vectors

recurrentDropout

The reccurrent dropout rate (regularisation)

layerDropout

The layer dropout rate (regularisation)

lr

Learning rate

decay

Learning rate decay over each update.

outcomeWeight

The weight of the outcome class in the loss function. Default is 0, which will be replaced by balanced weight.

batchSize

The number of data points to use per training batch

epochs

Number of times to iterate over dataset

earlyStoppingMinDelta

minimum change in the monitored quantity to qualify as an improvement for early stopping, i.e. an absolute change of less than min_delta in loss of validation data, will count as no improvement.

earlyStoppingPatience

Number of epochs with no improvement after which training will be stopped.

bayes

logical (either TRUE or FALSE) value for using Bayesian Drop Out Layer to measure uncertainty. If it is TRUE, both Epistemic and Aleatoric uncertainty will be measured through Bayesian Drop Out layer

useDeepEnsemble

logical (either TRUE or FALSE) value for using Deep Ensemble (Lakshminarayanan et al., 2017) to measure uncertainty. It cannot be used together with Bayesian deep learing.

numberOfEnsembleNetwork

Integer. Number of network used for Deep Ensemble (Lakshminarayanan et al recommended 5).

useVae

logical (either TRUE or FALSE) value for using Variational AutoEncoder before RNN

vaeDataSamplingProportion

Data sampling proportion for VAE

vaeValidationSplit

Validation split proportion for VAE

vaeBatchSize

batch size for VAE

vaeLatentDim

Number of latent dimesion for VAE

vaeIntermediateDim

Number of intermediate dimesion for VAE

vaeEpoch

Number of times to interate over dataset for VAE

vaeEpislonStd

Epsilon

useGPU

logical (either TRUE or FALSE) value. If you have GPUs in your machine, and want to use multiple GPU for deep learning, set this value as TRUE

maxGPUs

Integer, If you will use GPU, how many GPUs will be used for deep learning in VAE? GPU parallelisation for deep learning will be activated only when parallel vae is true. Integer >= 2 or list of integers, number of GPUs or list of GPU IDs on which to create model replicas.

seed

Random seed used by deep learning model

- - -

Examples

-
if (FALSE) { -model.CIReNN <- setCIReNN() -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/setCNNTorch.html b/docs/reference/setCNNTorch.html deleted file mode 100644 index 798867100..000000000 --- a/docs/reference/setCNNTorch.html +++ /dev/null @@ -1,249 +0,0 @@ - - - - - - - - -Create setting for CNN model with python — setCNNTorch • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Create setting for CNN model with python

-
- -
setCNNTorch(
-  nbfilters = c(16, 32),
-  epochs = c(20, 50),
-  seed = 0,
-  class_weight = 0,
-  type = "CNN"
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
nbfilters

The number of filters

epochs

The number of epochs

seed

A seed for the model

class_weight

The class weight used for imbalanced data: - 0: Inverse ratio between positives and negatives --1: Focal loss

type

It can be normal 'CNN', 'CNN_LSTM', CNN_MLF' with multiple kernels with different kernel size, -'CNN_MIX', 'ResNet' and 'CNN_MULTI'

- - -

Examples

-
if (FALSE) { -model.cnnTorch <- setCNNTorch() -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/setCompetingRiskModel.html b/docs/reference/setCompetingRiskModel.html deleted file mode 100644 index 0c452ef69..000000000 --- a/docs/reference/setCompetingRiskModel.html +++ /dev/null @@ -1,222 +0,0 @@ - - - - - - - - -Create setting for competing risk model (uses Fine-Gray model in Cyclops) — setCompetingRiskModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Create setting for competing risk model (uses Fine-Gray model in Cyclops)

-
- -
setCompetingRiskModel(seed = NULL)
- -

Arguments

- - - - - - -
seed

An option to add a seed when training the model

- - -

Examples

-
model.lr <- setCompetingRiskModel() -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/setCovNN.html b/docs/reference/setCovNN.html deleted file mode 100644 index cc442a23a..000000000 --- a/docs/reference/setCovNN.html +++ /dev/null @@ -1,271 +0,0 @@ - - - - - - - - -Create setting for multi-resolution CovNN model (stucture based on https://arxiv.org/pdf/1608.00647.pdf CNN1) — setCovNN • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Create setting for multi-resolution CovNN model (stucture based on https://arxiv.org/pdf/1608.00647.pdf CNN1)

-
- -
setCovNN(
-  batchSize = 1000,
-  outcomeWeight = 1,
-  lr = 1e-05,
-  decay = 1e-06,
-  dropout = 0,
-  epochs = 10,
-  filters = 3,
-  kernelSize = 10,
-  loss = "binary_crossentropy",
-  seed = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
batchSize

The number of samples to used in each batch during model training

outcomeWeight

The weight assined to the outcome (make greater than 1 to reduce unballanced label issue)

lr

The learning rate

decay

The decay of the learning rate

dropout

[currently not used] the dropout rate for regularisation

epochs

The number of times data is used to train the model (e.g., epoches=1 means data only used once to train)

filters

The number of columns output by each convolution

kernelSize

The number of time dimensions used for each convolution

loss

The loss function implemented

seed

The random seed

- - -

Examples

-
if (FALSE) { -model.CovNN <- setCovNN() -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/setCovNN2.html b/docs/reference/setCovNN2.html deleted file mode 100644 index bd8db7d28..000000000 --- a/docs/reference/setCovNN2.html +++ /dev/null @@ -1,271 +0,0 @@ - - - - - - - - -Create setting for CovNN2 model - convolution across input and time - https://arxiv.org/pdf/1608.00647.pdf — setCovNN2 • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Create setting for CovNN2 model - convolution across input and time - https://arxiv.org/pdf/1608.00647.pdf

-
- -
setCovNN2(
-  batchSize = 1000,
-  outcomeWeight = 1,
-  lr = 1e-05,
-  decay = 1e-06,
-  dropout = 0,
-  epochs = 10,
-  filters = 3,
-  kernelSize = 10,
-  loss = "binary_crossentropy",
-  seed = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
batchSize

The number of samples to used in each batch during model training

outcomeWeight

The weight assined to the outcome (make greater than 1 to reduce unballanced label issue)

lr

The learning rate

decay

The decay of the learning rate

dropout

[currently not used] the dropout rate for regularisation

epochs

The number of times data is used to train the model (e.g., epoches=1 means data only used once to train)

filters

The number of columns output by each convolution

kernelSize

The number of time dimensions used for each convolution

loss

The loss function implemented

seed

The random seed

- - -

Examples

-
if (FALSE) { -model.CovNN <- setCovNN() -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/setCoxModel.html b/docs/reference/setCoxModel.html index 868631ba2..072cb424c 100644 --- a/docs/reference/setCoxModel.html +++ b/docs/reference/setCoxModel.html @@ -1,67 +1,12 @@ - - - - - - - -Create setting for lasso Cox model — setCoxModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create setting for lasso Cox model — setCoxModel • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,88 +131,85 @@

Create setting for lasso Cox model

Create setting for lasso Cox model

-
setCoxModel(
-  variance = 0.01,
-  seed = NULL,
-  includeCovariateIds = c(),
-  noShrinkage = c(),
-  threads = -1,
-  upperLimit = 20,
-  lowerLimit = 0.01,
-  tolerance = 2e-07,
-  maxIterations = 3000
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
variance

Numeric: prior distribution starting variance

seed

An option to add a seed when training the model

includeCovariateIds

a set of covariate IDS to limit the analysis to

noShrinkage

a set of covariates whcih are to be forced to be included in the final model. default is the intercept

threads

An option to set number of threads when training model

upperLimit

Numeric: Upper prior variance limit for grid-search

lowerLimit

Numeric: Lower prior variance limit for grid-search

tolerance

Numeric: maximum relative change in convergence criterion from successive iterations to achieve convergence

maxIterations

Integer: maximum iterations of Cyclops to attempt before returning a failed-to-converge error

- - -

Examples

-
model.lr <- setCoxModel() -
+
+
setCoxModel(
+  variance = 0.01,
+  seed = NULL,
+  includeCovariateIds = c(),
+  noShrinkage = c(),
+  threads = -1,
+  upperLimit = 20,
+  lowerLimit = 0.01,
+  tolerance = 2e-07,
+  maxIterations = 3000
+)
+
+ +
+

Arguments

+
variance
+

Numeric: prior distribution starting variance

+ + +
seed
+

An option to add a seed when training the model

+ + +
includeCovariateIds
+

a set of covariate IDS to limit the analysis to

+ + +
noShrinkage
+

a set of covariates whcih are to be forced to be included in the final model. default is the intercept

+ + +
threads
+

An option to set number of threads when training model

+ + +
upperLimit
+

Numeric: Upper prior variance limit for grid-search

+ + +
lowerLimit
+

Numeric: Lower prior variance limit for grid-search

+ + +
tolerance
+

Numeric: maximum relative change in convergence criterion from successive iterations to achieve convergence

+ + +
maxIterations
+

Integer: maximum iterations of Cyclops to attempt before returning a failed-to-converge error

+ +
+ +
+

Examples

+
model.lr <- setCoxModel()
+
+
+
-
- +
- - + + diff --git a/docs/reference/setDecisionTree.html b/docs/reference/setDecisionTree.html index fd8302977..1a6f0906f 100644 --- a/docs/reference/setDecisionTree.html +++ b/docs/reference/setDecisionTree.html @@ -1,67 +1,12 @@ - - - - - - - -Create setting for the scikit-learn 1.0.1 DecisionTree with python — setDecisionTree • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create setting for the scikit-learn 1.0.1 DecisionTree with python — setDecisionTree • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -183,100 +131,97 @@

Create setting for the scikit-learn 1.0.1 DecisionTree with python

Create setting for the scikit-learn 1.0.1 DecisionTree with python

-
setDecisionTree(
-  criterion = list("gini"),
-  splitter = list("best"),
-  maxDepth = list(as.integer(4), as.integer(10), NULL),
-  minSamplesSplit = list(2, 10),
-  minSamplesLeaf = list(10, 50),
-  minWeightFractionLeaf = list(0),
-  maxFeatures = list(100, "auto", NULL),
-  maxLeafNodes = list(NULL),
-  minImpurityDecrease = list(10^-7),
-  classWeight = list(NULL, "balanced"),
-  seed = sample(1e+06, 1)
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
criterion

The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.

splitter

The strategy used to choose the split at each node. Supported strategies are “best” to choose the best split and “random” to choose the best random split.

maxDepth

(list) The maximum depth of the tree. If NULL, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.

minSamplesSplit

The minimum number of samples required to split an internal node

minSamplesLeaf

The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least minSamplesLeaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.

minWeightFractionLeaf

The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sampleWeight is not provided.

maxFeatures

(list) The number of features to consider when looking for the best split (int/'auto'/NULL)

maxLeafNodes

(list) Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. (int/NULL)

minImpurityDecrease

Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf.

classWeight

(list) Weights associated with classes 'balance' or NULL

seed

The random state seed

- - -

Examples

-
if (FALSE) { -model.decisionTree <- setDecisionTree(maxDepth=10,minSamplesLeaf=10, seed=NULL ) -} -
+
+
setDecisionTree(
+  criterion = list("gini"),
+  splitter = list("best"),
+  maxDepth = list(as.integer(4), as.integer(10), NULL),
+  minSamplesSplit = list(2, 10),
+  minSamplesLeaf = list(10, 50),
+  minWeightFractionLeaf = list(0),
+  maxFeatures = list(100, "sqrt", NULL),
+  maxLeafNodes = list(NULL),
+  minImpurityDecrease = list(10^-7),
+  classWeight = list(NULL),
+  seed = sample(1e+06, 1)
+)
+
+ +
+

Arguments

+
criterion
+

The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.

+ + +
splitter
+

The strategy used to choose the split at each node. Supported strategies are “best” to choose the best split and “random” to choose the best random split.

+ + +
maxDepth
+

(list) The maximum depth of the tree. If NULL, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.

+ + +
minSamplesSplit
+

The minimum number of samples required to split an internal node

+ + +
minSamplesLeaf
+

The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least minSamplesLeaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.

+ + +
minWeightFractionLeaf
+

The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sampleWeight is not provided.

+ + +
maxFeatures
+

(list) The number of features to consider when looking for the best split (int/'sqrt'/NULL)

+ + +
maxLeafNodes
+

(list) Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. (int/NULL)

+ + +
minImpurityDecrease
+

Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf.

+ + +
classWeight
+

(list) Weights associated with classes 'balance' or NULL

+ + +
seed
+

The random state seed

+ +
+ +
+

Examples

+
if (FALSE) {
+model.decisionTree <- setDecisionTree(maxDepth=10,minSamplesLeaf=10, seed=NULL )
+}
+
+
+
-
- +
- - + + diff --git a/docs/reference/setDeepNN.html b/docs/reference/setDeepNN.html deleted file mode 100644 index d037ae6af..000000000 --- a/docs/reference/setDeepNN.html +++ /dev/null @@ -1,261 +0,0 @@ - - - - - - - - -Create setting for DeepNN model — setDeepNN • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Create setting for DeepNN model

-
- -
setDeepNN(
-  units = list(c(128, 64), 128),
-  layer_dropout = c(0.2),
-  lr = c(1e-04),
-  decay = c(1e-05),
-  outcome_weight = c(1),
-  batch_size = c(100),
-  epochs = c(100),
-  seed = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
units

The number of units of the deep network - as a list of vectors

layer_dropout

The layer dropout rate (regularisation)

lr

Learning rate

decay

Learning rate decay over each update.

outcome_weight

The weight of the outcome class in the loss function

batch_size

The number of data points to use per training batch

epochs

Number of times to iterate over dataset

seed

Random seed used by deep learning model

- - -

Examples

-
if (FALSE) { -model <- setDeepNN() -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/setGBMSurvival.html b/docs/reference/setGBMSurvival.html deleted file mode 100644 index 037065ffb..000000000 --- a/docs/reference/setGBMSurvival.html +++ /dev/null @@ -1,320 +0,0 @@ - - - - - - - - -Create setting for GBM Survival with python -#' @description -This creates a setting for fitting GBM surivial model. You need sksurv python install. To install this open your command line and type: conda install -c sebp scikit-survival — setGBMSurvival • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Create setting for GBM Survival with python -#' @description -This creates a setting for fitting GBM surivial model. You need sksurv python install. To install this open your command line and type: conda install -c sebp scikit-survival

-
- -
setGBMSurvival(
-  loss = "coxph",
-  learningRate = 0.1,
-  nEstimators = c(100),
-  criterion = "friedman_mse",
-  minSamplesSplit = 2,
-  minSamplesLeaf = 1,
-  minWeightFractionLeaf = 0,
-  maxDepth = c(3, 10, 17),
-  minImpuritySplit = NULL,
-  minImpurityDecrease = 0,
-  maxFeatures = NULL,
-  maxLeafNodes = NULL,
-  presort = NULL,
-  subsample = 1,
-  dropoutRate = 0,
-  seed = NULL,
-  quiet = F
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
loss

A string specifying the loss function to minimise (default: 'coxph' )

learningRate

A double specifying the learning rate (controls convergence speed)

nEstimators

An integer specifying how many trees to build

criterion

Default: 'friedman_mse'

minSamplesSplit

An integer specifying min samples per tree split (complexity)

minSamplesLeaf

An integer specifying min samples per leaf (complexity)

minWeightFractionLeaf

Lookup

maxDepth

An integer specifying the max depth of trees (complexity)

minImpuritySplit

A double or NULL specifying the minimum impurity split

minImpurityDecrease

will add

maxFeatures

will add

maxLeafNodes

will add

presort

will add

subsample

will add

dropoutRate

will add

seed

will add

quiet

will add

- -

Details

- -

Pick the hyper-parameters you want to do a grid search for

- -

Examples

-
if (FALSE) { -gbmSurv <- setGBMSurvival(learningRate=c(0.1,0.01), nEstimators =c(10,50,100), - maxDepth=c(4,10,17), seed = 2) -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/setGradientBoostingMachine.html b/docs/reference/setGradientBoostingMachine.html index e37a071cd..125a8d43b 100644 --- a/docs/reference/setGradientBoostingMachine.html +++ b/docs/reference/setGradientBoostingMachine.html @@ -1,67 +1,12 @@ - - - - - - - -Create setting for gradient boosting machine model using gbm_xgboost implementation — setGradientBoostingMachine • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create setting for gradient boosting machine model using gbm_xgboost implementation — setGradientBoostingMachine • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,80 +131,92 @@

Create setting for gradient boosting machine model using gbm_xgboost impleme

Create setting for gradient boosting machine model using gbm_xgboost implementation

-
setGradientBoostingMachine(
-  ntrees = c(100, 1000),
-  nthread = 20,
-  earlyStopRound = 25,
-  maxDepth = c(4, 6, 17),
-  minRows = 2,
-  learnRate = c(0.005, 0.01, 0.1),
-  seed = sample(1e+07, 1)
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ntrees

The number of trees to build

nthread

The number of computer threads to (how many cores do you have?)

earlyStopRound

If the performance does not increase over earlyStopRound number of interactions then training stops (this prevents overfitting)

maxDepth

Maximum number of interactions - a large value will lead to slow model training

minRows

The minimum number of rows required at each end node of the tree

learnRate

The boosting learn rate

seed

An option to add a seed when training the final model

- - -

Examples

-
model.gbm <- setGradientBoostingMachine(ntrees=c(10,100), nthread=20, - maxDepth=c(4,6), learnRate=c(0.1,0.3)) - -
+
+
setGradientBoostingMachine(
+  ntrees = c(100, 300),
+  nthread = 20,
+  earlyStopRound = 25,
+  maxDepth = c(4, 6, 8),
+  minChildWeight = 1,
+  learnRate = c(0.05, 0.1, 0.3),
+  scalePosWeight = 1,
+  lambda = 1,
+  alpha = 0,
+  seed = sample(1e+07, 1)
+)
+
+ +
+

Arguments

+
ntrees
+

The number of trees to build

+ + +
nthread
+

The number of computer threads to use (how many cores do you have?)

+ + +
earlyStopRound
+

If the performance does not increase over earlyStopRound number of trees then training stops (this prevents overfitting)

+ + +
maxDepth
+

Maximum depth of each tree - a large value will lead to slow model training

+ + +
minChildWeight
+

Minimum sum of of instance weight in a child node - larger values are more conservative

+ + +
learnRate
+

The boosting learn rate

+ + +
scalePosWeight
+

Controls weight of positive class in loss - useful for imbalanced classes

+ + +
lambda
+

L2 regularization on weights - larger is more conservative

+ + +
alpha
+

L1 regularization on weights - larger is more conservative

+ + +
seed
+

An option to add a seed when training the final model

+ +
+ +
+

Examples

+
model.gbm <- setGradientBoostingMachine(ntrees=c(10,100), nthread=20,
+                           maxDepth=c(4,6), learnRate=c(0.1,0.3))
+
+
+
+
-
- +
- - + + diff --git a/docs/reference/setIterativeHardThresholding.html b/docs/reference/setIterativeHardThresholding.html index 8bd6b1afb..6a234802c 100644 --- a/docs/reference/setIterativeHardThresholding.html +++ b/docs/reference/setIterativeHardThresholding.html @@ -1,67 +1,12 @@ - - - - - - - -Create setting for lasso logistic regression — setIterativeHardThresholding • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create setting for lasso logistic regression — setIterativeHardThresholding • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -183,98 +131,95 @@

Create setting for lasso logistic regression

Create setting for lasso logistic regression

-
setIterativeHardThresholding(
-  K = 10,
-  penalty = "bic",
-  seed = sample(1e+05, 1),
-  exclude = c(),
-  forceIntercept = F,
-  fitBestSubset = FALSE,
-  initialRidgeVariance = 10000,
-  tolerance = 1e-08,
-  maxIterations = 10000,
-  threshold = 1e-06,
-  delta = 0
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
K

The maximum number of non-zero predictors

penalty

Specifies the IHT penalty; possible values are `BIC` or `AIC` or a numeric value

seed

An option to add a seed when training the model

exclude

A vector of numbers or covariateId names to exclude from prior

forceIntercept

Logical: Force intercept coefficient into regularization

fitBestSubset

Logical: Fit final subset with no regularization

initialRidgeVariance

integer

tolerance

numeric

maxIterations

integer

threshold

numeric

delta

numeric

- - -

Examples

-
+
+
setIterativeHardThresholding(
+  K = 10,
+  penalty = "bic",
+  seed = sample(1e+05, 1),
+  exclude = c(),
+  forceIntercept = F,
+  fitBestSubset = FALSE,
+  initialRidgeVariance = 10000,
+  tolerance = 1e-08,
+  maxIterations = 10000,
+  threshold = 1e-06,
+  delta = 0
+)
+
+ +
+

Arguments

+
K
+

The maximum number of non-zero predictors

+ + +
penalty
+

Specifies the IHT penalty; possible values are `BIC` or `AIC` or a numeric value

+ + +
seed
+

An option to add a seed when training the model

+ + +
exclude
+

A vector of numbers or covariateId names to exclude from prior

+ + +
forceIntercept
+

Logical: Force intercept coefficient into regularization

+ + +
fitBestSubset
+

Logical: Fit final subset with no regularization

+ + +
initialRidgeVariance
+

integer

+ + +
tolerance
+

numeric

+ + +
maxIterations
+

integer

+ + +
threshold
+

numeric

+ + +
delta
+

numeric

+ +
+ +
+

Examples

+ +
+
-
- +
- - + + diff --git a/docs/reference/setKNN.html b/docs/reference/setKNN.html index 8d8a8c2ca..e3025432b 100644 --- a/docs/reference/setKNN.html +++ b/docs/reference/setKNN.html @@ -1,67 +1,12 @@ - - - - - - - -Create setting for knn model — setKNN • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create setting for knn model — setKNN • PatientLevelPrediction + + - - - - -
-
- -
- -
+
@@ -183,56 +131,53 @@

Create setting for knn model

Create setting for knn model

-
setKNN(k = 1000, indexFolder = file.path(getwd(), "knn"), threads = 1)
- -

Arguments

- - - - - - - - - - - - - - -
k

The number of neighbors to consider

indexFolder

The directory where the results and intermediate steps are output

threads

The number of threads to use when applying big knn

- - -

Examples

-
if (FALSE) { -model.knn <- setKNN(k=10000) -} -
+
+
setKNN(k = 1000, indexFolder = file.path(getwd(), "knn"), threads = 1)
+
+ +
+

Arguments

+
k
+

The number of neighbors to consider

+ + +
indexFolder
+

The directory where the results and intermediate steps are output

+ + +
threads
+

The number of threads to use when applying big knn

+ +
+ +
+

Examples

+
if (FALSE) {
+model.knn <- setKNN(k=10000)
+}
+
+
+
-
- +
- - + + diff --git a/docs/reference/setLRTorch.html b/docs/reference/setLRTorch.html deleted file mode 100644 index 0ea3d5ac4..000000000 --- a/docs/reference/setLRTorch.html +++ /dev/null @@ -1,253 +0,0 @@ - - - - - - - - -Create setting for logistics regression model with python — setLRTorch • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Create setting for logistics regression model with python

-
- -
setLRTorch(
-  w_decay = c(5e-04, 0.005),
-  epochs = c(20, 50, 100),
-  seed = NULL,
-  class_weight = 0,
-  autoencoder = FALSE,
-  vae = FALSE
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - -
w_decay

The l2 regularisation

epochs

The number of epochs

seed

A seed for the model

class_weight

The class weight used for imbalanced data: - 0: Inverse ratio between positives and negatives --1: Focal loss

autoencoder

First learn stakced autoencoder for input features, then train LR on the encoded features.

vae

First learn stakced varational autoencoder for input features, then train LR on the encoded features.

- - -

Examples

-
if (FALSE) { -model.lrTorch <- setLRTorch() -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/setLassoLogisticRegression.html b/docs/reference/setLassoLogisticRegression.html index aaccccd2e..05f3b0a0b 100644 --- a/docs/reference/setLassoLogisticRegression.html +++ b/docs/reference/setLassoLogisticRegression.html @@ -1,67 +1,12 @@ - - - - - - - -Create setting for lasso logistic regression — setLassoLogisticRegression • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create setting for lasso logistic regression — setLassoLogisticRegression • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -183,93 +131,95 @@

Create setting for lasso logistic regression

Create setting for lasso logistic regression

-
setLassoLogisticRegression(
-  variance = 0.01,
-  seed = NULL,
-  includeCovariateIds = c(),
-  noShrinkage = c(0),
-  threads = -1,
-  forceIntercept = F,
-  upperLimit = 20,
-  lowerLimit = 0.01,
-  tolerance = 2e-06,
-  maxIterations = 3000
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
variance

Numeric: prior distribution starting variance

seed

An option to add a seed when training the model

includeCovariateIds

a set of covariate IDS to limit the analysis to

noShrinkage

a set of covariates whcih are to be forced to be included in the final model. default is the intercept

threads

An option to set number of threads when training model

forceIntercept

Logical: Force intercept coefficient into prior

upperLimit

Numeric: Upper prior variance limit for grid-search

lowerLimit

Numeric: Lower prior variance limit for grid-search

tolerance

Numeric: maximum relative change in convergence criterion from successive iterations to achieve convergence

maxIterations

Integer: maximum iterations of Cyclops to attempt before returning a failed-to-converge error

- - -

Examples

-
model.lr <- setLassoLogisticRegression() -
+
+
setLassoLogisticRegression(
+  variance = 0.01,
+  seed = NULL,
+  includeCovariateIds = c(),
+  noShrinkage = c(0),
+  threads = -1,
+  forceIntercept = F,
+  upperLimit = 20,
+  lowerLimit = 0.01,
+  tolerance = 2e-06,
+  maxIterations = 3000,
+  priorCoefs = NULL
+)
+
+ +
+

Arguments

+
variance
+

Numeric: prior distribution starting variance

+ + +
seed
+

An option to add a seed when training the model

+ + +
includeCovariateIds
+

a set of covariate IDS to limit the analysis to

+ + +
noShrinkage
+

a set of covariates whcih are to be forced to be included in the final model. default is the intercept

+ + +
threads
+

An option to set number of threads when training model

+ + +
forceIntercept
+

Logical: Force intercept coefficient into prior

+ + +
upperLimit
+

Numeric: Upper prior variance limit for grid-search

+ + +
lowerLimit
+

Numeric: Lower prior variance limit for grid-search

+ + +
tolerance
+

Numeric: maximum relative change in convergence criterion from successive iterations to achieve convergence

+ + +
maxIterations
+

Integer: maximum iterations of Cyclops to attempt before returning a failed-to-converge error

+ + +
priorCoefs
+

Use coefficients from a previous model as starting points for model fit (transfer learning)

+ +
+ +
+

Examples

+
model.lr <- setLassoLogisticRegression()
+
+
+
-
- +
- - + + diff --git a/docs/reference/setLightGBM.html b/docs/reference/setLightGBM.html new file mode 100644 index 000000000..d15720ee7 --- /dev/null +++ b/docs/reference/setLightGBM.html @@ -0,0 +1,234 @@ + +Create setting for gradient boosting machine model using lightGBM (https://github.com/microsoft/LightGBM/tree/master/R-package). — setLightGBM • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

Create setting for gradient boosting machine model using lightGBM (https://github.com/microsoft/LightGBM/tree/master/R-package).

+
+ +
+
setLightGBM(
+  nthread = 20,
+  earlyStopRound = 25,
+  numIterations = c(100),
+  numLeaves = c(31),
+  maxDepth = c(5, 10),
+  minDataInLeaf = c(20),
+  learningRate = c(0.05, 0.1, 0.3),
+  lambdaL1 = c(0),
+  lambdaL2 = c(0),
+  scalePosWeight = 1,
+  isUnbalance = FALSE,
+  seed = sample(1e+07, 1)
+)
+
+ +
+

Arguments

+
nthread
+

The number of computer threads to use (how many cores do you have?)

+ + +
earlyStopRound
+

If the performance does not increase over earlyStopRound number of trees then training stops (this prevents overfitting)

+ + +
numIterations
+

Number of boosting iterations.

+ + +
numLeaves
+

This hyperparameter sets the maximum number of leaves. Increasing this parameter can lead to higher model complexity and potential overfitting.

+ + +
maxDepth
+

This hyperparameter sets the maximum depth . Increasing this parameter can also lead to higher model complexity and potential overfitting.

+ + +
minDataInLeaf
+

This hyperparameter sets the minimum number of data points that must be present in a leaf node. Increasing this parameter can help to reduce overfitting

+ + +
learningRate
+

This hyperparameter controls the step size at each iteration of the gradient descent algorithm. Lower values can lead to slower convergence but may result in better performance.

+ + +
lambdaL1
+

This hyperparameter controls L1 regularization, which can help to reduce overfitting by encouraging sparse models.

+ + +
lambdaL2
+

This hyperparameter controls L2 regularization, which can also help to reduce overfitting by discouraging large weights in the model.

+ + +
scalePosWeight
+

Controls weight of positive class in loss - useful for imbalanced classes

+ + +
isUnbalance
+

This parameter cannot be used at the same time with scalePosWeight, choose only one of them. While enabling this should increase the overall performance metric of your model, it will also result in poor estimates of the individual class probabilities.

+ + +
seed
+

An option to add a seed when training the final model

+ +
+ +
+

Examples

+
model.lightgbm <- setLightGBM(
+    numLeaves = c(20, 31, 50), maxDepth = c(-1, 5, 10),
+    minDataInLeaf = c(10, 20, 30), learningRate = c(0.05, 0.1, 0.3)
+)
+
+
+
+
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/setMLP.html b/docs/reference/setMLP.html index 09cb9029f..f7f77fb07 100644 --- a/docs/reference/setMLP.html +++ b/docs/reference/setMLP.html @@ -1,67 +1,12 @@ - - - - - - - -Create setting for neural network model with python — setMLP • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create setting for neural network model with python — setMLP • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,155 +131,151 @@

Create setting for neural network model with python

Create setting for neural network model with python

-
setMLP(
-  hiddenLayerSizes = list(c(100), c(20, 4)),
-  activation = list("relu"),
-  solver = list("adam"),
-  alpha = list(0.3, 0.01, 1e-04, 1e-06),
-  batchSize = list("auto"),
-  learningRate = list("constant"),
-  learningRateInit = list(0.001),
-  powerT = list(0.5),
-  maxIter = list(200, 100),
-  shuffle = list(TRUE),
-  tol = list(1e-04),
-  warmStart = list(TRUE),
-  momentum = list(0.9),
-  nesterovsMomentum = list(TRUE),
-  earlyStopping = list(FALSE),
-  validationFraction = list(0.1),
-  beta1 = list(0.9),
-  beta2 = list(0.999),
-  epsilon = list(1, 0.1, 1e-08),
-  nIterNoChange = list(10),
-  seed = sample(1e+05, 1)
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
hiddenLayerSizes

(list of vectors) The ith element represents the number of neurons in the ith hidden layer.

activation

(list) Activation function for the hidden layer.

    -
  • "identity": no-op activation, useful to implement linear bottleneck, returns f(x) = x

  • +
    +
    setMLP(
    +  hiddenLayerSizes = list(c(100), c(20)),
    +  activation = list("relu"),
    +  solver = list("adam"),
    +  alpha = list(0.3, 0.01, 1e-04, 1e-06),
    +  batchSize = list("auto"),
    +  learningRate = list("constant"),
    +  learningRateInit = list(0.001),
    +  powerT = list(0.5),
    +  maxIter = list(200, 100),
    +  shuffle = list(TRUE),
    +  tol = list(1e-04),
    +  warmStart = list(TRUE),
    +  momentum = list(0.9),
    +  nesterovsMomentum = list(TRUE),
    +  earlyStopping = list(FALSE),
    +  validationFraction = list(0.1),
    +  beta1 = list(0.9),
    +  beta2 = list(0.999),
    +  epsilon = list(1e-08),
    +  nIterNoChange = list(10),
    +  seed = sample(1e+05, 1)
    +)
    +
    + +
    +

    Arguments

    +
    hiddenLayerSizes
    +

    (list of vectors) The ith element represents the number of neurons in the ith hidden layer.

    + + +
    activation
    +

    (list) Activation function for the hidden layer.

    • "identity": no-op activation, useful to implement linear bottleneck, returns f(x) = x

    • "logistic": the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)).

    • "tanh": the hyperbolic tan function, returns f(x) = tanh(x).

    • "relu": the rectified linear unit function, returns f(x) = max(0, x)

    • -
solver

(list) The solver for weight optimization. (‘lbfgs’, ‘sgd’, ‘adam’)

alpha

(list) L2 penalty (regularization term) parameter.

batchSize

(list) Size of minibatches for stochastic optimizers. If the solver is ‘lbfgs’, the classifier will not use minibatch. When set to “auto”, batchSize=min(200, n_samples).

learningRate

(list) Only used when solver='sgd' Learning rate schedule for weight updates.‘constant’, ‘invscaling’, ‘adaptive’, default=’constant’

learningRateInit

(list) Only used when solver=’sgd’ or ‘adam’. The initial learning rate used. It controls the step-size in updating the weights.

powerT

(list) Only used when solver=’sgd’. The exponent for inverse scaling learning rate. It is used in updating effective learning rate when the learning_rate is set to ‘invscaling’.

maxIter

(list) Maximum number of iterations. The solver iterates until convergence (determined by ‘tol’) or this number of iterations. For stochastic solvers (‘sgd’, ‘adam’), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps.

shuffle

(list) boolean: Whether to shuffle samples in each iteration. Only used when solver=’sgd’ or ‘adam’.

tol

(list) Tolerance for the optimization. When the loss or score is not improving by at least tol for nIterNoChange consecutive iterations, unless learning_rate is set to ‘adaptive’, convergence is considered to be reached and training stops.

warmStart

(list) When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution.

momentum

(list) Momentum for gradient descent update. Should be between 0 and 1. Only used when solver=’sgd’.

nesterovsMomentum

(list) Whether to use Nesterov’s momentum. Only used when solver=’sgd’ and momentum > 0.

earlyStopping

(list) boolean Whether to use early stopping to terminate training when validation score is not improving. If set to true, it will automatically set aside 10 percent of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs.

validationFraction

(list) The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if earlyStopping is True.

beta1

(list) Exponential decay rate for estimates of first moment vector in adam, should be in 0 to 1.

beta2

(list) Exponential decay rate for estimates of second moment vector in adam, should be in 0 to 1.

epsilon

(list) Value for numerical stability in adam.

nIterNoChange

(list) Maximum number of epochs to not meet tol improvement. Only effective when solver=’sgd’ or ‘adam’.

seed

A seed for the model

- - -

Examples

-
if (FALSE) { -model.mlp <- setMLP() -} -
+ + + +
solver
+

(list) The solver for weight optimization. (‘lbfgs’, ‘sgd’, ‘adam’)

+ + +
alpha
+

(list) L2 penalty (regularization term) parameter.

+ + +
batchSize
+

(list) Size of minibatches for stochastic optimizers. If the solver is ‘lbfgs’, the classifier will not use minibatch. When set to “auto”, batchSize=min(200, n_samples).

+ + +
learningRate
+

(list) Only used when solver='sgd' Learning rate schedule for weight updates. ‘constant’, ‘invscaling’, ‘adaptive’, default=’constant’

+ + +
learningRateInit
+

(list) Only used when solver=’sgd’ or ‘adam’. The initial learning rate used. It controls the step-size in updating the weights.

+ + +
powerT
+

(list) Only used when solver=’sgd’. The exponent for inverse scaling learning rate. It is used in updating effective learning rate when the learning_rate is set to ‘invscaling’.

+ + +
maxIter
+

(list) Maximum number of iterations. The solver iterates until convergence (determined by ‘tol’) or this number of iterations. For stochastic solvers (‘sgd’, ‘adam’), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps.

+ + +
shuffle
+

(list) boolean: Whether to shuffle samples in each iteration. Only used when solver=’sgd’ or ‘adam’.

+ + +
tol
+

(list) Tolerance for the optimization. When the loss or score is not improving by at least tol for nIterNoChange consecutive iterations, unless learning_rate is set to ‘adaptive’, convergence is considered to be reached and training stops.

+ + +
warmStart
+

(list) When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution.

+ + +
momentum
+

(list) Momentum for gradient descent update. Should be between 0 and 1. Only used when solver=’sgd’.

+ + +
nesterovsMomentum
+

(list) Whether to use Nesterov’s momentum. Only used when solver=’sgd’ and momentum > 0.

+ + +
earlyStopping
+

(list) boolean Whether to use early stopping to terminate training when validation score is not improving. If set to true, it will automatically set aside 10 percent of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs.

+ + +
validationFraction
+

(list) The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if earlyStopping is True.

+ + +
beta1
+

(list) Exponential decay rate for estimates of first moment vector in adam, should be in 0 to 1.

+ + +
beta2
+

(list) Exponential decay rate for estimates of second moment vector in adam, should be in 0 to 1.

+ + +
epsilon
+

(list) Value for numerical stability in adam.

+ + +
nIterNoChange
+

(list) Maximum number of epochs to not meet tol improvement. Only effective when solver=’sgd’ or ‘adam’.

+ + +
seed
+

A seed for the model

+ +
+ +
+

Examples

+
if (FALSE) {
+model.mlp <- setMLP()
+}
+
+
+
-
- +
- - + + diff --git a/docs/reference/setMLPTorch.html b/docs/reference/setMLPTorch.html deleted file mode 100644 index e12be4bc0..000000000 --- a/docs/reference/setMLPTorch.html +++ /dev/null @@ -1,263 +0,0 @@ - - - - - - - - -Create setting for neural network model with python — setMLPTorch • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Create setting for neural network model with python

-
- -
setMLPTorch(
-  size = c(500, 1000),
-  w_decay = c(5e-04, 0.005),
-  epochs = c(20, 50),
-  seed = 0,
-  class_weight = 0,
-  mlp_type = "MLP",
-  autoencoder = FALSE,
-  vae = FALSE
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
size

The number of hidden nodes

w_decay

The l2 regularisation

epochs

The number of epochs

seed

A seed for the model

class_weight

The class weight used for imbalanced data: - 0: Inverse ratio between positives and negatives --1: Focal loss

mlp_type

The type of multiple layer network, inlcuding MLP and SNN (self-normalizing neural network)

autoencoder

First learn stakced autoencoder for input features, then train MLP on the encoded features.

vae

First learn stakced varational autoencoder for input features, then train MLP on the encoded features.

- - -

Examples

-
if (FALSE) { -model.mlpTorch <- setMLPTorch() -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/setNaiveBayes.html b/docs/reference/setNaiveBayes.html index 51cb57879..a0389c1d8 100644 --- a/docs/reference/setNaiveBayes.html +++ b/docs/reference/setNaiveBayes.html @@ -1,67 +1,12 @@ - - - - - - - -Create setting for naive bayes model with python — setNaiveBayes • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create setting for naive bayes model with python — setNaiveBayes • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,40 +131,39 @@

Create setting for naive bayes model with python

Create setting for naive bayes model with python

-
setNaiveBayes()
- +
+
setNaiveBayes()
+
-

Examples

-
if (FALSE) { -model.nb <- setNaiveBayes() -} -
+
+

Examples

+
if (FALSE) {
+model.nb <- setNaiveBayes()
+}
+
+
+
-
- +
- - + + diff --git a/docs/reference/setPythonEnvironment.html b/docs/reference/setPythonEnvironment.html index 3d8362dca..ce517c7b7 100644 --- a/docs/reference/setPythonEnvironment.html +++ b/docs/reference/setPythonEnvironment.html @@ -1,67 +1,12 @@ - - - - - - - -Use the virtual environment created using configurePython() — setPythonEnvironment • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Use the virtual environment created using configurePython() — setPythonEnvironment • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,50 +131,46 @@

Use the virtual environment created using configurePython()

Use the virtual environment created using configurePython()

-
setPythonEnvironment(envname = "PLP", envtype = NULL)
+
+
setPythonEnvironment(envname = "PLP", envtype = NULL)
+
+ +
+

Arguments

+
envname
+

A string for the name of the virtual environment (default is 'PLP')

-

Arguments

- - - - - - - - - - -
envname

A string for the name of the virtual environment (default is 'PLP')

envtype

An option for specifying the environment as'conda' or 'python'. If NULL then the default is 'conda' for windows users and 'python' for non-windows users

-

Details

+
envtype
+

An option for specifying the environment as'conda' or 'python'. If NULL then the default is 'conda' for windows users and 'python' for non-windows users

+
+
+

Details

This function sets PatientLevelPrediction to use a virtual environment

+
+
-
- +
- - + + diff --git a/docs/reference/setRNNTorch.html b/docs/reference/setRNNTorch.html deleted file mode 100644 index 96663666f..000000000 --- a/docs/reference/setRNNTorch.html +++ /dev/null @@ -1,248 +0,0 @@ - - - - - - - - -Create setting for RNN model with python — setRNNTorch • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Create setting for RNN model with python

-
- -
setRNNTorch(
-  hidden_size = c(50, 100),
-  epochs = c(20, 50),
-  seed = 0,
-  class_weight = 0,
-  type = "RNN"
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
hidden_size

The hidden size

epochs

The number of epochs

seed

A seed for the model

class_weight

The class weight used for imbalanced data: - 0: Inverse ratio between positives and negatives --1: Focal loss

type

It can be normal 'RNN', 'BiRNN' (bidirectional RNN) and 'GRU'

- - -

Examples

-
if (FALSE) { -model.rnnTorch <- setRNNTorch() -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/setRNNTorch2.html b/docs/reference/setRNNTorch2.html deleted file mode 100644 index 8193cd04e..000000000 --- a/docs/reference/setRNNTorch2.html +++ /dev/null @@ -1,226 +0,0 @@ - - - - - - - - -Create setting for RNN model with python — setRNNTorch2 • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

Create setting for RNN model with python

- -
- -
setRNNTorch2(hidden_size = c(50, 100), epochs = c(20, 50), seed = 0,
-  class_weight = 0, type = "RNN")
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
hidden_size

The hidden size

epochs

The number of epochs

seed

A seed for the model

class_weight

The class weight used for imbalanced data: - 0: Inverse ratio between positives and negatives --1: Focal loss

type

It can be normal 'RNN', 'BiRNN' (bidirectional RNN) and 'GRU'

- - -

Examples

-
# NOT RUN {
-model.rnnTorch <- setRNNTorch()
-# }
-
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/setRandomForest.html b/docs/reference/setRandomForest.html index d45605d22..1b44fefc4 100644 --- a/docs/reference/setRandomForest.html +++ b/docs/reference/setRandomForest.html @@ -1,67 +1,12 @@ - - - - - - - -Create setting for random forest model with python (very fast) — setRandomForest • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create setting for random forest model with python (very fast) — setRandomForest • PatientLevelPrediction + + - - - - -
-
- -
- -
+
@@ -183,128 +131,123 @@

Create setting for random forest model with python (very fast)

Create setting for random forest model with python (very fast)

-
setRandomForest(
-  ntrees = list(100, 500),
-  criterion = list("gini"),
-  maxDepth = list(4, 10, 17),
-  minSamplesSplit = list(2, 5),
-  minSamplesLeaf = list(1, 10),
-  minWeightFractionLeaf = list(0),
-  mtries = list("auto", "log2"),
-  maxLeafNodes = list(NULL),
-  minImpurityDecrease = list(0),
-  bootstrap = list(TRUE),
-  maxSamples = list(NULL, 0.9),
-  oobScore = list(FALSE),
-  nJobs = list(NULL),
-  classWeight = list("balanced_subsample", NULL),
-  seed = sample(1e+05, 1)
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ntrees

(list) The number of trees to build

criterion

(list) The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. Note: this parameter is tree-specific.

maxDepth

(list) The maximum depth of the tree. If NULL, then nodes are expanded until all leaves are pure or until all leaves contain less than minSamplesSplit samples.

minSamplesSplit

(list) The minimum number of samples required to split an internal node

minSamplesLeaf

(list) The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least minSamplesLeaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.

minWeightFractionLeaf

(list) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sampleWeight is not provided.

mtries

(list) The number of features to consider when looking for the best split:

    -
  • intthen consider max_features features at each split.

  • -
  • floatthen max_features is a fraction and round(max_features * n_features) features are considered at each split

  • -
  • 'auto'then max_features=sqrt(n_features)

  • -
  • 'sqrt'then max_features=sqrt(n_features) (same as “auto”)

  • -
  • 'log2'then max_features=log2(n_features).

  • -
  • NULLthen max_features=n_features

  • -
maxLeafNodes

(list) Grow trees with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.

minImpurityDecrease

(list) A node will be split if this split induces a decrease of the impurity greater than or equal to this value.

bootstrap

(list) Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.

maxSamples

(list) If bootstrap is True, the number of samples to draw from X to train each base estimator.

oobScore

(list) Whether to use out-of-bag samples to estimate the generalization score. Only available if bootstrap=True.

nJobs

The number of jobs to run in parallel.

classWeight

(list) Weights associated with classes. If not given, all classes are supposed to have weight one. NULL, “balanced”, “balanced_subsample”

seed

A seed when training the final model

- - -

Examples

-
if (FALSE) { -model.rf <- setRandomForest(mtries=list('auto',5,20), ntrees=c(10,100), - maxDepth=c(5,20)) -} -
+
+
setRandomForest(
+  ntrees = list(100, 500),
+  criterion = list("gini"),
+  maxDepth = list(4, 10, 17),
+  minSamplesSplit = list(2, 5),
+  minSamplesLeaf = list(1, 10),
+  minWeightFractionLeaf = list(0),
+  mtries = list("sqrt", "log2"),
+  maxLeafNodes = list(NULL),
+  minImpurityDecrease = list(0),
+  bootstrap = list(TRUE),
+  maxSamples = list(NULL, 0.9),
+  oobScore = list(FALSE),
+  nJobs = list(NULL),
+  classWeight = list(NULL),
+  seed = sample(1e+05, 1)
+)
+
+ +
+

Arguments

+
ntrees
+

(list) The number of trees to build

+ + +
criterion
+

(list) The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. Note: this parameter is tree-specific.

+ + +
maxDepth
+

(list) The maximum depth of the tree. If NULL, then nodes are expanded until all leaves are pure or until all leaves contain less than minSamplesSplit samples.

+ + +
minSamplesSplit
+

(list) The minimum number of samples required to split an internal node

+ + +
minSamplesLeaf
+

(list) The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least minSamplesLeaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.

+ + +
minWeightFractionLeaf
+

(list) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sampleWeight is not provided.

+ + +
mtries
+

(list) The number of features to consider when looking for the best split:

  • int then consider max_features features at each split.

  • +
  • float then max_features is a fraction and round(max_features * n_features) features are considered at each split

  • +
  • 'sqrt' then max_features=sqrt(n_features)

  • +
  • 'log2' then max_features=log2(n_features)

  • +
  • NULL then max_features=n_features

  • +
+ + +
maxLeafNodes
+

(list) Grow trees with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.

+ + +
minImpurityDecrease
+

(list) A node will be split if this split induces a decrease of the impurity greater than or equal to this value.

+ + +
bootstrap
+

(list) Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.

+ + +
maxSamples
+

(list) If bootstrap is True, the number of samples to draw from X to train each base estimator.

+ + +
oobScore
+

(list) Whether to use out-of-bag samples to estimate the generalization score. Only available if bootstrap=True.

+ + +
nJobs
+

The number of jobs to run in parallel.

+ + +
classWeight
+

(list) Weights associated with classes. If not given, all classes are supposed to have weight one. NULL, “balanced”, “balanced_subsample”

+ + +
seed
+

A seed when training the final model

+ +
+ +
+

Examples

+
if (FALSE) {
+model.rf <- setRandomForest(mtries=list('auto',5,20),  ntrees=c(10,100),
+                           maxDepth=c(5,20))
+}
+
+
+
-
- +
- - + + diff --git a/docs/reference/setRandomForestQuantileRegressor.html b/docs/reference/setRandomForestQuantileRegressor.html deleted file mode 100644 index c01acfc7c..000000000 --- a/docs/reference/setRandomForestQuantileRegressor.html +++ /dev/null @@ -1,300 +0,0 @@ - - - - - - - - -Create setting for RandomForestQuantileRegressor with python scikit-garden (skgarden.quantile.RandomForestQuantileRegressor) -#' @description -This creates a setting for fitting a RandomForestQuantileRegressor model. You need skgarden python install. To install this open your command line and type: conda install -c conda-forge scikit-garden — setRandomForestQuantileRegressor • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Create setting for RandomForestQuantileRegressor with python scikit-garden (skgarden.quantile.RandomForestQuantileRegressor) -#' @description -This creates a setting for fitting a RandomForestQuantileRegressor model. You need skgarden python install. To install this open your command line and type: conda install -c conda-forge scikit-garden

-
- -
setRandomForestQuantileRegressor(
-  nEstimators = c(100),
-  criterion = "mse",
-  maxFeatures = -1,
-  maxDepth = 4,
-  minSamplesSplit = 2,
-  minSamplesLeaf = 1,
-  minWeightFractionLeaf = 0,
-  maxLeafNodes = NULL,
-  bootstrap = TRUE,
-  oobScore = FALSE,
-  warmStart = FALSE,
-  seed = NULL,
-  quiet = F
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nEstimators

(int default:100) The number of trees in the forest.

criterion

(string default="mse")) The function to measure the quality of a split. Supported criteria are "mse" for the mean squared error, which is equal to variance reduction as feature selection criterion, and "mae" for the mean absolute error.

maxFeatures

(int default: -1) The number of features to consider when looking for the best split. If -1 then use sqrt of total number of features.

maxDepth

(int default:4) The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than minSamplesSplit samples.

minSamplesSplit

An integer specifying min samples per tree split (complexity)

minSamplesLeaf

An integer specifying min samples per leaf (complexity)

minWeightFractionLeaf

Lookup

maxLeafNodes

(int) Grow trees with maxLeafNodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.

bootstrap

(boolean default:TRUE) Whether bootstrap samples are used when building trees.

oobScore

(boolean default:FALSE) Whether to use out-of-bag samples to estimate the R^2 on unseen data.

warmStart

(boolean default:FALSE) When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest.

seed

will add

quiet

will add

- -

Details

- -

Pick the hyper-parameters you want to do a grid search for

- -

Examples

-
if (FALSE) { -rfQR <- setRandomForestQuantileRegressor(nEstimators =c(10,50,100), - maxDepth=c(4,10,17), seed = 2) -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/setSVM.html b/docs/reference/setSVM.html index 7c2c3a575..c8ca9a9e7 100644 --- a/docs/reference/setSVM.html +++ b/docs/reference/setSVM.html @@ -1,67 +1,12 @@ - - - - - - - -Create setting for the python sklearn SVM (SVC function) — setSVM • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create setting for the python sklearn SVM (SVC function) — setSVM • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,95 +131,92 @@

Create setting for the python sklearn SVM (SVC function)

Create setting for the python sklearn SVM (SVC function)

-
setSVM(
-  C = list(1, 0.9, 2, 0.1),
-  kernel = list("rbf"),
-  degree = list(1, 3, 5),
-  gamma = list("scale", 1e-04, 3e-05, 0.001, 0.01, 0.25),
-  coef0 = list(0),
-  shrinking = list(TRUE),
-  tol = list(0.001),
-  classWeight = list("balanced", NULL),
-  cacheSize = 500,
-  seed = sample(1e+05, 1)
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
C

(list) Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty.

kernel

(list) Specifies the kernel type to be used in the algorithm. one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’. If none is given ‘rbf’ will be used.

degree

(list) degree of kernel function is significant only in poly, rbf, sigmoid

gamma

(list) kernel coefficient for rbf and poly, by default 1/n_features will be taken. ‘scale’, ‘auto’ or float, default=’scale’

coef0

(list) independent term in kernel function. It is only significant in poly/sigmoid.

shrinking

(list) whether to use the shrinking heuristic.

tol

(list) Tolerance for stopping criterion.

classWeight

(list) Class weight based on imbalance either 'balanced' or NULL

cacheSize

Specify the size of the kernel cache (in MB).

seed

A seed for the model

- - -

Examples

-
if (FALSE) { -model.svm <- setSVM(kernel='rbf', seed = NULL) -} -
+
+
setSVM(
+  C = list(1, 0.9, 2, 0.1),
+  kernel = list("rbf"),
+  degree = list(1, 3, 5),
+  gamma = list("scale", 1e-04, 3e-05, 0.001, 0.01, 0.25),
+  coef0 = list(0),
+  shrinking = list(TRUE),
+  tol = list(0.001),
+  classWeight = list(NULL),
+  cacheSize = 500,
+  seed = sample(1e+05, 1)
+)
+
+ +
+

Arguments

+
C
+

(list) Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty.

+ + +
kernel
+

(list) Specifies the kernel type to be used in the algorithm. one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’. If none is given ‘rbf’ will be used.

+ + +
degree
+

(list) degree of kernel function is significant only in poly, rbf, sigmoid

+ + +
gamma
+

(list) kernel coefficient for rbf and poly, by default 1/n_features will be taken. ‘scale’, ‘auto’ or float, default=’scale’

+ + +
coef0
+

(list) independent term in kernel function. It is only significant in poly/sigmoid.

+ + +
shrinking
+

(list) whether to use the shrinking heuristic.

+ + +
tol
+

(list) Tolerance for stopping criterion.

+ + +
classWeight
+

(list) Class weight based on imbalance either 'balanced' or NULL

+ + +
cacheSize
+

Specify the size of the kernel cache (in MB).

+ + +
seed
+

A seed for the model

+ +
+ +
+

Examples

+
if (FALSE) {
+model.svm <- setSVM(kernel='rbf', seed = NULL)
+}
+
+
+
-
- +
- - + + diff --git a/docs/reference/setSagemakerBinary.html b/docs/reference/setSagemakerBinary.html deleted file mode 100644 index a0d5969de..000000000 --- a/docs/reference/setSagemakerBinary.html +++ /dev/null @@ -1,251 +0,0 @@ - - - - - - - - -Create setting for sagemaker model — setSagemakerBinary • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Create setting for sagemaker model

-
- -
setSagemakerBinary(
-  classifier = "xgboost",
-  bucket,
-  prefix = "data",
-  roleArn,
-  otherparams = NULL,
-  seed = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - -
classifier

The name of the sagemaker binary classifier to use (pick from: knn, xgboost or linear-learner)

bucket

The s3 bucker string to save data for model training

prefix

The s3 subdirectory for the data

roleArn

The amazon roleArn

otherparams

Other parameters for training (currently not working)

seed

The seed for the training

- - -

Examples

-
if (FALSE) { -model.sm <- setSagemakerBinary(classifier='gxboost', bucket='ohdsi3') -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/similarPlpData.html b/docs/reference/similarPlpData.html deleted file mode 100644 index c7640b066..000000000 --- a/docs/reference/similarPlpData.html +++ /dev/null @@ -1,309 +0,0 @@ - - - - - - - - -Extract new plpData using plpModel settings -use metadata in plpModel to extract similar data and population for new databases: — similarPlpData • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Extract new plpData using plpModel settings -use metadata in plpModel to extract similar data and population for new databases:

-
- -
similarPlpData(
-  plpModel = NULL,
-  newConnectionDetails,
-  newCdmDatabaseSchema = NULL,
-  newCohortDatabaseSchema = NULL,
-  newCohortTable = NULL,
-  newCohortId = NULL,
-  newOutcomeDatabaseSchema = NULL,
-  newOutcomeTable = NULL,
-  newOutcomeId = NULL,
-  newOracleTempSchema = newCdmDatabaseSchema,
-  sample = NULL,
-  createPopulation = T,
-  createCohorts = T
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
plpModel

The trained PatientLevelPrediction model or object returned by runPlp()

newConnectionDetails

The connectionDetails for the new database

newCdmDatabaseSchema

The database schema for the new CDM database

newCohortDatabaseSchema

The database schema where the cohort table is stored

newCohortTable

The table name of the cohort table

newCohortId

The cohort_definition_id for the cohort of at risk people

newOutcomeDatabaseSchema

The database schema where the outcome table is stored

newOutcomeTable

The table name of the outcome table

newOutcomeId

The cohort_definition_id for the outcome

newOracleTempSchema

The temp coracle schema

sample

The number of people to sample (default is NULL meaning use all data)

createPopulation

Whether to create the study population as well

createCohorts

No longer used

- - -

Examples

-
if (FALSE) { -# set the connection -connectionDetails <- DatabaseConnector::createConnectionDetails() - -# load the model and data -plpModel <- loadPlpModel("C:/plpmodel") - -# extract the new data in the 'newData.dbo' schema using the model settings -newDataList <- similarPlpData(plpModel=plpModel, - newConnectionDetails = connectionDetails, - newCdmDatabaseSchema = 'newData.dbo', - newCohortDatabaseSchema = 'newData.dbo', - newCohortTable = 'cohort', - newCohortId = 1, - newOutcomeDatabaseSchema = 'newData.dbo', - newOutcomeTable = 'outcome', - newOutcomeId = 2) - -# get the prediction: -prediction <- applyModel(newDataList$population, newDataList$plpData, plpModel)$prediction -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/simulatePlpData.html b/docs/reference/simulatePlpData.html index 505986878..28d042f5a 100644 --- a/docs/reference/simulatePlpData.html +++ b/docs/reference/simulatePlpData.html @@ -1,67 +1,12 @@ - - - - - - - -Generate simulated data — simulatePlpData • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Generate simulated data — simulatePlpData • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,56 +131,55 @@

Generate simulated data

simulateplpData creates a plpData object with simulated data.

-
simulatePlpData(plpDataSimulationProfile, n = 10000)
+
+
simulatePlpData(plpDataSimulationProfile, n = 10000)
+
-

Arguments

- - - - - - - - - - -
plpDataSimulationProfile

An object of type plpDataSimulationProfile as generated -using the
createplpDataSimulationProfile function.

n

The size of the population to be generated.

+
+

Arguments

+
plpDataSimulationProfile
+

An object of type plpDataSimulationProfile as generated +using the
createplpDataSimulationProfile function.

-

Value

-

An object of type plpData.

-

Details

+
n
+

The size of the population to be generated.

+
+
+

Value

+ + +

An object of type plpData.

+
+
+

Details

This function generates simulated data that is in many ways similar to the original data on which the simulation profile is based. The contains same outcome, comparator, and outcome concept IDs, and the covariates and their 1st order statistics should be comparable.

+
+
-
- +
- - + + diff --git a/docs/reference/sklearnFromJson.html b/docs/reference/sklearnFromJson.html new file mode 100644 index 000000000..207925bfd --- /dev/null +++ b/docs/reference/sklearnFromJson.html @@ -0,0 +1,168 @@ + +Loads sklearn python model from json — sklearnFromJson • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

Loads sklearn python model from json

+
+ +
+
sklearnFromJson(path)
+
+ +
+

Arguments

+
path
+

path to the model json file

+ +
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/sklearnToJson.html b/docs/reference/sklearnToJson.html new file mode 100644 index 000000000..4f18ac540 --- /dev/null +++ b/docs/reference/sklearnToJson.html @@ -0,0 +1,172 @@ + +Saves sklearn python model object to json in path — sklearnToJson • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

Saves sklearn python model object to json in path

+
+ +
+
sklearnToJson(model, path)
+
+ +
+

Arguments

+
model
+

a fitted sklearn python model object

+ + +
path
+

path to the saved model file

+ +
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/specificity.html b/docs/reference/specificity.html index 6f582daef..e16bf46a6 100644 --- a/docs/reference/specificity.html +++ b/docs/reference/specificity.html @@ -1,67 +1,12 @@ - - - - - - - -Calculate the specificity — specificity • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Calculate the specificity — specificity • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,61 +131,60 @@

Calculate the specificity

Calculate the specificity

-
specificity(TP, TN, FN, FP)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
TP

Number of true positives

TN

Number of true negatives

FN

Number of false negatives

FP

Number of false positives

- -

Value

- -

specificity value

-

Details

+
+
specificity(TP, TN, FN, FP)
+
+
+

Arguments

+
TP
+

Number of true positives

+ + +
TN
+

Number of true negatives

+ + +
FN
+

Number of false negatives

+ + +
FP
+

Number of false positives

+ +
+
+

Value

+ + +

specificity value

+
+
+

Details

Calculate the specificity

+
+
-
- +
- - + + diff --git a/docs/reference/splitData.html b/docs/reference/splitData.html index f4a91d37d..1b6baefdf 100644 --- a/docs/reference/splitData.html +++ b/docs/reference/splitData.html @@ -1,67 +1,12 @@ - - - - - - - -Split the plpData into test/train sets using a splitting settings of class splitSettings — splitData • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Split the plpData into test/train sets using a splitting settings of class splitSettings — splitData • PatientLevelPrediction + + - - - - -
-
- -
- -
+
@@ -183,70 +131,67 @@

Split the plpData into test/train sets using a splitting settings of class <

Split the plpData into test/train sets using a splitting settings of class splitSettings

-
splitData(
-  plpData = plpData,
-  population = population,
-  splitSettings = splitSettings
-)
- -

Arguments

- - - - - - - - - - - - - - -
plpData

An object of type plpData - the patient level prediction -data extracted from the CDM.

population

The population created using createStudyPopulation that define who will be used to develop the model

splitSettings

An object of type splitSettings specifying the split - the default can be created using createDefaultSplitSetting

- -

Value

- -

An object of class splitSettings

-

Details

- -

Returns a list containing the training data (Train) and optionally the test data (Test). Train is an Andromeda object containing

    -
  • covariateRef a table with the covariate information

  • -
  • labels) a table (rowId, outcomeCount, ...) for each data point in the train data (outcomeCount is the class label)

  • -
  • folds a table (rowId, index) specifying which training fold each data point is in.

  • -

Test is an Andromeda object containing

    -
  • covariateRef a table with the covariate information

  • -
  • labels) a table (rowId, outcomeCount, ...) for each data point in the test data (outcomeCount is the class label)

  • -
+
+
splitData(
+  plpData = plpData,
+  population = population,
+  splitSettings = splitSettings
+)
+
+ +
+

Arguments

+
plpData
+

An object of type plpData - the patient level prediction +data extracted from the CDM.

+ + +
population
+

The population created using createStudyPopulation that define who will be used to develop the model

+ + +
splitSettings
+

An object of type splitSettings specifying the split - the default can be created using createDefaultSplitSetting

+
+
+

Value

+ + +

An object of class splitSettings

+ + +
+
+

Details

+

Returns a list containing the training data (Train) and optionally the test data (Test). Train is an Andromeda object containing

  • covariateRef: a table with the covariate information

  • +
  • labels: a table (rowId, outcomeCount, ...) for each data point in the train data (outcomeCount is the class label)

  • +
  • folds: a table (rowId, index) specifying which training fold each data point is in.

  • +

Test is an Andromeda object containing

  • covariateRef: a table with the covariate information

  • +
  • labels: a table (rowId, outcomeCount, ...) for each data point in the test data (outcomeCount is the class label)

  • +
+
-
- +
- - + + diff --git a/docs/reference/subjectSplitter.html b/docs/reference/subjectSplitter.html deleted file mode 100644 index 94a1942d5..000000000 --- a/docs/reference/subjectSplitter.html +++ /dev/null @@ -1,249 +0,0 @@ - - - - - - - - -Split data when patients are in the data multiple times such that the same patient is always either in the -train set or the test set (the same patient cannot be in both the test and train set at different times) — subjectSplitter • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Split data when patients are in the data multiple times such that the same patient is always either in the -train set or the test set (the same patient cannot be in both the test and train set at different times)

-
- -
subjectSplitter(population, test = 0.3, train = NULL, nfold = 3, seed = NULL)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
population

An object created using createStudyPopulation().

test

A real number between 0 and 1 indicating the test set fraction of the data

train

A real number between 0 and 1 indicating the train set fraction of the data. -If not set train is equal to 1 - test

nfold

An integer >= 1 specifying the number of folds used in cross validation

seed

If set a fixed seed is used, otherwise a random split is performed

- -

Value

- -

A dataframe containing the columns: rowId and index

-

Details

- -

Returns a dataframe of rowIds and indexes with a -1 index indicating the rowId belongs to the test -set and a positive integer index value indicating the rowId's cross valiation fold within the train -set.

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/summaryPlpAnalyses.html b/docs/reference/summaryPlpAnalyses.html deleted file mode 100644 index 3f9fd8e36..000000000 --- a/docs/reference/summaryPlpAnalyses.html +++ /dev/null @@ -1,207 +0,0 @@ - - - - - - - - -summarises the multiple PLP results into a dataframe — summaryPlpAnalyses • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

summarises the multiple PLP results into a dataframe

- -
- -
summaryPlpAnalyses(analysesLocation)
- -

Arguments

- - - - - - -
analysesLocation

The directory containing the results (with the analysis_x folders)

- -

Details

- -

Loads all the study results contained in the analysesLocation and aggregates a summary of the results

- - -
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/timeSplitter.html b/docs/reference/timeSplitter.html deleted file mode 100644 index 216968c66..000000000 --- a/docs/reference/timeSplitter.html +++ /dev/null @@ -1,248 +0,0 @@ - - - - - - - - -Split test/train data by time and then partitions training set into random folds stratified by -class — timeSplitter • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Split test/train data by time and then partitions training set into random folds stratified by -class

-
- -
timeSplitter(population, test = 0.3, train = NULL, nfold = 3, seed = NULL)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
population

An object created using createStudyPopulation().

test

A real number between 0 and 1 indicating the test set fraction of the data

train

A real number between 0 and 1 indicating the training set fraction of the data

nfold

An integer >= 1 specifying the number of folds used in cross validation

seed

If set a fixed seed is used, otherwise a random split is performed

- -

Value

- -

A dataframe containing the columns: rowId and index

-

Details

- -

Returns a dataframe of rowIds and indexes with a -1 index indicating the rowId belongs to the test -set and a positive integer index value indicating the rowId's cross valiation fold within the train -set.

- -
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/toPlpData.html b/docs/reference/toPlpData.html deleted file mode 100644 index 6d54bce3a..000000000 --- a/docs/reference/toPlpData.html +++ /dev/null @@ -1,240 +0,0 @@ - - - - - - - - -Convert matrix into plpData — toPlpData • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

Converts a matrix (rows = people, columns=variables) into the standard plpData

- -
- -
toPlpData(data, columnInfo, outcomeId, outcomeThreshold = 0.5,
-  indexTime = 0, includeIndexDay = T)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - -
data

An data.frame or matrix.

columnInfo

A dataframe with three columns, column 1 contains columnId, column 2 contains columnName for each column id and column 3 contains the columnTime - the time prior to index the variable was recorded

outcomeId

The column id containing the outcome

outcomeThreshold

The outcome value must be higher or equal to this for the person to have the outcome

indexTime

The time defining the index date

includeIndexDay

Boolean - whether to include variables recorded on index date

- -

Value

- -

Returns an object of class plpData

- -

Details

- -

This function converts matrix into plpData

- - -

Examples

-
#TODO - -
-
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/toSparseM.html b/docs/reference/toSparseM.html index aa2b22c83..7c240da30 100644 --- a/docs/reference/toSparseM.html +++ b/docs/reference/toSparseM.html @@ -1,67 +1,12 @@ - - - - - - - -Convert the plpData in COO format into a sparse R matrix — toSparseM • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Convert the plpData in COO format into a sparse R matrix — toSparseM • PatientLevelPrediction - + + - - - -
-
- -
- -
+
@@ -183,71 +131,76 @@

Convert the plpData in COO format into a sparse R matrix

Converts the standard plpData to a sparse matrix

-
toSparseM(plpData, cohort = NULL, map = NULL)
- -

Arguments

- - - - - - - - - - - - - - -
plpData

An object of type plpData with covariate in coo format - the patient level prediction -data extracted from the CDM.

cohort

If specified the plpData is restricted to the rowIds in the cohort (otherwise plpData$labels is used)

map

A covariate map (telling us the column number for covariates)

- -

Value

- -

Returns a list, containing the data as a sparse matrix, the plpData covariateRef +

+
toSparseM(plpData, cohort = NULL, map = NULL)
+
+ +
+

Arguments

+
plpData
+

An object of type plpData with covariate in coo format - the patient level prediction +data extracted from the CDM.

+ + +
cohort
+

If specified the plpData is restricted to the rowIds in the cohort (otherwise plpData$labels is used)

+ + +
map
+

A covariate map (telling us the column number for covariates)

+ +
+
+

Value

+ + +

Returns a list, containing the data as a sparse matrix, the plpData covariateRef and a data.frame named map that tells us what covariate corresponds to each column -This object is a list with the following components:

-
data

A sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.

-
covariateRef

The plpData covariateRef.

-
map

A data.frame containing the data column ids and the corresponding covariateId from covariateRef.

+This object is a list with the following components:

data
+

A sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.

-
+
covariateRef
+

The plpData covariateRef.

-

Details

+
map
+

A data.frame containing the data column ids and the corresponding covariateId from covariateRef.

+ +
+
+

Details

This function converts the covariate file from ffdf in COO format into a sparse matrix from the package Matrix

+
-

Examples

-
#TODO - -
+
+

Examples

+
#TODO
+
+
+
+
+
-
- +
- - + + diff --git a/docs/reference/toSparsePython.html b/docs/reference/toSparsePython.html deleted file mode 100644 index 638d441e8..000000000 --- a/docs/reference/toSparsePython.html +++ /dev/null @@ -1,245 +0,0 @@ - - - - - - - - -Convert the plpData in COO format into a sparse python matrix — toSparsePython • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

Converts the standard plpData to a sparse matrix firectly into python

- -
- -
toSparsePython(plpData, population, map = NULL, temporal = F,
-  pythonExePath = NULL)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
plpData

An object of type plpData with covariate in coo format - the patient level prediction -data extracted from the CDM.

population

The population to include in the matrix

map

A covariate map (telling us the column number for covariates)

temporal

Whether to include timeId into tensor

pythonExePath

Location of python exe you want to use

- -

Value

- -

Returns a list, containing the python object name of the sparse matrix, the plpData covariateRef -and a data.frame named map that tells us what covariate corresponds to each column -This object is a list with the following components:

-
data

The python object name containing a sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.

-
covariateRef

The plpData covariateRef.

-
map

A data.frame containing the data column ids and the corresponding covariateId from covariateRef.

-
- - -

Details

- -

This function converts the covariate file from ffdf in COO format into a sparse matrix from -the package Matrix

- - -

Examples

-
#TODO - -
-
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/toSparseTorchPython.html b/docs/reference/toSparseTorchPython.html deleted file mode 100644 index 64c4fb439..000000000 --- a/docs/reference/toSparseTorchPython.html +++ /dev/null @@ -1,261 +0,0 @@ - - - - - - - - -Convert the plpData in COO format into a sparse python matrix using torch.sparse — toSparseTorchPython • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Converts the standard plpData to a sparse matrix firectly into python

-
- -
toSparseTorchPython(
-  plpData,
-  population,
-  map = NULL,
-  temporal = F,
-  pythonExePath = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
plpData

An object of type plpData with covariate in coo format - the patient level prediction -data extracted from the CDM.

population

The population to include in the matrix

map

A covariate map (telling us the column number for covariates)

temporal

Whether to include timeId into tensor

pythonExePath

Location of python exe you want to use

- -

Value

- -

Returns a list, containing the python object name of the sparse matrix, the plpData covariateRef -and a data.frame named map that tells us what covariate corresponds to each column -This object is a list with the following components:

-
data

The python object name containing a sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.

-
covariateRef

The plpData covariateRef.

-
map

A data.frame containing the data column ids and the corresponding covariateId from covariateRef.

- -
- -

Details

- -

This function converts the covariate file from ffdf in COO format into a sparse matrix from -the package Matrix

- -

Examples

-
#TODO - -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/toSparseTorchPython2.html b/docs/reference/toSparseTorchPython2.html deleted file mode 100644 index 79b057c34..000000000 --- a/docs/reference/toSparseTorchPython2.html +++ /dev/null @@ -1,245 +0,0 @@ - - - - - - - - -Convert the plpData in COO format into a sparse python matrix using torch.sparse — toSparseTorchPython2 • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
- -
-
- - -
- -

Converts the standard plpData to a sparse matrix firectly into python

- -
- -
toSparseTorchPython2(plpData, population, map = NULL, temporal = F,
-  pythonExePath = NULL)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
plpData

An object of type plpData with covariate in coo format - the patient level prediction -data extracted from the CDM.

population

The population to include in the matrix

map

A covariate map (telling us the column number for covariates)

temporal

Whether to include timeId into tensor

pythonExePath

Location of python exe you want to use

- -

Value

- -

Returns a list, containing the python object name of the sparse matrix, the plpData covariateRef -and a data.frame named map that tells us what covariate corresponds to each column -This object is a list with the following components:

-
data

The python object name containing a sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.

-
covariateRef

The plpData covariateRef.

-
map

A data.frame containing the data column ids and the corresponding covariateId from covariateRef.

-
- - -

Details

- -

This function converts the covariate file from ffdf in COO format into a sparse matrix from -the package Matrix

- - -

Examples

-
#TODO - -
-
- -
- -
- - -
-

Site built with pkgdown 1.3.0.

-
-
-
- - - - - - diff --git a/docs/reference/transferLearning.html b/docs/reference/transferLearning.html deleted file mode 100644 index 1cf29eb56..000000000 --- a/docs/reference/transferLearning.html +++ /dev/null @@ -1,279 +0,0 @@ - - - - - - - - -[Under development] Transfer learning — transferLearning • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

[Under development] Transfer learning

-
- -
transferLearning(
-  plpResult,
-  plpData,
-  population,
-  fixLayers = T,
-  includeTop = F,
-  addLayers = c(100, 10),
-  layerDropout = c(T, T),
-  layerActivation = c("relu", "softmax"),
-  outcomeWeight = 1,
-  batchSize = 10000,
-  epochs = 20
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
plpResult

The plp result when training a kersa deep learning model on big data

plpData

The new data to fine tune the model on

population

The population for the new data

fixLayers

boolean specificying whether to fix weights in model being transferred

includeTop

If TRUE the final layer of the model being transferred is removed

addLayers

vector specifying nodes in each layer to add e.g. c(100,10) will add another layer with 100 nodels and then a final layer with 10

layerDropout

Add dropout to each new layer (binary vector length of addLayers)

layerActivation

Activation function for each new layer (string vector length of addLayers)

outcomeWeight

The weight to assign the class 1 when training the model

batchSize

Size of each batch for updating layers

epochs

Number of epoches to run

- - -

Examples

-
if (FALSE) { -modelSet <- setDeepNN() -plpResult <- runPlp(plpData, population, modelSettings = modelSet, ...) - -transferLearning(...) -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/transportModel.html b/docs/reference/transportModel.html deleted file mode 100644 index 166870fe2..000000000 --- a/docs/reference/transportModel.html +++ /dev/null @@ -1,232 +0,0 @@ - - - - - - - - -Transports a plpModel to a new location and removes sensitive data — transportModel • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Transports a plpModel to a new location and removes sensitive data

-
- -
transportModel(plpModel, outputFolder)
- -

Arguments

- - - - - - - - - - -
plpModel

A trianed model.

outputFolder

The folder on the file system where the CSV files will be created. If the -folder does not yet exist it will be created.

- -

Details

- -

This function is used to

- -

Examples

-
if (FALSE) { -transportModel(plpModel, "s:/temp/exportTest") -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/transportPlp.html b/docs/reference/transportPlp.html deleted file mode 100644 index 57f63a709..000000000 --- a/docs/reference/transportPlp.html +++ /dev/null @@ -1,290 +0,0 @@ - - - - - - - - -Transports a plpResult to a new location and removed sensitive data — transportPlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
-
- - -
-

Transports a plpResult to a new location and removed sensitive data

-
- -
transportPlp(
-  plpResult,
-  modelName = NULL,
-  dataName = NULL,
-  outputFolder,
-  n = NULL,
-  includeEvaluationStatistics = T,
-  includeThresholdSummary = T,
-  includeDemographicSummary = T,
-  includeCalibrationSummary = T,
-  includePredictionDistribution = T,
-  includeCovariateSummary = T,
-  save = T,
-  reduceSize = F
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
plpResult

An object returned by running runPlp.

modelName

A string of the name of the model

dataName

A string of the name of the data

outputFolder

The folder on the file system where the CSV files will be created. If the -folder does not yet exist it will be created.

n

The minimum number of people required for each result summary to be included

includeEvaluationStatistics

Whether to include the evaluationStatistics

includeThresholdSummary

Whether to include the thresholdSummary

includeDemographicSummary

Whether to include the demographicSummary

includeCalibrationSummary

Whether to include the calibrationSummary

includePredictionDistribution

Whether to include the predictionDistribution

includeCovariateSummary

Whether to include the covariateSummary

save

Whether to save the result or just return the transportable object

reduceSize

Remove parts of runPlp object that are not needed but take up space

- -

Details

- -

This function is used to

- -

Examples

-
if (FALSE) { -transportPlp(plpResult, "s:/temp/exportTest", n=10) -} -
-
- -
- - -
- - -
-

Site built with pkgdown 1.6.1.

-
- -
-
- - - - - - - - diff --git a/docs/reference/validateExternal.html b/docs/reference/validateExternal.html new file mode 100644 index 000000000..ce79f9c15 --- /dev/null +++ b/docs/reference/validateExternal.html @@ -0,0 +1,188 @@ + +externalValidatePlp - Validate model performance on new data — validateExternal • PatientLevelPrediction + + +
+
+ + + +
+
+ + +
+

externalValidatePlp - Validate model performance on new data

+
+ +
+
validateExternal(
+  validationDesignList,
+  databaseDetails,
+  logSettings,
+  outputFolder
+)
+
+ +
+

Arguments

+
validationDesignList
+

A list of objects created with createValidationDesign

+ + +
databaseDetails
+

A list of objects of class +databaseDetails created using createDatabaseDetails

+ + +
logSettings
+

An object of logSettings created +using createLogSettings

+ + +
outputFolder
+

The directory to save the validation results to +(subfolders are created per database in validationDatabaseDetails)

+ +
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/validateMultiplePlp.html b/docs/reference/validateMultiplePlp.html index 06aac497f..c75da7891 100644 --- a/docs/reference/validateMultiplePlp.html +++ b/docs/reference/validateMultiplePlp.html @@ -1,68 +1,13 @@ - - - - - - - -externally validate the multiple plp models across new datasets — validateMultiplePlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -externally validate the multiple plp models across new datasets — validateMultiplePlp • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -185,69 +133,70 @@

externally validate the multiple plp models across new datasets

validates the models on new data

-
validateMultiplePlp(
-  analysesLocation,
-  validationDatabaseDetails,
-  validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
-  recalibrate = NULL,
-  saveDirectory = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
analysesLocation

The location where the multiple plp analyses are

validationDatabaseDetails

The validation database settings created using createDatabaseDetails()

validationRestrictPlpDataSettings

The settings specifying the extra restriction settings when extracting the data created using createRestrictPlpDataSettings().

recalibrate

A vector of recalibration methods (currently supports 'RecalibrationintheLarge' and/or 'weakRecalibration')

saveDirectory

The location to save to validation results

- -

Details

+
+
validateMultiplePlp(
+  analysesLocation,
+  validationDatabaseDetails,
+  validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
+  recalibrate = NULL,
+  cohortDefinitions = NULL,
+  saveDirectory = NULL
+)
+
+ +
+

Arguments

+
analysesLocation
+

The location where the multiple plp analyses are

+ +
validationDatabaseDetails
+

A single or list of validation database settings created using createDatabaseDetails()

+ + +
validationRestrictPlpDataSettings
+

The settings specifying the extra restriction settings when extracting the data created using createRestrictPlpDataSettings().

+ + +
recalibrate
+

A vector of recalibration methods (currently supports 'RecalibrationintheLarge' and/or 'weakRecalibration')

+ + +
cohortDefinitions
+

A list of cohortDefinitions

+ + +
saveDirectory
+

The location to save to validation results

+ +
+
+

Details

Users need to input a location where the results of the multiple plp analyses are found and the connection and database settings for the new data

+
+
-
- +
- - + + diff --git a/docs/reference/viewDatabaseResultPlp.html b/docs/reference/viewDatabaseResultPlp.html index 0b4be6fe9..6ff722a2e 100644 --- a/docs/reference/viewDatabaseResultPlp.html +++ b/docs/reference/viewDatabaseResultPlp.html @@ -1,67 +1,12 @@ - - - - - - - -open a local shiny app for viewing the result of a PLP analyses from a database — viewDatabaseResultPlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -open a local shiny app for viewing the result of a PLP analyses from a database — viewDatabaseResultPlp • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -183,78 +131,74 @@

open a local shiny app for viewing the result of a PLP analyses from a datab

open a local shiny app for viewing the result of a PLP analyses from a database

-
viewDatabaseResultPlp(
-  mySchema,
-  myServer,
-  myUser,
-  myPassword,
-  myDbms,
-  myPort = NULL,
-  myTableAppend
-)
+
+
viewDatabaseResultPlp(
+  mySchema,
+  myServer,
+  myUser,
+  myPassword,
+  myDbms,
+  myPort = NULL,
+  myTableAppend
+)
+
+ +
+

Arguments

+
mySchema
+

Database result schema containing the result tables

-

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
mySchema

Database result schema containing the result tables

myServer

server with the result database

myUser

Username for the connection to the result database

myPassword

Password for the connection to the result database

myDbms

database management system for the result database

myPort

Port for the connection to the result database

myTableAppend

A string appended to the results tables (optional)

-

Details

+
myServer
+

server with the result database

+ +
myUser
+

Username for the connection to the result database

+ + +
myPassword
+

Password for the connection to the result database

+ + +
myDbms
+

database management system for the result database

+ + +
myPort
+

Port for the connection to the result database

+ + +
myTableAppend
+

A string appended to the results tables (optional)

+ +
+
+

Details

Opens a shiny app for viewing the results of the models from a database

+
+
-
- +
- - + + diff --git a/docs/reference/viewMultiplePlp.html b/docs/reference/viewMultiplePlp.html index 89200fb9e..60cbf1a87 100644 --- a/docs/reference/viewMultiplePlp.html +++ b/docs/reference/viewMultiplePlp.html @@ -1,67 +1,12 @@ - - - - - - - -open a local shiny app for viewing the result of a multiple PLP analyses — viewMultiplePlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -open a local shiny app for viewing the result of a multiple PLP analyses — viewMultiplePlp • PatientLevelPrediction - - - - + + -
-
- -
- -
+
@@ -183,47 +131,43 @@

open a local shiny app for viewing the result of a multiple PLP analyses

open a local shiny app for viewing the result of a multiple PLP analyses

-
viewMultiplePlp(analysesLocation)
- -

Arguments

- - - - - - -
analysesLocation

The directory containing the results (with the analysis_x folders)

+
+
viewMultiplePlp(analysesLocation)
+
-

Details

+
+

Arguments

+
analysesLocation
+

The directory containing the results (with the analysis_x folders)

+
+
+

Details

Opens a shiny app for viewing the results of the models from various T,O, Tar and settings settings.

+
+
-
- +
- - + + diff --git a/docs/reference/viewPlp.html b/docs/reference/viewPlp.html index 7ad734083..5dc710218 100644 --- a/docs/reference/viewPlp.html +++ b/docs/reference/viewPlp.html @@ -1,67 +1,12 @@ - - - - - - - -viewPlp - Interactively view the performance and model settings — viewPlp • PatientLevelPrediction - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -viewPlp - Interactively view the performance and model settings — viewPlp • PatientLevelPrediction - - + + - - -
-
- -
- -
+
@@ -183,53 +131,56 @@

viewPlp - Interactively view the performance and model settings

This is a shiny app for viewing interactive plots of the performance and the settings

-
viewPlp(runPlp, validatePlp = NULL)
+
+
viewPlp(runPlp, validatePlp = NULL, diagnosePlp = NULL)
+
+ +
+

Arguments

+
runPlp
+

The output of runPlp() (an object of class 'runPlp')

+ -

Arguments

- - - - - - - - - - -
runPlp

The output of runPlp() (an object of class 'runPlp')

validatePlp

The output of externalValidatePlp (on object of class 'validatePlp')

+
validatePlp
+

The output of externalValidatePlp (on object of class 'validatePlp')

-

Value

-

Opens a shiny app for interactively viewing the results

-

Details

+
diagnosePlp
+

The output of diagnosePlp()

+
+
+

Value

+ + +

Opens a shiny app for interactively viewing the results

+
+
+

Details

Either the result of runPlp and view the plots

+
+
-
- +
- - + + diff --git a/docs/sitemap.xml b/docs/sitemap.xml new file mode 100644 index 000000000..5c677f919 --- /dev/null +++ b/docs/sitemap.xml @@ -0,0 +1,456 @@ + + + + /404.html + + + /articles/AddingCustomFeatureEngineering.html + + + /articles/AddingCustomModels.html + + + /articles/AddingCustomSamples.html + + + /articles/AddingCustomSplitting.html + + + /articles/BenchmarkTasks.html + + + /articles/BestPractices.html + + + /articles/BuildingMultiplePredictiveModels.html + + + /articles/BuildingPredictiveModels.html + + + /articles/ClinicalModels.html + + + /articles/ConstrainedPredictors.html + + + /articles/CreatingLearningCurves.html + + + /articles/CreatingNetworkStudies.html + + + /articles/InstallationGuide.html + + + /articles/Videos.html + + + /articles/index.html + + + /authors.html + + + /index.html + + + /news/index.html + + + /reference/MapIds.html + + + /reference/PatientLevelPrediction.html + + + /reference/accuracy.html + + + /reference/addDiagnosePlpToDatabase.html + + + /reference/addMultipleDiagnosePlpToDatabase.html + + + /reference/addMultipleRunPlpToDatabase.html + + + /reference/addRunPlpToDatabase.html + + + /reference/averagePrecision.html + + + /reference/brierScore.html + + + /reference/calibrationLine.html + + + /reference/computeAuc.html + + + /reference/computeGridPerformance.html + + + /reference/configurePython.html + + + /reference/covariateSummary.html + + + /reference/createCohortCovariateSettings.html + + + /reference/createDatabaseDetails.html + + + /reference/createDatabaseList.html + + + /reference/createDatabaseSchemaSettings.html + + + /reference/createDefaultExecuteSettings.html + + + /reference/createDefaultSplitSetting.html + + + /reference/createExecuteSettings.html + + + /reference/createFeatureEngineeringSettings.html + + + /reference/createLearningCurve.html + + + /reference/createLogSettings.html + + + /reference/createModelDesign.html + + + /reference/createPlpResultTables.html + + + 
/reference/createPreprocessSettings.html + + + /reference/createRandomForestFeatureSelection.html + + + /reference/createRestrictPlpDataSettings.html + + + /reference/createSampleSettings.html + + + /reference/createSplineSettings.html + + + /reference/createStratifiedImputationSettings.html + + + /reference/createStudyPopulation.html + + + /reference/createStudyPopulationSettings.html + + + /reference/createTempModelLoc.html + + + /reference/createUnivariateFeatureSelection.html + + + /reference/createValidationDesign.html + + + /reference/createValidationSettings.html + + + /reference/diagnoseMultiplePlp.html + + + /reference/diagnosePlp.html + + + /reference/diagnosticOddsRatio.html + + + /reference/evaluatePlp.html + + + /reference/externalValidateDbPlp.html + + + /reference/extractDatabaseToCsv.html + + + /reference/f1Score.html + + + /reference/falseDiscoveryRate.html + + + /reference/falseNegativeRate.html + + + /reference/falseOmissionRate.html + + + /reference/falsePositiveRate.html + + + /reference/fitPlp.html + + + /reference/getCalibrationSummary.html + + + /reference/getCohortCovariateData.html + + + /reference/getDemographicSummary.html + + + /reference/getPlpData.html + + + /reference/getPredictionDistribution.html + + + /reference/getPredictionDistribution_binary.html + + + /reference/getThresholdSummary.html + + + /reference/getThresholdSummary_binary.html + + + /reference/ici.html + + + /reference/index.html + + + /reference/insertCsvToDatabase.html + + + /reference/insertModelDesignInDatabase.html + + + /reference/insertResultsToSqlite.html + + + /reference/listAppend.html + + + /reference/listCartesian.html + + + /reference/loadPlpAnalysesJson.html + + + /reference/loadPlpData.html + + + /reference/loadPlpModel.html + + + /reference/loadPlpResult.html + + + /reference/loadPlpShareable.html + + + /reference/loadPrediction.html + + + /reference/migrateDataModel.html + + + /reference/modelBasedConcordance.html + + + 
/reference/negativeLikelihoodRatio.html + + + /reference/negativePredictiveValue.html + + + /reference/outcomeSurvivalPlot.html + + + /reference/pfi.html + + + /reference/plotDemographicSummary.html + + + /reference/plotF1Measure.html + + + /reference/plotGeneralizability.html + + + /reference/plotLearningCurve.html + + + /reference/plotPlp.html + + + /reference/plotPrecisionRecall.html + + + /reference/plotPredictedPDF.html + + + /reference/plotPredictionDistribution.html + + + /reference/plotPreferencePDF.html + + + /reference/plotSmoothCalibration.html + + + /reference/plotSparseCalibration.html + + + /reference/plotSparseCalibration2.html + + + /reference/plotSparseRoc.html + + + /reference/plotVariableScatterplot.html + + + /reference/plpDataSimulationProfile.html + + + /reference/positiveLikelihoodRatio.html + + + /reference/positivePredictiveValue.html + + + /reference/predictCyclops.html + + + /reference/predictPlp.html + + + /reference/preprocessData.html + + + /reference/recalibratePlp.html + + + /reference/recalibratePlpRefit.html + + + /reference/runMultiplePlp.html + + + /reference/runPlp.html + + + /reference/savePlpAnalysesJson.html + + + /reference/savePlpData.html + + + /reference/savePlpModel.html + + + /reference/savePlpResult.html + + + /reference/savePlpShareable.html + + + /reference/savePrediction.html + + + /reference/sensitivity.html + + + /reference/setAdaBoost.html + + + /reference/setCoxModel.html + + + /reference/setDecisionTree.html + + + /reference/setGradientBoostingMachine.html + + + /reference/setIterativeHardThresholding.html + + + /reference/setKNN.html + + + /reference/setLassoLogisticRegression.html + + + /reference/setLightGBM.html + + + /reference/setMLP.html + + + /reference/setNaiveBayes.html + + + /reference/setPythonEnvironment.html + + + /reference/setRandomForest.html + + + /reference/setSVM.html + + + /reference/simulatePlpData.html + + + /reference/sklearnFromJson.html + + + /reference/sklearnToJson.html + + + 
/reference/specificity.html + + + /reference/splitData.html + + + /reference/toSparseM.html + + + /reference/validateExternal.html + + + /reference/validateMultiplePlp.html + + + /reference/viewDatabaseResultPlp.html + + + /reference/viewMultiplePlp.html + + + /reference/viewPlp.html + + diff --git a/extras/PatientLevelPrediction.pdf b/extras/PatientLevelPrediction.pdf new file mode 100644 index 000000000..836c33b35 Binary files /dev/null and b/extras/PatientLevelPrediction.pdf differ diff --git a/extras/checkPlpInstallation.R b/extras/checkPlpInstallation.R index 5f0409c02..55de984f3 100644 --- a/extras/checkPlpInstallation.R +++ b/extras/checkPlpInstallation.R @@ -30,7 +30,7 @@ checkPlpInstallation <- function(connectionDetails=NULL, python=T) { sampleSize <- 2000 plpData <- simulatePlpData(plpDataSimulationProfile, n = sampleSize) - popSettings <- createStudyPopulationSettings( + populationSettings <- createStudyPopulationSettings( firstExposureOnly = FALSE, washoutPeriod = 0, removeSubjectsWithPriorOutcome = FALSE, @@ -47,7 +47,7 @@ checkPlpInstallation <- function(connectionDetails=NULL, python=T) { population <- tryCatch({createStudyPopulation( plpData = plpData, outcomeId = 2, - populationSettings = popSettings + populationSettings = populationSettings )}, error = function(e) { return(0) diff --git a/extras/testAllClassifiers.R b/extras/testAllClassifiers.R index dcc6ac44c..682232c49 100644 --- a/extras/testAllClassifiers.R +++ b/extras/testAllClassifiers.R @@ -1,3 +1,4 @@ +library(PatientLevelPrediction) connectionDetails <- Eunomia::getEunomiaConnectionDetails() Eunomia::createCohorts(connectionDetails) @@ -20,7 +21,7 @@ databaseDetails <- createDatabaseDetails( cohortTable = "cohort", outcomeDatabaseSchema = "main", outcomeTable = "cohort", - cohortId = 1, + targetId = 1, outcomeIds = 3, #make this ids cdmVersion = 5) @@ -67,7 +68,7 @@ plpResultEunomia9 <- PatientLevelPrediction::runPlp( modelSettings = setKNN(), logSettings = createLogSettings(), 
executeSettings = createDefaultExecuteSettings(), - saveDirectory = file.path(tempdir(), 'EunomiaTest8') + saveDirectory = file.path(tempdir(), 'EunomiaTest9') ) plpResultEunomia8 <- PatientLevelPrediction::runPlp( @@ -83,8 +84,9 @@ plpResultEunomia8 <- PatientLevelPrediction::runPlp( modelSettings = setSVM(), logSettings = createLogSettings(), executeSettings = createDefaultExecuteSettings(), - saveDirectory = file.path(tempdir(), 'EunomiaTest7') + saveDirectory = file.path(tempdir(), 'EunomiaTest8') ) +# issue with loading json - fixed by saving as pickle plpResultEunomia7 <- PatientLevelPrediction::runPlp( plpData = plpDataEunomia, @@ -99,7 +101,7 @@ plpResultEunomia7 <- PatientLevelPrediction::runPlp( modelSettings = setRandomForest(), logSettings = createLogSettings(), executeSettings = createDefaultExecuteSettings(), - saveDirectory = file.path(tempdir(), 'EunomiaTest6') + saveDirectory = file.path(tempdir(), 'EunomiaTest7') ) plpResultEunomia6 <- PatientLevelPrediction::runPlp( @@ -112,11 +114,12 @@ plpResultEunomia6 <- PatientLevelPrediction::runPlp( sampleSettings = createSampleSettings(), featureEngineeringSettings = createFeatureEngineeringSettings(), preprocessSettings = createPreprocessSettings(), - modelSettings = setMLP(), + modelSettings = setMLP(hiddenLayerSizes = list(c(10))), logSettings = createLogSettings(), executeSettings = createDefaultExecuteSettings(), - saveDirectory = file.path(tempdir(), 'EunomiaTest5') + saveDirectory = file.path(tempdir(), 'EunomiaTest6') ) +# invalid hiddenLayerSizes can cause error plpResultEunomia5 <- PatientLevelPrediction::runPlp( plpData = plpDataEunomia, @@ -131,9 +134,9 @@ plpResultEunomia5 <- PatientLevelPrediction::runPlp( modelSettings = setNaiveBayes(), logSettings = createLogSettings(), executeSettings = createDefaultExecuteSettings(), - saveDirectory = file.path(tempdir(), 'EunomiaTest4') + saveDirectory = file.path(tempdir(), 'EunomiaTest5') ) - +# worked plpResultEunomia3 <- 
PatientLevelPrediction::runPlp( plpData = plpDataEunomia, @@ -148,8 +151,9 @@ plpResultEunomia3 <- PatientLevelPrediction::runPlp( modelSettings = setAdaBoost(), logSettings = createLogSettings(), executeSettings = createDefaultExecuteSettings(), - saveDirectory = file.path(tempdir(), 'EunomiaTest2') + saveDirectory = file.path(tempdir(), 'EunomiaTest3') ) +# worked plpResultEunomia4 <- PatientLevelPrediction::runPlp( plpData = plpDataEunomia, @@ -161,12 +165,12 @@ plpResultEunomia4 <- PatientLevelPrediction::runPlp( sampleSettings = createSampleSettings(), featureEngineeringSettings = createFeatureEngineeringSettings(), preprocessSettings = createPreprocessSettings(), - modelSettings = setDecisionTree(maxFeatures = list(50,'auto', NULL)), + modelSettings = setDecisionTree(maxFeatures = list(50,'sqrt', NULL)), logSettings = createLogSettings(), executeSettings = createDefaultExecuteSettings(), - saveDirectory = file.path(tempdir(), 'EunomiaTest3') + saveDirectory = file.path(tempdir(), 'EunomiaTest4') ) - +# DT error! 
plpResultEunomia2 <- PatientLevelPrediction::runPlp( @@ -184,10 +188,10 @@ plpResultEunomia2 <- PatientLevelPrediction::runPlp( nthread = c(10), earlyStopRound = c(25), maxDepth = c(4), - minRows = c(5), learnRate = c(0.2) ), logSettings = createLogSettings(), executeSettings = createDefaultExecuteSettings(), - saveDirectory = file.path(tempdir(), 'EunomiaTest') + saveDirectory = file.path(tempdir(), 'EunomiaTest2') ) +# worked \ No newline at end of file diff --git a/inst/doc/AddingCustomAlgorithms.tex b/inst/doc/AddingCustomAlgorithms.tex deleted file mode 100644 index 125ff3257..000000000 --- a/inst/doc/AddingCustomAlgorithms.tex +++ /dev/null @@ -1,617 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - 
pdftitle={Adding Custom Patient-Level Prediction Algorithms}, - pdfauthor={Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} 
-\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother -\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{5} -\usepackage{fancyhdr} -\pagestyle{fancy} -\fancyhead{} -\fancyhead[CO,CE]{Custom Patient-Level Prediction Algorithms} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 3.1.0} -\fancyfoot[LE,RO]{\thepage} -\renewcommand{\headrulewidth}{0.4pt} -\renewcommand{\footrulewidth}{0.4pt} - -\title{Adding Custom Patient-Level Prediction Algorithms} -\author{Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. 
Rijnbeek} -\date{2020-06-03} - -\begin{document} -\maketitle - -{ -\setcounter{tocdepth}{2} -\tableofcontents -} -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -This vignette describes how you can add your own custom algorithms in -the Observational Health Data Sciencs and Informatics (OHDSI) -\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}} -package. This allows you to fully leverage the OHDSI -PatientLevelPrediction framework for model development and validation. -This vignette assumes you have read and are comfortable with building -single patient level prediction models as described in the -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels} -vignette}. - -\textbf{We invite you to share your new algorithms with the OHDSI -community through our -\href{http://github.com/OHDSI/PatientLevelPrediction}{GitHub -repository}.} - -\hypertarget{algorithm-code-structure}{% -\section{Algorithm Code Structure}\label{algorithm-code-structure}} - -Each algorithm in the package should be implemented in its own -\textless Name\textgreater.R file, e.g.~KNN.R, containing a -set\textless Name\textgreater{} function and a -fit\textless Name\textgreater{} function. Furthermore, a corresponding -predict function in predict.R is needed (if there isn't one available -that would work, see example at the end of the document). We will now -describe each of these functions in more detail below. - -\hypertarget{set}{% -\subsection{Set}\label{set}} - -The set\textless Name\textgreater{} is a function that takes as input -the different hyper-parameter values to do a grid search when training. 
-The output of the functions needs to be a list as class -\texttt{modelSettings} containing: - -\begin{itemize} -\tightlist -\item - param - all the combinations of the hyper-parameter values input -\item - model - a string specifying what function to call to fit the model -\item - name - a string containing the name of the model. -\end{itemize} - -For example, if you were adding a model called madeUp that has two -hyper-parameters then the set function should be: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{setMadeUp <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(}\DataTypeTok{a=}\DecValTok{1}\NormalTok{, }\DataTypeTok{b=}\DecValTok{2}\NormalTok{, }\DataTypeTok{seed=}\OtherTok{NULL}\NormalTok{)\{} - \CommentTok{# add input checks here...} - - \CommentTok{# now create list of all combinations:} -\NormalTok{ result <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{model=}\StringTok{'fitMadeUp'}\NormalTok{, }\CommentTok{# this will be called to train the made up model} - \DataTypeTok{param=} \KeywordTok{split}\NormalTok{(}\KeywordTok{expand.grid}\NormalTok{(}\DataTypeTok{a=}\NormalTok{a, } - \DataTypeTok{b=}\NormalTok{b,} - \DataTypeTok{seed=}\KeywordTok{ifelse}\NormalTok{(}\KeywordTok{is.null}\NormalTok{(seed),}\StringTok{'NULL'}\NormalTok{, seed)),} - \DecValTok{1}\OperatorTok{:}\NormalTok{(}\KeywordTok{length}\NormalTok{(a)}\OperatorTok{*}\KeywordTok{length}\NormalTok{(b) )),} - \DataTypeTok{name=}\StringTok{'Made Up Algorithm'} -\NormalTok{ )} - \KeywordTok{class}\NormalTok{(result) <-}\StringTok{ 'modelSettings'} - - \KeywordTok{return}\NormalTok{(result)} -\NormalTok{\}} -\end{Highlighting} -\end{Shaded} - -\hypertarget{fit}{% -\subsection{Fit}\label{fit}} - -This function should train your custom model for each parameter entry, -pick the best parameters and train a final model for that setting. 
- -The fit\textless Model\textgreater{} should have as inputs: - -\begin{itemize} -\tightlist -\item - population - the study popualation the model is being developed on -\item - plpData - the plpData object -\item - param - the hyper-parameters as a list of all combinations -\item - quiet - T or F indicating whether to output progress -\item - outcomeId - the outcome id -\item - cohortId - the target population id -\end{itemize} - -The fit function should return a list of class \texttt{plpModel} with -the following objects: - -\begin{itemize} -\tightlist -\item - model - a trained model -\item - modelSettings - a list containing the model and input param -\item - trainCVAuc - a value with the train AUC value -\item - hyperParamSearch - a dataframe with the hyperparameter grid and - corresponding AUCs -\item - metaData - the metaData from the plpData object -\item - populationSettings - the settings used to create the population and - define the time-at-risk -\item - outcomeId - the outcomeId being predicted -\item - cohortId - the cohortId corresponding to the target cohort -\item - varImp - a dataframe with the covaraites and a measure of importance -\item - trainingTime - how long it took to develop/evaluate the model -\item - covariateMap - if the plpData are converted to a matrix for model - compatibility this tells us what covariate each row in the matrix - correpsonds to and is need when implementing the model on new data -\end{itemize} - -The plpModel returned by fit also has a type attribute, this points to -the predict function, for example -\texttt{attr(result,\ \textquotesingle{}type\textquotesingle{})\ \textless{}-\ \textquotesingle{}madeup\textquotesingle{}} -means when the model is applied to new data, the `predict.madeup' -function in Predict.R is called. if this doesnt exist, then the model -will fail. 
Another attribute is the predictionType -\texttt{attr(result,\ \textquotesingle{}predictionType\textquotesingle{})\ \textless{}-\ \textquotesingle{}binary\textquotesingle{}} -this is currently not needed but may be important in the future when we -expand to regression or multiclass classification. - -For example: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{fitMadeUp <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(population, plpData, param, }\DataTypeTok{quiet=}\NormalTok{F,} -\NormalTok{ outcomeId, cohortId, ...)\{} - - \CommentTok{# **************** code to train the model here} - \CommentTok{# trainedModel <- this code should apply each hyper-parameter using the cross validation} - \CommentTok{# then pick out the best hyper-parameter setting} - \CommentTok{# and finally fit a model on the whole train data using the } - \CommentTok{# optimal hyper-parameter settings} - \CommentTok{# ****************} - - \CommentTok{# construct the standard output for a model:} -\NormalTok{ result <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{model =}\NormalTok{ trainedModel,} - \DataTypeTok{modelSettings =} \KeywordTok{list}\NormalTok{(}\DataTypeTok{model=}\StringTok{'made_up'}\NormalTok{, }\DataTypeTok{modelParameters=}\NormalTok{param),} - \DataTypeTok{trainCVAuc =} \OtherTok{NULL}\NormalTok{,} - \DataTypeTok{hyperParamSearch =}\NormalTok{ hyperSummary,} - \DataTypeTok{metaData =}\NormalTok{ plpData}\OperatorTok{$}\NormalTok{metaData,} - \DataTypeTok{populationSettings =} \KeywordTok{attr}\NormalTok{(population, }\StringTok{'metaData'}\NormalTok{),} - \DataTypeTok{outcomeId=}\NormalTok{outcomeId,}\CommentTok{# can use populationSettings$outcomeId?} - \DataTypeTok{cohortId=}\NormalTok{cohortId,} - \DataTypeTok{varImp =} \OtherTok{NULL}\NormalTok{,} - \DataTypeTok{trainingTime=}\NormalTok{comp,} - \DataTypeTok{covariateMap=}\NormalTok{result}\OperatorTok{$}\NormalTok{map} -\NormalTok{ )} - \KeywordTok{class}\NormalTok{(result) <-}\StringTok{ 
'plpModel'} - \KeywordTok{attr}\NormalTok{(result, }\StringTok{'type'}\NormalTok{) <-}\StringTok{ 'madeup'} - \KeywordTok{attr}\NormalTok{(result, }\StringTok{'predictionType'}\NormalTok{) <-}\StringTok{ 'binary'} - \KeywordTok{return}\NormalTok{(result)} - -\NormalTok{\}} -\end{Highlighting} -\end{Shaded} - -You could make the fitMadeUp function cleaner by adding helper function -in the MadeUp.R file that are called by the fit function. As the end of -the fit function specified -\texttt{attr(result,\ \textquotesingle{}type\textquotesingle{})\ \textless{}-\ \textquotesingle{}madeup\textquotesingle{}} -we also need to make sure there is a \texttt{predict.madeup} function in -Predict.R: - -\hypertarget{predict}{% -\subsection{Predict}\label{predict}} - -The prediction function takes as input the plpModel returned by fit, a -population and corresponding plpData. It returns a data.frame with the -columns: - -\begin{itemize} -\tightlist -\item - rowId - the id for each person in the population -\item - value - the predicted risk from the plpModel -\end{itemize} - -If the population contains the columns outcomeCount and indexes, then -these are also in the output. 
- -For example: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{predict.madeup <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(plpModel,population, plpData, ...)\{ } - - \CommentTok{# ************* code to do prediction for each rowId in population} - \CommentTok{# prediction <- code to do prediction here returning columns: rowId } - \CommentTok{# and value (predicted risk)} - \CommentTok{#**************} - -\NormalTok{ prediction <-}\StringTok{ }\KeywordTok{merge}\NormalTok{(population, prediction, }\DataTypeTok{by=}\StringTok{'rowId'}\NormalTok{)} -\NormalTok{ prediction <-}\StringTok{ }\NormalTok{prediction[,}\KeywordTok{colnames}\NormalTok{(prediction)}\OperatorTok{%in%}\KeywordTok{c}\NormalTok{(}\StringTok{'rowId'}\NormalTok{,}\StringTok{'outcomeCount'}\NormalTok{,} - \StringTok{'indexes'}\NormalTok{, }\StringTok{'value'}\NormalTok{)] } - \KeywordTok{attr}\NormalTok{(prediction, }\StringTok{"metaData"}\NormalTok{) <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{predictionType =} \StringTok{"binary"}\NormalTok{) } - \KeywordTok{return}\NormalTok{(prediction)} - -\NormalTok{\}} -\end{Highlighting} -\end{Shaded} - -\hypertarget{algorithm-example}{% -\section{Algorithm Example}\label{algorithm-example}} - -Below a fully functional algorithm example is given, however we highly -recommend you to have a look at the available algorithms in the package. 
- -\hypertarget{set-1}{% -\subsection{Set}\label{set-1}} - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{setMadeUp <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(}\DataTypeTok{a=}\DecValTok{1}\NormalTok{, }\DataTypeTok{b=}\DecValTok{2}\NormalTok{, }\DataTypeTok{seed=}\OtherTok{NULL}\NormalTok{)\{} - \CommentTok{# check a is valid positive value} - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{missing}\NormalTok{(a))\{} - \KeywordTok{stop}\NormalTok{(}\StringTok{'a must be input'}\NormalTok{)} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{class}\NormalTok{(a)}\OperatorTok{%in%}\KeywordTok{c}\NormalTok{(}\StringTok{'numeric'}\NormalTok{,}\StringTok{'integer'}\NormalTok{)\{} - \KeywordTok{stop}\NormalTok{(}\StringTok{'a must be numeric'}\NormalTok{)} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(a }\OperatorTok{<}\StringTok{ }\DecValTok{0}\NormalTok{)\{} - \KeywordTok{stop}\NormalTok{(}\StringTok{'a must be positive'}\NormalTok{)} -\NormalTok{ \}} - \CommentTok{# check b is numeric} - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{missing}\NormalTok{(b))\{} - \KeywordTok{stop}\NormalTok{(}\StringTok{'b must be input'}\NormalTok{)} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{class}\NormalTok{(b)}\OperatorTok{%in%}\KeywordTok{c}\NormalTok{(}\StringTok{'numeric'}\NormalTok{,}\StringTok{'integer'}\NormalTok{)\{} - \KeywordTok{stop}\NormalTok{(}\StringTok{'b must be numeric'}\NormalTok{)} -\NormalTok{ \}} - - \CommentTok{# now create list of all combinations:} -\NormalTok{ result <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{model=}\StringTok{'fitMadeUp'}\NormalTok{, } - \DataTypeTok{param=} \KeywordTok{split}\NormalTok{(}\KeywordTok{expand.grid}\NormalTok{(}\DataTypeTok{a=}\NormalTok{a, } - \DataTypeTok{b=}\NormalTok{b,} - \DataTypeTok{seed=}\KeywordTok{ifelse}\NormalTok{(}\KeywordTok{is.null}\NormalTok{(seed),}\StringTok{'NULL'}\NormalTok{, seed)),} - 
\DecValTok{1}\OperatorTok{:}\NormalTok{(}\KeywordTok{length}\NormalTok{(a)}\OperatorTok{*}\KeywordTok{length}\NormalTok{(b) )),} - \DataTypeTok{name=}\StringTok{'Made Up Algorithm'} -\NormalTok{ )} - \KeywordTok{class}\NormalTok{(result) <-}\StringTok{ 'modelSettings'} - - \KeywordTok{return}\NormalTok{(result)} - - -\ErrorTok{\}} -\end{Highlighting} -\end{Shaded} - -\hypertarget{fit-1}{% -\subsection{Fit}\label{fit-1}} - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{fitMadeUp <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(population, plpData, param, }\DataTypeTok{quiet=}\NormalTok{F,} -\NormalTok{ outcomeId, cohortId, ...)\{} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\NormalTok{quiet)} - \KeywordTok{writeLines}\NormalTok{(}\StringTok{'Training Made Up model'}\NormalTok{)} - - \ControlFlowTok{if}\NormalTok{(param[[}\DecValTok{1}\NormalTok{]]}\OperatorTok{$}\NormalTok{seed}\OperatorTok{!=}\StringTok{'NULL'}\NormalTok{)} - \KeywordTok{set.seed}\NormalTok{(param[[}\DecValTok{1}\NormalTok{]]}\OperatorTok{$}\NormalTok{seed)} - - \CommentTok{# check plpData is coo format:} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\StringTok{'ffdf'}\OperatorTok{%in%}\KeywordTok{class}\NormalTok{(plpData}\OperatorTok{$}\NormalTok{covariates) )} - \KeywordTok{stop}\NormalTok{(}\StringTok{'This algorithm requires plpData in coo format'}\NormalTok{)} - -\NormalTok{ metaData <-}\StringTok{ }\KeywordTok{attr}\NormalTok{(population, }\StringTok{'metaData'}\NormalTok{)} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{is.null}\NormalTok{(population}\OperatorTok{$}\NormalTok{indexes))} -\NormalTok{ population <-}\StringTok{ }\NormalTok{population[population}\OperatorTok{$}\NormalTok{indexes}\OperatorTok{>}\DecValTok{0}\NormalTok{,]} - \KeywordTok{attr}\NormalTok{(population, }\StringTok{'metaData'}\NormalTok{) <-}\StringTok{ }\NormalTok{metaData} - - \CommentTok{# convert data into sparse R Matrix:} -\NormalTok{ result <-}\StringTok{ 
}\KeywordTok{toSparseM}\NormalTok{(plpData,population,}\DataTypeTok{map=}\OtherTok{NULL}\NormalTok{)} -\NormalTok{ data <-}\StringTok{ }\NormalTok{result}\OperatorTok{$}\NormalTok{data} - -\NormalTok{ data <-}\StringTok{ }\NormalTok{data[population}\OperatorTok{$}\NormalTok{rowId,]} - - \CommentTok{# set test/train sets (for printing performance as it trains)} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\NormalTok{quiet)} - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste0}\NormalTok{(}\StringTok{'Training made up model on train set containing '}\NormalTok{, }\KeywordTok{nrow}\NormalTok{(population), } - \StringTok{' people with '}\NormalTok{,}\KeywordTok{sum}\NormalTok{(population}\OperatorTok{$}\NormalTok{outcomeCount}\OperatorTok{>}\DecValTok{0}\NormalTok{), }\StringTok{' outcomes'}\NormalTok{))} -\NormalTok{ start <-}\StringTok{ }\KeywordTok{Sys.time}\NormalTok{()} - - \CommentTok{#============= STEP 1 ======================================} - \CommentTok{# pick the best hyper-params and then do final training on all data...} - \KeywordTok{writeLines}\NormalTok{(}\StringTok{'train'}\NormalTok{)} -\NormalTok{ datas <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{population=}\NormalTok{population, }\DataTypeTok{data=}\NormalTok{data)} -\NormalTok{ param.sel <-}\StringTok{ }\KeywordTok{lapply}\NormalTok{(param, }\ControlFlowTok{function}\NormalTok{(x) }\KeywordTok{do.call}\NormalTok{(made_up_model, }\KeywordTok{c}\NormalTok{(x,datas) ))} -\NormalTok{ hyperSummary <-}\StringTok{ }\KeywordTok{do.call}\NormalTok{(rbind, }\KeywordTok{lapply}\NormalTok{(param.sel, }\ControlFlowTok{function}\NormalTok{(x) x}\OperatorTok{$}\NormalTok{hyperSum))} -\NormalTok{ hyperSummary <-}\StringTok{ }\KeywordTok{as.data.frame}\NormalTok{(hyperSummary)} -\NormalTok{ hyperSummary}\OperatorTok{$}\NormalTok{auc <-}\StringTok{ }\KeywordTok{unlist}\NormalTok{(}\KeywordTok{lapply}\NormalTok{(param.sel, }\ControlFlowTok{function}\NormalTok{(x) 
x}\OperatorTok{$}\NormalTok{auc)) } -\NormalTok{ param.sel <-}\StringTok{ }\KeywordTok{unlist}\NormalTok{(}\KeywordTok{lapply}\NormalTok{(param.sel, }\ControlFlowTok{function}\NormalTok{(x) x}\OperatorTok{$}\NormalTok{auc))} -\NormalTok{ param <-}\StringTok{ }\NormalTok{param[[}\KeywordTok{which.max}\NormalTok{(param.sel)]]} - - \CommentTok{# set this so you do a final model train } -\NormalTok{ param}\OperatorTok{$}\NormalTok{final=T} - - \KeywordTok{writeLines}\NormalTok{(}\StringTok{'final train'}\NormalTok{)} -\NormalTok{ trainedModel <-}\StringTok{ }\KeywordTok{do.call}\NormalTok{(made_up_model, }\KeywordTok{c}\NormalTok{(param,datas) )}\OperatorTok{$}\NormalTok{model} - -\NormalTok{ comp <-}\StringTok{ }\KeywordTok{Sys.time}\NormalTok{() }\OperatorTok{-}\StringTok{ }\NormalTok{start} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\NormalTok{quiet)} - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste0}\NormalTok{(}\StringTok{'Model Made Up trained - took:'}\NormalTok{, }\KeywordTok{format}\NormalTok{(comp, }\DataTypeTok{digits=}\DecValTok{3}\NormalTok{)))} - - \CommentTok{# construct the standard output for a model:} -\NormalTok{ result <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{model =}\NormalTok{ trainedModel,} - \DataTypeTok{modelSettings =} \KeywordTok{list}\NormalTok{(}\DataTypeTok{model=}\StringTok{'made_up'}\NormalTok{, }\DataTypeTok{modelParameters=}\NormalTok{param),} - \DataTypeTok{trainCVAuc =} \OtherTok{NULL}\NormalTok{,} - \DataTypeTok{hyperParamSearch =}\NormalTok{ hyperSummary,} - \DataTypeTok{metaData =}\NormalTok{ plpData}\OperatorTok{$}\NormalTok{metaData,} - \DataTypeTok{populationSettings =} \KeywordTok{attr}\NormalTok{(population, }\StringTok{'metaData'}\NormalTok{),} - \DataTypeTok{outcomeId=}\NormalTok{outcomeId,}\CommentTok{# can use populationSettings$outcomeId?} - \DataTypeTok{cohortId=}\NormalTok{cohortId,} - \DataTypeTok{varImp =} \OtherTok{NULL}\NormalTok{,} - \DataTypeTok{trainingTime=}\NormalTok{comp,} - 
\DataTypeTok{covariateMap=}\NormalTok{result}\OperatorTok{$}\NormalTok{map} -\NormalTok{ )} - \KeywordTok{class}\NormalTok{(result) <-}\StringTok{ 'plpModel'} - \KeywordTok{attr}\NormalTok{(result, }\StringTok{'type'}\NormalTok{) <-}\StringTok{ 'madeup'} - \KeywordTok{attr}\NormalTok{(result, }\StringTok{'predictionType'}\NormalTok{) <-}\StringTok{ 'binary'} - \KeywordTok{return}\NormalTok{(result)} - -\NormalTok{\}} -\end{Highlighting} -\end{Shaded} - -\hypertarget{helpers}{% -\subsection{Helpers}\label{helpers}} - -In the fit model a helper function \texttt{made\_up\_model} is called, -this is the function that trains a model given the data and population -(where the popualtion contains a column outcomeCount corresponding to -the outcome). Both the data and population are ordered the same way: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{made_up_model <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(data, population,} - \DataTypeTok{a=}\DecValTok{1}\NormalTok{,}\DataTypeTok{b=}\DecValTok{1}\NormalTok{, }\DataTypeTok{final=}\NormalTok{F, ...)\{} - - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}\StringTok{'Training Made Up model with '}\NormalTok{,}\KeywordTok{length}\NormalTok{(}\KeywordTok{unique}\NormalTok{(population}\OperatorTok{$}\NormalTok{indexes)),} - \StringTok{' fold CV'}\NormalTok{))} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{is.null}\NormalTok{(population}\OperatorTok{$}\NormalTok{indexes) }\OperatorTok{&&}\StringTok{ }\NormalTok{final}\OperatorTok{==}\NormalTok{F)\{} -\NormalTok{ index_vect <-}\StringTok{ }\KeywordTok{unique}\NormalTok{(population}\OperatorTok{$}\NormalTok{indexes)} -\NormalTok{ perform <-}\StringTok{ }\KeywordTok{c}\NormalTok{()} - - \CommentTok{# create prediction matrix to store all predictions} -\NormalTok{ predictionMat <-}\StringTok{ }\NormalTok{population} -\NormalTok{ predictionMat}\OperatorTok{$}\NormalTok{value <-}\StringTok{ }\DecValTok{0} - 
\KeywordTok{attr}\NormalTok{(predictionMat, }\StringTok{"metaData"}\NormalTok{) <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{predictionType =} \StringTok{"binary"}\NormalTok{)} - - \ControlFlowTok{for}\NormalTok{(index }\ControlFlowTok{in} \DecValTok{1}\OperatorTok{:}\KeywordTok{length}\NormalTok{(index_vect ))\{} - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}\StringTok{'Fold '}\NormalTok{,index, }\StringTok{' -- with '}\NormalTok{, }\KeywordTok{sum}\NormalTok{(population}\OperatorTok{$}\NormalTok{indexes}\OperatorTok{!=}\NormalTok{index),} - \StringTok{'train rows'}\NormalTok{))} -\NormalTok{ model <-}\StringTok{ }\NormalTok{madeup}\OperatorTok{::}\KeywordTok{model}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ data[population}\OperatorTok{$}\NormalTok{indexes}\OperatorTok{!=}\NormalTok{index,],} - \DataTypeTok{y=}\NormalTok{ population}\OperatorTok{$}\NormalTok{outcomeCount[population}\OperatorTok{$}\NormalTok{indexes}\OperatorTok{!=}\NormalTok{index],} - \DataTypeTok{a=}\NormalTok{a, }\DataTypeTok{b=}\NormalTok{b)} - -\NormalTok{ pred <-}\StringTok{ }\NormalTok{stats}\OperatorTok{::}\KeywordTok{predict}\NormalTok{(model, data[population}\OperatorTok{$}\NormalTok{indexes}\OperatorTok{==}\NormalTok{index,])} -\NormalTok{ prediction <-}\StringTok{ }\NormalTok{population[population}\OperatorTok{$}\NormalTok{indexes}\OperatorTok{==}\NormalTok{index,]} -\NormalTok{ prediction}\OperatorTok{$}\NormalTok{value <-}\StringTok{ }\NormalTok{pred} - \KeywordTok{attr}\NormalTok{(prediction, }\StringTok{"metaData"}\NormalTok{) <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{predictionType =} \StringTok{"binary"}\NormalTok{)} -\NormalTok{ aucVal <-}\StringTok{ }\KeywordTok{computeAuc}\NormalTok{(prediction)} -\NormalTok{ perform <-}\StringTok{ }\KeywordTok{c}\NormalTok{(perform,aucVal)} - - \CommentTok{# add the fold predictions and compute AUC after loop} -\NormalTok{ 
predictionMat}\OperatorTok{$}\NormalTok{value[population}\OperatorTok{$}\NormalTok{indexes}\OperatorTok{==}\NormalTok{index] <-}\StringTok{ }\NormalTok{pred} - -\NormalTok{ \}} - \CommentTok{##auc <- mean(perform) # want overal rather than mean} -\NormalTok{ auc <-}\StringTok{ }\KeywordTok{computeAuc}\NormalTok{(predictionMat)} - -\NormalTok{ foldPerm <-}\StringTok{ }\NormalTok{perform} -\NormalTok{ \} }\ControlFlowTok{else}\NormalTok{ \{} -\NormalTok{ model <-}\StringTok{ }\NormalTok{madeup}\OperatorTok{::}\KeywordTok{model}\NormalTok{(}\DataTypeTok{x=}\NormalTok{ data, } - \DataTypeTok{y=}\NormalTok{ population}\OperatorTok{$}\NormalTok{outcomeCount,} - \DataTypeTok{a=}\NormalTok{a,}\DataTypeTok{b=}\NormalTok{b)} - -\NormalTok{ pred <-}\StringTok{ }\NormalTok{stats}\OperatorTok{::}\KeywordTok{predict}\NormalTok{(model, data)} -\NormalTok{ prediction <-}\StringTok{ }\NormalTok{population} -\NormalTok{ prediction}\OperatorTok{$}\NormalTok{value <-}\StringTok{ }\NormalTok{pred} - \KeywordTok{attr}\NormalTok{(prediction, }\StringTok{"metaData"}\NormalTok{) <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{predictionType =} \StringTok{"binary"}\NormalTok{) } -\NormalTok{ auc <-}\StringTok{ }\KeywordTok{computeAuc}\NormalTok{(prediction)} -\NormalTok{ foldPerm <-}\StringTok{ }\NormalTok{auc} -\NormalTok{ \}} - -\NormalTok{ result <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{model=}\NormalTok{model,} - \DataTypeTok{auc=}\NormalTok{auc,} - \DataTypeTok{hyperSum =} \KeywordTok{unlist}\NormalTok{(}\KeywordTok{list}\NormalTok{(}\DataTypeTok{a =}\NormalTok{ a, }\DataTypeTok{b =}\NormalTok{ b, }\DataTypeTok{fold_auc=}\NormalTok{foldPerm))} -\NormalTok{ )} - \KeywordTok{return}\NormalTok{(result)} -\NormalTok{\}} -\end{Highlighting} -\end{Shaded} - -\hypertarget{predict-1}{% -\subsection{Predict}\label{predict-1}} - -The final step is to create a predict function for the model. This gets -added to the predict.R file. 
In the example above the type -\texttt{attr(result,\ \textquotesingle{}type\textquotesingle{})\ \textless{}-\ \textquotesingle{}madeup\textquotesingle{}} -was madeup, so a \texttt{predict.madeup} function is required to be -added into the predict.R. The predict function needs to take as input -the plpModel returned by the fit function, the population to apply the -model on and the plpData specifying the covariates of the population. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{predict.madeup <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(plpModel,population, plpData, ...)\{ } -\NormalTok{ result <-}\StringTok{ }\KeywordTok{toSparseM}\NormalTok{(plpData, population, }\DataTypeTok{map=}\NormalTok{plpModel}\OperatorTok{$}\NormalTok{covariateMap)} -\NormalTok{ data <-}\StringTok{ }\NormalTok{result}\OperatorTok{$}\NormalTok{data[population}\OperatorTok{$}\NormalTok{rowId,]} -\NormalTok{ prediction <-}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}\DataTypeTok{rowId=}\NormalTok{population}\OperatorTok{$}\NormalTok{rowId, } - \DataTypeTok{value=}\NormalTok{stats}\OperatorTok{::}\KeywordTok{predict}\NormalTok{(plpModel}\OperatorTok{$}\NormalTok{model, data)} -\NormalTok{ )} - -\NormalTok{ prediction <-}\StringTok{ }\KeywordTok{merge}\NormalTok{(population, prediction, }\DataTypeTok{by=}\StringTok{'rowId'}\NormalTok{)} -\NormalTok{ prediction <-}\StringTok{ }\NormalTok{prediction[,}\KeywordTok{colnames}\NormalTok{(prediction)}\OperatorTok{%in%} -\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{'rowId'}\NormalTok{,}\StringTok{'outcomeCount'}\NormalTok{,}\StringTok{'indexes'}\NormalTok{, }\StringTok{'value'}\NormalTok{)] }\CommentTok{# need to fix no index issue} - \KeywordTok{attr}\NormalTok{(prediction, }\StringTok{"metaData"}\NormalTok{) <-}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DataTypeTok{predictionType =} \StringTok{"binary"}\NormalTok{) } - \KeywordTok{return}\NormalTok{(prediction)} - -\NormalTok{\}} -\end{Highlighting} -\end{Shaded} - -As the 
madeup model uses the standard R prediction, it has the same -prediction function as xgboost, so we could have not added a new -prediction function and instead made the type of the result returned by -fitMadeUpModel to -\texttt{attr(result,\ \textquotesingle{}type\textquotesingle{})\ \textless{}-\ \textquotesingle{}xgboost\textquotesingle{}}. - -\hypertarget{acknowledgments}{% -\section{Acknowledgments}\label{acknowledgments}} - -Considerable work has been dedicated to provide the -\texttt{PatientLevelPrediction} package. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## -## To cite PatientLevelPrediction in publications use: -## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and -## implementation of a standardized framework to generate and evaluate patient-level -## prediction models using observational healthcare data." _Journal of the American -## Medical Informatics Association_, *25*(8), 969-975. . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. 
-2018;25(8):969-975.} - -This work is supported in part through the National Science Foundation -grant IIS 1251151. - -\end{document} diff --git a/inst/doc/AddingCustomFeatureEngineering.pdf b/inst/doc/AddingCustomFeatureEngineering.pdf index 4d97db3c5..e4e8220ce 100644 Binary files a/inst/doc/AddingCustomFeatureEngineering.pdf and b/inst/doc/AddingCustomFeatureEngineering.pdf differ diff --git a/inst/doc/BuildingDeepLearningModels.pdf b/inst/doc/BuildingDeepLearningModels.pdf deleted file mode 100644 index a496daccb..000000000 Binary files a/inst/doc/BuildingDeepLearningModels.pdf and /dev/null differ diff --git a/inst/doc/BuildingDeepLearningModels.tex b/inst/doc/BuildingDeepLearningModels.tex deleted file mode 100644 index 8917b0849..000000000 --- a/inst/doc/BuildingDeepLearningModels.tex +++ /dev/null @@ -1,741 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} 
-\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Building Deep Learning Models}, - pdfauthor={Peter R. Rijnbeek, Seng Chan You, Xiaoyong Pan, Jenna Reps}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} 
-\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{longtable,booktabs} -% Correct order of tables after \paragraph or \subparagraph -\usepackage{etoolbox} -\makeatletter -\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{} -\makeatother -% Allow footnotes in longtable head/foot -\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}} -\makesavenoteenv{longtable} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother -\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} 
-\setcounter{secnumdepth}{5} -\usepackage{fancyhdr} -\pagestyle{fancy} -\fancyhead{} -\fancyhead[CO,CE]{Building Deep Learning Models} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 3.1.0} -\fancyfoot[LE,RO]{\thepage} -\renewcommand{\headrulewidth}{0.4pt} -\renewcommand{\footrulewidth}{0.4pt} - -\title{Building Deep Learning Models} -\author{Peter R. Rijnbeek, Seng Chan You, Xiaoyong Pan, Jenna Reps} -\date{2020-06-03} - -\begin{document} -\maketitle - -{ -\setcounter{tocdepth}{2} -\tableofcontents -} -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -Electronic Health Records (EHR) data is high dimensional, heterogeneous, -and sparse, which makes predictive modelling a challenge. In the early -days, the machine learning community mainly focused on algorithm -development, currently there is a shift to more powerful feature -engineering. Deep Learning models are widely used to automatically learn -high-level feature representations from the data, and have achieved -remarkable results in image processing, speech recognition and -computational biology. Recently, interesting results have been shown -using EHRs, but more extensive research is needed to assess the power of -Deep Learning in this domain. - -This vignette describes how you can use the Observational Health Data -Sciences and Informatics (OHDSI) -\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}} -package to build Deep Learning models. This vignette assumes you have -read and are comfortable with building patient level prediction models -as described in the -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels} -vignette}. Furthermore, this vignette assumes you are familiar with Deep -Learning methods. 
- -\hypertarget{background}{% -\section{Background}\label{background}} - -Deep Learning models are build by stacking an often large number of -neural network layers that perform feature engineering steps, e.g -embedding, and are collapsed in a final softmax layer (basically a -logistic regression layer). These algorithms need a lot of data to -converge to a good representation, but currently the sizes of the EHR -databases are growing fast which would make Deep Learning an interesting -approach to test within OHDSI's -\href{https://academic.oup.com/jamia/article/25/8/969/4989437}{Patient-Level -Prediction Framework}. The current implementation allows us to perform -research at scale on the value and limitations of Deep Learning using -EHR data. For relatively small Target and Outcome cohorts, Deep Learning -is most probably not the best choice. - -Most current Deep Learning research is performed in python and we have -developed a pipeline to interact with python. Multiple Deep Learning -backends have been developed, e.g.~Tensorflow, PyTorch, Keras (recently -also available in R) etc. In the package we have implemented interaction -with Keras in R and PyTorch in Python but we invite the community to add -other backends. - -Many network architectures have recently been proposed and we have -implemented a number of them, however, this list will grow in the near -future. It is important to understand that some of these architectures -require a 2D data matrix, -i.e.~\textbar patient\textbar x\textbar feature\textbar, and others use -a 3D data matrix -\textbar patient\textbar x\textbar feature\textbar x\textbar time\textbar. -The \href{www.github.com/ohdsi/FeatureExtraction}{FeatureExtraction -Package} has been extended to enable the extraction of both data formats -as will be described with examples below. - -Note that training Deep Learning models is computationally intensive, -our implementation therefore supports both GPU and CPU. 
It will -automatically check whether there is GPU or not in your computer. A GPU -is highly recommended for Deep Learning! - -\hypertarget{non-temporal-architectures}{% -\section{Non-Temporal Architectures}\label{non-temporal-architectures}} - -We implemented the following non-temporal (2D data matrix) architectures -using PyTorch: - -\begin{verbatim} -1) Logistics regression (LRTorch) - A simple softmax layer with l2 regularization - -2) Feed forward network (MLPTorch) - Supports multilayer perceptron (mlp_type = MLP) and - Self-Normalizing Neural Networks (mlp_type = SNN) - Reference: https://arxiv.org/abs/1706.02515 -\end{verbatim} - -For the above two methods, we implemented support for a stacked -autoencoder and a variational autoencoder to reduce the feature -dimension as a first step. These autoencoders learn efficient data -encodings in an unsupervised manner by stacking multiple layers in a -neural network. Compared to the standard implementations of LR and MLP -these implementations can use the GPU power to speed up the gradient -descent approach in the back propagation to optimize the weights of the -classifier. 
- -Table 1: Non-Temporal Deep Learning Models Hyper-Parameters - -\begin{longtable}[]{@{}lll@{}} -\toprule -\begin{minipage}[b]{0.10\columnwidth}\raggedright -Name\strut -\end{minipage} & \begin{minipage}[b]{0.34\columnwidth}\raggedright -Description\strut -\end{minipage} & \begin{minipage}[b]{0.47\columnwidth}\raggedright -Hyper-parameters\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.10\columnwidth}\raggedright -LRTorch\strut -\end{minipage} & \begin{minipage}[t]{0.34\columnwidth}\raggedright -Logistic Regression Model\strut -\end{minipage} & \begin{minipage}[t]{0.47\columnwidth}\raggedright -w\_decay (l2 regularization), epochs (number of epochs), class\_weight -(0 = inverse ratio between number of positive and negative examples, -1 -= focal loss (\url{https://arxiv.org/abs/1708.02002}), or other), -autoencoder (apply stacked autoencoder?, vae (apply variational -autoencoder)\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.10\columnwidth}\raggedright -MLPTorch\strut -\end{minipage} & \begin{minipage}[t]{0.34\columnwidth}\raggedright -Multi-Layer Perceptron Model\strut -\end{minipage} & \begin{minipage}[t]{0.47\columnwidth}\raggedright -mlp\_type (MLP = default, SNN = self-normalizing neural network), size -(number of hidden nodes), w\_decay (l2 regularization), epochs (number -of epochs), class\_weight(0 = inverse ratio between number of positive -and negative examples, -1 = focal loss, or other), autoencoder (apply -stacked autoencoder), vae (apply variational autoencoder?)\strut -\end{minipage}\tabularnewline -\bottomrule -\end{longtable} - -\#\#Example The approach for logistic regression (LRTorch) and the -Multi-Layer Perceptron (MLPTorch) is identical. Here we will take -LRTorch as an example. 
- -You need to generate a \texttt{population} and \texttt{plpData} object -as described in more detail in -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels} -vignette}. - -Alternatively, you can make use of the data simulator. The following -code snippet creates a population of 12000 patients. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{)} -\KeywordTok{data}\NormalTok{(plpDataSimulationProfile)} -\NormalTok{sampleSize <-}\StringTok{ }\DecValTok{12000} -\NormalTok{plpData <-}\StringTok{ }\KeywordTok{simulatePlpData}\NormalTok{(} -\NormalTok{ plpDataSimulationProfile,} - \DataTypeTok{n =}\NormalTok{ sampleSize} -\NormalTok{)} - -\NormalTok{population <-}\StringTok{ }\KeywordTok{createStudyPopulation}\NormalTok{(} -\NormalTok{ plpData,} - \DataTypeTok{outcomeId =} \DecValTok{2}\NormalTok{,} - \DataTypeTok{binary =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{firstExposureOnly =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{washoutPeriod =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{removeSubjectsWithPriorOutcome =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{priorOutcomeLookback =} \DecValTok{99999}\NormalTok{,} - \DataTypeTok{requireTimeAtRisk =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{minTimeAtRisk =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{riskWindowStart =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{addExposureDaysToStart =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,} - \DataTypeTok{addExposureDaysToEnd =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"INFO"} -\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -As an example we will build a LRTorch model. We could specify the -stacked autoencoder or the variational autoencoder to be used for -reducing the feature dimension as an initial layer, but for this example -we do not. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{autoencoder <-}\StringTok{ }\OtherTok{FALSE} -\NormalTok{vae <-}\StringTok{ }\OtherTok{FALSE} -\end{Highlighting} -\end{Shaded} - -We added a class\_weight for imbalanced data, the default value 0 is the -inverse ratio between negatives and positives,-1 applies focal loss. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{class_weight <-}\StringTok{ }\DecValTok{0} -\end{Highlighting} -\end{Shaded} - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# Specify the settings for Logistics regression model using Torch in Python} -\NormalTok{model <-}\StringTok{ }\KeywordTok{setLRTorch}\NormalTok{(}\DataTypeTok{autoencoder=}\NormalTok{autoencoder, }\DataTypeTok{vae=}\NormalTok{vae, }\DataTypeTok{class_weight=}\NormalTok{class_weight)} -\end{Highlighting} -\end{Shaded} - -No we define our modelling parameters. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{testFraction <-}\StringTok{ }\FloatTok{0.2} -\NormalTok{testSplit <-}\StringTok{ 'person'} -\NormalTok{nfold <-}\StringTok{ }\DecValTok{3} -\NormalTok{splitSeed <-}\StringTok{ }\DecValTok{1000} -\end{Highlighting} -\end{Shaded} - -And we train and internally validate the model. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{results <-}\StringTok{ }\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{runPlp}\NormalTok{(}\DataTypeTok{population =}\NormalTok{ population, } - \DataTypeTok{plpData =}\NormalTok{ plpData, } - \DataTypeTok{modelSettings =}\NormalTok{ model,} - \DataTypeTok{testSplit=}\NormalTok{testSplit,} - \DataTypeTok{testFraction=}\NormalTok{testFraction,} - \DataTypeTok{nfold=}\NormalTok{nfold, } - \DataTypeTok{splitSeed=}\NormalTok{splitSeed) } -\end{Highlighting} -\end{Shaded} - -\hypertarget{temporal-architectures}{% -\section{Temporal Architectures}\label{temporal-architectures}} - -Several architectures are implemented that can handle temporal data in -PyTorch and R Keras. 
- -\hypertarget{pytorch-cnn}{% -\subsection{PyTorch CNN}\label{pytorch-cnn}} - -We implemented the following \textbf{convolutional} models described in -\url{https://github.com/clinicalml/deepDiagnosis} in CNNTorch: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\item - Temporal Convolutional neural network over a backward window (type = - cnn) - - \includegraphics{arch1.png} -\item - Convolutional neural network over input and time dimension (type = - mix) - - \includegraphics{conv_arch2.png} -\item - Multi-resolution temporal convolutional neural network (type = multi) - - \includegraphics{conv_arch1.png} -\end{enumerate} - -Furthermore, we added the following achitectures: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\setcounter{enumi}{3} -\item - CNN with filters with three different parallel kernel sizes (3,4,5) - and a fully connected layers (type = mlf) - - \includegraphics{cnn_mlf2.png} -\item - LSTM network over the backward window (type = lstm) - - \includegraphics{cnn_lstm.png} -\item - Residual Learning Network as described in: - \url{https://arxiv.org/abs/1512.03385} (type = resnet) - - This a very big network, see the paper for the topology. 
-\end{enumerate} - -\begin{longtable}[]{@{}ll@{}} -\toprule -\begin{minipage}[b]{0.26\columnwidth}\raggedright -parameter\strut -\end{minipage} & \begin{minipage}[b]{0.68\columnwidth}\raggedright -description\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.26\columnwidth}\raggedright -nbfilters\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -The number of convolution filters\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -epochs\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -The number of epochs\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -seed\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -Random seed\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -class\_weight\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -The class weight used for imbalanced data (0: Inverse ratio between -positives and negatives, -1: Focal loss, or number)\strut -\end{minipage}\tabularnewline -\bottomrule -\end{longtable} - -\hypertarget{pytorch-rnn}{% -\subsection{PyTorch RNN}\label{pytorch-rnn}} - -The following \textbf{recurrent neural network} models are implemented -in RNNTorch: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\item - RNN with one LSTM layer fed into one fully connected layer (type = - RNN) - - \includegraphics{lstm_last.png} -\item - RNN with one bidirectional LSTM layer fed into one fully connected - layer (type = BiRNN) - - This network looks the same as above but then as a bi-directional - version -\item - One Gated Recurrent Unit layer fed into one fully connected layers - (type = GRU) - - This network looks the same as above but then implemented as GRU -\end{enumerate} - -The following hyper-parameters can be set for these PyTorch models: - -\begin{longtable}[]{@{}ll@{}} -\toprule 
-\begin{minipage}[b]{0.26\columnwidth}\raggedright -parameter\strut -\end{minipage} & \begin{minipage}[b]{0.68\columnwidth}\raggedright -description\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.26\columnwidth}\raggedright -hidden\_size\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -The number of features in hidden state\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -epochs\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -The number of epochs\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -seed\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -Random seed\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -class\_weight\strut -\end{minipage} & \begin{minipage}[t]{0.68\columnwidth}\raggedright -The class weight used for imbalanced data (0: Inverse ratio between -positives and negatives, -1: Focal loss, or number)\strut -\end{minipage}\tabularnewline -\bottomrule -\end{longtable} - -\newpage - -\hypertarget{r-keras-cnn}{% -\subsection{R Keras CNN}\label{r-keras-cnn}} - -The following temporal architectures as described in -\url{https://arxiv.org/pdf/1608.00647.pdf} were implemented using R -Keras: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi}.} -\item - Multi-resolution CovNN model (CovNN.R) - - \includegraphics{covcnn.png} -\item - Convolution across data and time according(CovNN2.R) - - \includegraphics{covcnn2.png} - - \newpage -\end{enumerate} - -Furthermore, a custom build RNN is added that uses a variational -autoencoder. 
- -\begin{enumerate} -\def\labelenumi{\arabic{enumi}.} -\setcounter{enumi}{2} -\item - Clinically Informing application based on Recurrent Neural Network - (CIReNN.R) - - \includegraphics{cirenn.png} -\end{enumerate} - -Table 2: Temporal Deep Learning Models - -\begin{longtable}[]{@{}ll@{}} -\toprule -\begin{minipage}[b]{0.11\columnwidth}\raggedright -Model\strut -\end{minipage} & \begin{minipage}[b]{0.83\columnwidth}\raggedright -Hyper-parameters\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.11\columnwidth}\raggedright -CovNN\strut -\end{minipage} & \begin{minipage}[t]{0.83\columnwidth}\raggedright -batchSize (The number of samples to used in each batch during model -training), outcomeWeight (The weight assigned to the outcome), lr (The -learning rate), decay (The decay of the learning rate), dropout -({[}currently not used{]} the dropout rate for regularization), epochs -(The number of times data is used to train the model, e.g., epoches=1 -means data only used once to train), filters (The number of columns -output by each convolution), kernelSize (The number of time dimensions -used for each convolution), loss (The loss function implemented), seed -(The random seed)\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -CovNN2\strut -\end{minipage} & \begin{minipage}[t]{0.83\columnwidth}\raggedright -batchSize (The number of samples to used in each batch during model -training), outcomeWeight (The weight assigned to the outcome), lr (The -learning rate), decay (The decay of the learning rate), dropout -({[}currently not used{]} the dropout rate for regularization), epochs -(The number of times data is used to train the model, e.g., epoches=1 -means data only used once to train), filters (The number of columns -output by each convolution), kernelSize (The number of time dimensions -used for each convolution), loss (The loss function implemented), seed -(The random seed)\strut 
-\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -CIReNN\strut -\end{minipage} & \begin{minipage}[t]{0.83\columnwidth}\raggedright -units (The number of units of RNN layer - as a list of vectors), -recurrentDropout (The reccurrent dropout rate), layerDropout (The layer -dropout rate), lr (Learning rate), decay (Learning rate decay over each -update), outcomeWeight (The weight of the outcome class in the loss -function), batchSize (The number of data points to use per training -batch), epochs (Number of times to iterate over data set), -earlyStoppingMinDelta (Minimum change in the monitored quantity to -qualify as an improvement for early stopping, i.e.~an absolute change of -less than min\_delta in loss of validation data, will count as no -improvement), earlyStoppingPatience (Number of epochs with no -improvement after which training will be stopped), seed (Random seed -used by Deep Learning model)\strut -\end{minipage}\tabularnewline -\bottomrule -\end{longtable} - -\hypertarget{example}{% -\subsection{Example}\label{example}} - -We will now show how to use the temporal models by using CNNTorch as an -example. - -You need to generate a \texttt{population} and \texttt{plpData} object -as described in more detail in -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels} -vignette}. 
- -Note that for these algorithms you need to extracted temporal data as -described in the {[}FeatureExtraction vignette{]} -(\url{https://github.com/OHDSI/FeatureExtraction/blob/master/inst/doc/UsingFeatureExtraction.pdf}) -as follows: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{settings <-}\StringTok{ }\KeywordTok{createTemporalCovariateSettings}\NormalTok{(}\DataTypeTok{useConditionEraStart =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{useConditionEraOverlap =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{useConditionOccurrence =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{useConditionEraGroupStart =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{useConditionEraGroupOverlap =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{useDrugExposure =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{useDrugEraStart =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{useDrugEraOverlap =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{useMeasurement =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{useMeasurementValue =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useMeasurementRangeGroup =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{useProcedureOccurrence =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{useDeviceExposure =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{useObservation =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{excludedCovariateConceptIds =} \KeywordTok{c}\NormalTok{(}\DecValTok{316866}\NormalTok{),} - \DataTypeTok{addDescendantsToExclude =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{temporalStartDays =} \KeywordTok{seq}\NormalTok{(}\DataTypeTok{from =} \DecValTok{-365}\NormalTok{, } - \DataTypeTok{to =} \DecValTok{-1}\NormalTok{, }\DataTypeTok{by =} \DecValTok{12}\NormalTok{), } - \DataTypeTok{temporalEndDays =} \KeywordTok{c}\NormalTok{(}\KeywordTok{seq}\NormalTok{(}\DataTypeTok{from =} \DecValTok{-353}\NormalTok{, } - \DataTypeTok{to =} \DecValTok{0}\NormalTok{, }\DataTypeTok{by =} \DecValTok{12}\NormalTok{), }\DecValTok{0}\NormalTok{))} - 
-\NormalTok{plpData <-}\StringTok{ }\KeywordTok{getPlpData}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} - \DataTypeTok{cohortDatabaseSchema =} \StringTok{"results"}\NormalTok{,} - \DataTypeTok{cohortTable =} \StringTok{"cohort"}\NormalTok{,} - \DataTypeTok{cohortId =} \DecValTok{11}\NormalTok{,} - \DataTypeTok{covariateSettings =}\NormalTok{ settings,} - \DataTypeTok{outcomeDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,} - \DataTypeTok{outcomeTable =} \StringTok{"cohort"}\NormalTok{,} - \DataTypeTok{outcomeIds =} \DecValTok{25}\NormalTok{,} - \DataTypeTok{cdmVersion =} \DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Each CNN/RNN has several hyper-parameters that can be set as shown in -the Tables above, but for this example we take the defaults. - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# specify the the CNN} -\NormalTok{model <-}\StringTok{ }\KeywordTok{setCNNTorch}\NormalTok{(}\DataTypeTok{cnn_type=}\StringTok{'CNN'}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Run the model training, for example with a testFraction = 0.2 and a -split by person: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{results <-}\StringTok{ }\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{runPlp}\NormalTok{(population, plpData, model,} - \DataTypeTok{testSplit=}\StringTok{'person'}\NormalTok{,} - \DataTypeTok{testFraction=}\FloatTok{0.2}\NormalTok{,} - \DataTypeTok{nfold=}\DecValTok{3}\NormalTok{, } - \DataTypeTok{splitSeed=}\DecValTok{1000}\NormalTok{) } -\end{Highlighting} -\end{Shaded} - -\hypertarget{apply-the-trained-deep-learning-model}{% -\section{Apply the trained Deep Learning -model}\label{apply-the-trained-deep-learning-model}} - -Applying a Deep Learning is identical to the other models in the -package: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# load the trained model} -\NormalTok{plpModel <-}\StringTok{ 
}\KeywordTok{loadPlpModel}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{""}\NormalTok{)} - -\CommentTok{# load the new plpData (should have the same temporal features!) and create the population} -\NormalTok{plpData <-}\StringTok{ }\KeywordTok{loadPlpData}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{""}\NormalTok{)} - -\NormalTok{populationSettings <-}\StringTok{ }\NormalTok{plpModel}\OperatorTok{$}\NormalTok{populationSettings} -\NormalTok{populationSettings}\OperatorTok{$}\NormalTok{plpData <-}\StringTok{ }\NormalTok{plpData} -\NormalTok{population <-}\StringTok{ }\KeywordTok{do.call}\NormalTok{(createStudyPopulation, populationSettings) } - -\CommentTok{# apply the trained model on the new data} -\NormalTok{validationResults <-}\StringTok{ }\KeywordTok{applyModel}\NormalTok{(population, plpData, plpModel)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{adding-new-architectures}{% -\section{Adding new architectures}\label{adding-new-architectures}} - -It is possible to add new architectures in our framework using PyTorch -or R Keras. We are happy to help you with this, please post your -questions on the -\href{www.github.com/OHDSI/PatientLevelPrediction/issues}{issue tracker} -of the package. - -\hypertarget{acknowledgments}{% -\section{Acknowledgments}\label{acknowledgments}} - -Considerable work has been dedicated to provide the -\texttt{PatientLevelPrediction} package. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## -## To cite PatientLevelPrediction in publications use: -## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and -## implementation of a standardized framework to generate and evaluate patient-level -## prediction models using observational healthcare data." _Journal of the American -## Medical Informatics Association_, *25*(8), 969-975. . 
-## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. -2018;25(8):969-975.} - -\end{document} diff --git a/inst/doc/BuildingEnsembleModels.pdf b/inst/doc/BuildingEnsembleModels.pdf deleted file mode 100644 index 37e714ebf..000000000 Binary files a/inst/doc/BuildingEnsembleModels.pdf and /dev/null differ diff --git a/inst/doc/BuildingEnsembleModels.tex b/inst/doc/BuildingEnsembleModels.tex deleted file mode 100644 index 85bc73667..000000000 --- a/inst/doc/BuildingEnsembleModels.tex +++ /dev/null @@ -1,369 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim 
environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Building Ensemble Models}, - pdfauthor={Xiaoyong Pan, Jenna Reps, Peter R. Rijnbeek}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} 
-\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother 
-\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{5} -\usepackage{fancyhdr} -\pagestyle{fancy} -\fancyhead{} -\fancyhead[CO,CE]{Installation Guide} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 3.1.0} -\fancyfoot[LE,RO]{\thepage} -\renewcommand{\headrulewidth}{0.4pt} -\renewcommand{\footrulewidth}{0.4pt} - -\title{Building Ensemble Models} -\author{Xiaoyong Pan, Jenna Reps, Peter R. Rijnbeek} -\date{2020-06-03} - -\begin{document} -\maketitle - -{ -\setcounter{tocdepth}{2} -\tableofcontents -} -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -Ensemble models combine several models to improve the overall -performance. Traditionally, weak learners were combined to boost -performance but recent results show that combining several strong -approaches can also result in a better performance. There are many -examples in literature where ensemble models outperform individual -models using stacking, i.e.~a final logistic regresssion layer accross -the individual model outputs, but other approaches like weigthing has -also shown promising results. - -This vignette describes how you can use the Observational Health Data -Sciencs and Informatics (OHDSI) -\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}} -package to build ensemble models. This vignette assumes you have read -and are comfortable with building single patient level prediction models -as described in the -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels} -vignette}. - -This will enable studying ensemble methods at scale in the OHDSI data -network. 
- -\begin{figure} -\centering -\includegraphics{ensemble.png} -\caption{Ensemble model} -\end{figure} - -In PatientLevelPrediction package, four ensemble strategies have been -implemented: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi}.} -\tightlist -\item - average ensemble: Calculate the average probability from individual - models -\item - product ensemble: Calculate the product of probabilites from - individual models. -\item - weighted ensemble: Calculate the weighted average probability from - individual models using train AUC as weights. -\item - stacked ensemble: Train a logistics regression on outputs from - individual models -\end{enumerate} - -\hypertarget{usage}{% -\section{Usage}\label{usage}} - -Use the -\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}} -package to generate a \texttt{population} and \texttt{plpData} object. -Alternatively, you can make use of the data simulator. The following -code snippet creates a population of 12000 patients. 
- -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{data}\NormalTok{(plpDataSimulationProfile)} -\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{)} -\NormalTok{sampleSize <-}\StringTok{ }\DecValTok{2000} -\NormalTok{plpData <-}\StringTok{ }\KeywordTok{simulatePlpData}\NormalTok{(} -\NormalTok{ plpDataSimulationProfile,} - \DataTypeTok{n =}\NormalTok{ sampleSize} -\NormalTok{)} - -\NormalTok{population <-}\StringTok{ }\KeywordTok{createStudyPopulation}\NormalTok{(} -\NormalTok{ plpData,} - \DataTypeTok{outcomeId =} \DecValTok{2}\NormalTok{,} - \DataTypeTok{binary =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{firstExposureOnly =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{washoutPeriod =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{removeSubjectsWithPriorOutcome =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{priorOutcomeLookback =} \DecValTok{99999}\NormalTok{,} - \DataTypeTok{requireTimeAtRisk =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{minTimeAtRisk =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{riskWindowStart =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{addExposureDaysToStart =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,} - \DataTypeTok{addExposureDaysToEnd =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"INFO"} -\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Specify the prediction algorithms to be combined. - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# Use LASSO logistic regression and Random Forest as base predictors} -\NormalTok{model1 <-}\StringTok{ }\KeywordTok{setLassoLogisticRegression}\NormalTok{()} -\NormalTok{model2 <-}\StringTok{ }\KeywordTok{setRandomForest}\NormalTok{()} -\end{Highlighting} -\end{Shaded} - -Specify a test fraction and a sequence of training set fractions. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{testFraction <-}\StringTok{ }\FloatTok{0.2} -\end{Highlighting} -\end{Shaded} - -Specify an ensembleStrategy to combine multiple predictors. The strategy -used for ensembling the outputs from different models, it can be `mean', -`product', `weighted' and `stacked': `mean' the average probability from -differnt models `product' the product rule `weighted' the weighted -average probability from different models using train AUC as weights. -`stacked' the stakced ensemble trains a logistics regression on -different models. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ensembleStrategy <-}\StringTok{ 'stacked'} -\end{Highlighting} -\end{Shaded} - -Specify the test split to be used. - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# Use a split by person, alterantively a time split is possible} -\NormalTok{testSplit <-}\StringTok{ 'person'} -\end{Highlighting} -\end{Shaded} - -Run the ensemble learning to combine model1 and model2. You can also use -different plpData for different models. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ensembleResults <-}\StringTok{ }\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{runEnsembleModel}\NormalTok{(population, } - \DataTypeTok{dataList =} \KeywordTok{list}\NormalTok{(plpData, plpData), } - \DataTypeTok{modelList =} \KeywordTok{list}\NormalTok{(model1, model2),} - \DataTypeTok{testSplit=}\NormalTok{testSplit,} - \DataTypeTok{testFraction=}\NormalTok{testFraction,} - \DataTypeTok{nfold=}\DecValTok{3}\NormalTok{, }\DataTypeTok{splitSeed=}\DecValTok{1000}\NormalTok{, } - \DataTypeTok{ensembleStrategy =}\NormalTok{ ensembleStrategy) } -\end{Highlighting} -\end{Shaded} - -\hypertarget{saving-and-loading-the-ensemble-model}{% -\subsection{Saving and loading the ensemble -model}\label{saving-and-loading-the-ensemble-model}} - -You can save and load the model using: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{saveEnsemblePlpModel}\NormalTok{(ensembleResults}\OperatorTok{$}\NormalTok{model, }\DataTypeTok{dirPath =} \KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))} -\NormalTok{ensembleModel <-}\StringTok{ }\KeywordTok{loadEnsemblePlpModel}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{apply-ensemble-model}{% -\section{Apply Ensemble model}\label{apply-ensemble-model}} - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{plpData <-}\StringTok{ }\KeywordTok{loadPlpData}\NormalTok{(}\StringTok{""}\NormalTok{)} -\NormalTok{populationSettings <-}\StringTok{ }\NormalTok{ensembleModel}\OperatorTok{$}\NormalTok{populationSettings} -\NormalTok{populationSettings}\OperatorTok{$}\NormalTok{plpData <-}\StringTok{ }\NormalTok{plpData} -\NormalTok{population <-}\StringTok{ }\KeywordTok{do.call}\NormalTok{(createStudyPopulation, populationSettings)} -\end{Highlighting} -\end{Shaded} - -Load the model. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ensembleModel <-}\StringTok{ }\KeywordTok{loadEnsemblePlpModel}\NormalTok{(}\StringTok{""}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Get the predictions by applying the model: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{prediction <-}\StringTok{ }\KeywordTok{applyEnsembleModel}\NormalTok{(population,} - \DataTypeTok{dataList =} \KeywordTok{list}\NormalTok{(plpData, plpData),} - \DataTypeTok{ensembleModel =}\NormalTok{ ensembleModel)}\OperatorTok{$}\NormalTok{prediction} -\end{Highlighting} -\end{Shaded} - -\hypertarget{demo}{% -\section{Demo}\label{demo}} - -We have added a demo of the ensemble training: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# Show all demos in our package: } - \KeywordTok{demo}\NormalTok{(}\DataTypeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)} - -\CommentTok{# Run the learning curve} - \KeywordTok{demo}\NormalTok{(}\StringTok{"EnsembleModelDemo"}\NormalTok{, }\DataTypeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{acknowledgments}{% -\section{Acknowledgments}\label{acknowledgments}} - -Considerable work has been dedicated to provide the -\texttt{PatientLevelPrediction} package. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## -## To cite PatientLevelPrediction in publications use: -## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and -## implementation of a standardized framework to generate and evaluate patient-level -## prediction models using observational healthcare data." _Journal of the American -## Medical Informatics Association_, *25*(8), 969-975. . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. 
Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. -2018;25(8):969-975.} - -\end{document} diff --git a/inst/doc/BuildingMultiplePredictiveModels.tex b/inst/doc/BuildingMultiplePredictiveModels.tex deleted file mode 100644 index a4e00e6a2..000000000 --- a/inst/doc/BuildingMultiplePredictiveModels.tex +++ /dev/null @@ -1,449 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - 
\usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Automatically Build Multiple Patient-Level Predictive Models}, - pdfauthor={Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} 
-\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother -\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{5} -\usepackage{fancyhdr} -\pagestyle{fancy} -\fancyhead{} 
-\fancyhead[CO,CE]{Automatically Build Multiple Patient-Level Predictive Models} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 3.1.0} -\fancyfoot[LE,RO]{\thepage} -\renewcommand{\headrulewidth}{0.4pt} -\renewcommand{\footrulewidth}{0.4pt} - -\title{Automatically Build Multiple Patient-Level Predictive Models} -\author{Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek} -\date{2020-06-03} - -\begin{document} -\maketitle - -{ -\setcounter{tocdepth}{2} -\tableofcontents -} -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -In our -\href{https://academic.oup.com/jamia/article/25/8/969/4989437}{\texttt{paper}}, -we propose a standardised framework for patient-level prediction that -utilizes the OMOP CDM and standardized vocabularies, and describe the -open-source software that we developed implementing the framework's -pipeline. The framework is the first to enforce existing best practice -guidelines and will enable open dissemination of models that can be -extensively validated across the network of OHDSI collaborators. - -One our best practices is that we see the selection of models and all -study setting as an emperical question, i.e.~we should use a data-driven -approach in which we try many settings. This vignette describes how you -can use the Observational Health Data Sciencs and Informatics (OHDSI) -\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}} -package to automatically build multiple patient-level predictive models, -e.g.~different population settings, covariate settings, and -modelsetting. This vignette assumes you have read and are comfortable -with building single patient level prediction models as described in the -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels} -vignette}. 
- -Note that it is also possible to generate a Study Package directly in -Atlas that allows for multiple patient-level prediction analyses this is -out-of-scope for this vignette. - -\hypertarget{creating-the-setting-lists}{% -\section{Creating the setting lists}\label{creating-the-setting-lists}} - -To develop multiple models the user has to create a list of Study -Populations Settings, Covariate Settings, and Model Settings. These -lists will then be combined in a Model Analysis List and all -combinations of the elements in this list will be automatically run by -the package. - -\hypertarget{study-population-settings}{% -\subsection{Study population settings}\label{study-population-settings}} - -Suppose we like to make the following three population settings: - -\begin{itemize} -\tightlist -\item - study population 1: allows persons who have the outcome to leave the - database before the end of time-at-risk and only those without the - outcome who are observed for the whole time-at-risk period - (requireTimeAtRisk = T). -\item - study population 2: does not impose the restriction that persons who - do not experience the outcome need to be observed for the full - time-at-risk period (requireTimeAtRisk = F). 
-\item - study population 3: does impose the restriction that persons who do - not experience the outcome need to be observed for the full - time-at-risk period (requireTimeAtRisk = T) and allows persons that - had the outcome before (removeSubjectsWithPriorOutcome = F) -\end{itemize} - -The create a study population setting list we use the function -\texttt{createStudyPopulationSettings} as described below: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# define all study population settings} -\NormalTok{studyPop1 <-}\StringTok{ }\KeywordTok{createStudyPopulationSettings}\NormalTok{(}\DataTypeTok{binary =}\NormalTok{ T,} - \DataTypeTok{includeAllOutcomes =}\NormalTok{ F,} - \DataTypeTok{removeSubjectsWithPriorOutcome =}\NormalTok{ T,} - \DataTypeTok{priorOutcomeLookback =} \DecValTok{99999}\NormalTok{,} - \DataTypeTok{requireTimeAtRisk =}\NormalTok{ T,} - \DataTypeTok{minTimeAtRisk=}\DecValTok{364}\NormalTok{,} - \DataTypeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,} - \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"INFO"}\NormalTok{)} - -\NormalTok{studyPop2 <-}\StringTok{ }\KeywordTok{createStudyPopulationSettings}\NormalTok{(}\DataTypeTok{binary =}\NormalTok{ T,} - \DataTypeTok{includeAllOutcomes =}\NormalTok{ F,} - \DataTypeTok{removeSubjectsWithPriorOutcome =}\NormalTok{ T,} - \DataTypeTok{priorOutcomeLookback =} \DecValTok{99999}\NormalTok{,} - \DataTypeTok{requireTimeAtRisk =}\NormalTok{ F,} - \DataTypeTok{minTimeAtRisk=}\DecValTok{364}\NormalTok{,} - \DataTypeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,} - \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"INFO"}\NormalTok{)} - -\NormalTok{studyPop3 <-}\StringTok{ }\KeywordTok{createStudyPopulationSettings}\NormalTok{(}\DataTypeTok{binary =}\NormalTok{ T,} - \DataTypeTok{includeAllOutcomes =}\NormalTok{ F,} - \DataTypeTok{removeSubjectsWithPriorOutcome =}\NormalTok{ F,} - 
\DataTypeTok{priorOutcomeLookback =} \DecValTok{99999}\NormalTok{,} - \DataTypeTok{requireTimeAtRisk =}\NormalTok{ T,} - \DataTypeTok{minTimeAtRisk=}\DecValTok{364}\NormalTok{,} - \DataTypeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,} - \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"INFO"}\NormalTok{)} - -\CommentTok{# combine these in a population setting list} -\NormalTok{populationSettingList <-}\StringTok{ }\KeywordTok{list}\NormalTok{(studyPop1,studyPop2,studyPop3)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{covariate-settings}{% -\subsection{Covariate settings}\label{covariate-settings}} - -The covariate settings are created using -\texttt{createCovariateSettings}. We can create multiple covariate -settings and then combine them in a list: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{covSet1 <-}\StringTok{ }\KeywordTok{createCovariateSettings}\NormalTok{(}\DataTypeTok{useDemographicsGender =}\NormalTok{ T, } - \DataTypeTok{useDemographicsAgeGroup =}\NormalTok{ T, } - \DataTypeTok{useConditionGroupEraAnyTimePrior =}\NormalTok{ T,} - \DataTypeTok{useDrugGroupEraAnyTimePrior =}\NormalTok{ T)} - -\NormalTok{covSet2 <-}\StringTok{ }\KeywordTok{createCovariateSettings}\NormalTok{(}\DataTypeTok{useDemographicsGender =}\NormalTok{ T, } - \DataTypeTok{useDemographicsAgeGroup =}\NormalTok{ T, } - \DataTypeTok{useConditionGroupEraAnyTimePrior =}\NormalTok{ T,} - \DataTypeTok{useDrugGroupEraAnyTimePrior =}\NormalTok{ F)} - -\NormalTok{covariateSettingList <-}\StringTok{ }\KeywordTok{list}\NormalTok{(covSet1, covSet2)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{algorithm-settings}{% -\subsection{Algorithm settings}\label{algorithm-settings}} - -The model settings requires running the setModel functions for the -machine learning algorithms of interest and specifying the -hyper-parameter search and then combining these into a list. 
For -example, if we wanted to try a logistic regression, gradient boosting -machine and ada boost model then: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{gbm <-}\StringTok{ }\KeywordTok{setGradientBoostingMachine}\NormalTok{()} -\NormalTok{lr <-}\StringTok{ }\KeywordTok{setLassoLogisticRegression}\NormalTok{()} -\NormalTok{ada <-}\StringTok{ }\KeywordTok{setAdaBoost}\NormalTok{()} - -\NormalTok{modelList <-}\StringTok{ }\KeywordTok{list}\NormalTok{(gbm, lr, ada)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{model-analysis-list}{% -\subsection{Model analysis list}\label{model-analysis-list}} - -To create the complete plp model settings use -\texttt{createPlpModelSettings} to combine the population, covariate and -model settings. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{modelAnalysisList <-}\StringTok{ }\KeywordTok{createPlpModelSettings}\NormalTok{(}\DataTypeTok{modelList =}\NormalTok{ modelList, } - \DataTypeTok{covariateSettingList =}\NormalTok{ covariateSettingList,} - \DataTypeTok{populationSettingList =}\NormalTok{ populationSettingList)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{running-multiple-models}{% -\section{Running multiple models}\label{running-multiple-models}} - -As we will be downloading loads of data in the multiple plp analysis it -is useful to set the Andromeda temp folder to a directory with write -access and plenty of space. 
-\texttt{options(andromedaTempFolder\ =\ "c:/andromedaTemp")} - -To run the study requires setting up a connectionDetails object - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{dbms <-}\StringTok{ "your dbms"} -\NormalTok{user <-}\StringTok{ "your username"} -\NormalTok{pw <-}\StringTok{ "your password"} -\NormalTok{server <-}\StringTok{ "your server"} -\NormalTok{port <-}\StringTok{ "your port"} - -\NormalTok{connectionDetails <-}\StringTok{ }\NormalTok{DatabaseConnector}\OperatorTok{::}\KeywordTok{createConnectionDetails}\NormalTok{(}\DataTypeTok{dbms =}\NormalTok{ dbms,} - \DataTypeTok{server =}\NormalTok{ server,} - \DataTypeTok{user =}\NormalTok{ user,} - \DataTypeTok{password =}\NormalTok{ pw,} - \DataTypeTok{port =}\NormalTok{ port)} -\end{Highlighting} -\end{Shaded} - -Next you need to specify the cdmDatabaseSchema where your cdm database -is found and workDatabaseSchema where your target population and outcome -cohorts are and you need to specify a label for the database name: a -string with a shareable name of the database (this will be shown to -OHDSI researchers if the results get transported). 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{cdmDatabaseSchema <-}\StringTok{ "your cdmDatabaseSchema"} -\NormalTok{workDatabaseSchema <-}\StringTok{ "your workDatabaseSchema"} -\NormalTok{cdmDatabaseName <-}\StringTok{ "your cdmDatabaseName"} -\end{Highlighting} -\end{Shaded} - -Now you can run the multiple patient-level prediction analysis by -specifying the target cohort ids and outcome ids - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{allresults <-}\StringTok{ }\KeywordTok{runPlpAnalyses}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} - \DataTypeTok{cdmDatabaseName =}\NormalTok{ cdmDatabaseName,} - \DataTypeTok{oracleTempSchema =}\NormalTok{ cdmDatabaseSchema,} - \DataTypeTok{cohortDatabaseSchema =}\NormalTok{ workDatabaseSchema,} - \DataTypeTok{cohortTable =} \StringTok{"your cohort table"}\NormalTok{,} - \DataTypeTok{outcomeDatabaseSchema =}\NormalTok{ workDatabaseSchema,} - \DataTypeTok{outcomeTable =} \StringTok{"your cohort table"}\NormalTok{,} - \DataTypeTok{cdmVersion =} \DecValTok{5}\NormalTok{,} - \DataTypeTok{outputFolder =} \StringTok{"./PlpMultiOutput"}\NormalTok{,} - \DataTypeTok{modelAnalysisList =}\NormalTok{ modelAnalysisList,} - \DataTypeTok{cohortIds =} \KeywordTok{c}\NormalTok{(}\DecValTok{2484}\NormalTok{,}\DecValTok{6970}\NormalTok{),} - \DataTypeTok{cohortNames =} \KeywordTok{c}\NormalTok{(}\StringTok{'visit 2010'}\NormalTok{,}\StringTok{'test cohort'}\NormalTok{),} - \DataTypeTok{outcomeIds =} \KeywordTok{c}\NormalTok{(}\DecValTok{7331}\NormalTok{,}\DecValTok{5287}\NormalTok{),} - \DataTypeTok{outcomeNames =} \KeywordTok{c}\NormalTok{(}\StringTok{'outcome 1'}\NormalTok{,}\StringTok{'outcome 2'}\NormalTok{),} - \DataTypeTok{maxSampleSize =} \OtherTok{NULL}\NormalTok{,} - \DataTypeTok{minCovariateFraction =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{normalizeData =}\NormalTok{ T,} - \DataTypeTok{testSplit =} 
\StringTok{"stratified"}\NormalTok{,} - \DataTypeTok{testFraction =} \FloatTok{0.25}\NormalTok{,} - \DataTypeTok{splitSeed =} \OtherTok{NULL}\NormalTok{,} - \DataTypeTok{nfold =} \DecValTok{3}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"INFO"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -This will then save all the plpData objects from the study into -``./PlpMultiOutput/plpData'', the populations for the analysis into -``./PlpMultiOutput/population'' and the results into -``./PlpMultiOutput/Result''. The csv named settings.csv found in -``./PlpMultiOutput'' has a row for each prediction model developed and -points to the plpData and population used for the model development, it -also has descriptions of the cohorts and settings if these are input by -the user. - -Note that if for some reason the run is interrupted, e.g.~because of an -error, a new call to \texttt{RunPlpAnalyses} will continue and not -restart until you remove the output folder. - -\hypertarget{validating-multiple-models}{% -\section{Validating multiple models}\label{validating-multiple-models}} - -If you have access to multiple databases on the same server in different -schemas you could evaluate accross these using this call: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{val <-}\StringTok{ }\KeywordTok{evaluateMultiplePlp}\NormalTok{(}\DataTypeTok{analysesLocation =} \StringTok{"./PlpMultiOutput"}\NormalTok{,} - \DataTypeTok{outputLocation =} \StringTok{"./PlpMultiOutput/validation"}\NormalTok{,} - \DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails, } - \DataTypeTok{validationSchemaTarget =} \KeywordTok{list}\NormalTok{(}\StringTok{'new_database_1.dbo'}\NormalTok{,} - \StringTok{'new_database_2.dbo'}\NormalTok{),} - \DataTypeTok{validationSchemaOutcome =} \KeywordTok{list}\NormalTok{(}\StringTok{'new_database_1.dbo'}\NormalTok{,} - \StringTok{'new_database_2.dbo'}\NormalTok{),} - \DataTypeTok{validationSchemaCdm =} 
\KeywordTok{list}\NormalTok{(}\StringTok{'new_database_1.dbo'}\NormalTok{,} - \StringTok{'new_database_2.dbo'}\NormalTok{), } - \DataTypeTok{databaseNames =} \KeywordTok{c}\NormalTok{(}\StringTok{'database1'}\NormalTok{,}\StringTok{'database2'}\NormalTok{),} - \DataTypeTok{validationTableTarget =} \StringTok{'your new cohort table'}\NormalTok{,} - \DataTypeTok{validationTableOutcome =} \StringTok{'your new cohort table'}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -This then saves the external validation results in the validation folder -of the main study (the outputLocation you used in runPlpAnalyses). - -\hypertarget{viewing-the-results}{% -\section{Viewing the results}\label{viewing-the-results}} - -To view the results for the multiple prediction analysis: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{viewMultiplePlp}\NormalTok{(}\DataTypeTok{analysesLocation=}\StringTok{"./PlpMultiOutput"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -If the validation directory in ``./PlpMultiOutput'' has results, the -external validation will also be displayed. - -\hypertarget{acknowledgments}{% -\section{Acknowledgments}\label{acknowledgments}} - -Considerable work has been dedicated to provide the -\texttt{PatientLevelPrediction} package. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## -## To cite PatientLevelPrediction in publications use: -## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and -## implementation of a standardized framework to generate and evaluate patient-level -## prediction models using observational healthcare data." _Journal of the American -## Medical Informatics Association_, *25*(8), 969-975. . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. 
Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. -2018;25(8):969-975.} - -\end{document} diff --git a/inst/doc/BuildingPredictiveModels.tex b/inst/doc/BuildingPredictiveModels.tex index 8a29c5af4..ae3ce9ba2 100644 --- a/inst/doc/BuildingPredictiveModels.tex +++ b/inst/doc/BuildingPredictiveModels.tex @@ -4,18 +4,21 @@ % \documentclass[ ]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex +\usepackage{amsmath,amssymb} +\usepackage{iftex} +\ifPDFTeX \usepackage[T1]{fontenc} \usepackage[utf8]{inputenc} \usepackage{textcomp} % provide euro and other symbols \else % if luatex or xetex - \usepackage{unicode-math} + \usepackage{unicode-math} % this also loads fontspec \defaultfontfeatures{Scale=MatchLowercase} \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} \fi +\usepackage{lmodern} +\ifPDFTeX\else + % xetex/luatex font selection +\fi % Use upquote if available, for straight quotes in verbatim environments \IfFileExists{upquote.sty}{\usepackage{upquote}}{} \IfFileExists{microtype.sty}{% use microtype if available @@ -33,14 +36,6 @@ \KOMAoptions{parskip=half}} \makeatother \usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available 
-\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Building patient-level predictive models}, - pdfauthor={Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs \usepackage[margin=1in]{geometry} \usepackage{color} \usepackage{fancyvrb} @@ -53,13 +48,13 @@ \newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} \newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} \newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} +\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} \newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} \newcommand{\BuiltInTok}[1]{#1} \newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} \newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} \newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} +\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} \newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} \newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} \newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} @@ -67,7 +62,7 @@ \newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} \newcommand{\ExtensionTok}[1]{#1} \newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} +\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} \newcommand{\ImportTok}[1]{#1} \newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} \newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} @@ -76,13 +71,14 @@ 
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} \newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} \newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} +\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} \newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} \newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} \newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} \newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} \newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{longtable,booktabs} +\usepackage{longtable,booktabs,array} +\usepackage{calc} % for calculating minipage widths % Correct order of tables after \paragraph or \subparagraph \usepackage{etoolbox} \makeatletter @@ -91,7 +87,7 @@ % Allow footnotes in longtable head/foot \IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}} \makesavenoteenv{longtable} -\usepackage{graphicx,grffile} +\usepackage{graphicx} \makeatletter \def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} \def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} @@ -112,14 +108,26 @@ \pagestyle{fancy} \fancyhead{} \fancyhead[CO,CE]{Installation Guide} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 3.1.0} +\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 6.3.7.9999} \fancyfoot[LE,RO]{\thepage} \renewcommand{\headrulewidth}{0.4pt} \renewcommand{\footrulewidth}{0.4pt} +\ifLuaTeX + \usepackage{selnolig} % disable illegal ligatures +\fi +\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} +\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available +\urlstyle{same} +\hypersetup{ + pdftitle={Building patient-level predictive models}, + pdfauthor={Jenna Reps, Martijn J. 
Schuemie, Patrick B. Ryan, Peter R. Rijnbeek}, + hidelinks, + pdfcreator={LaTeX via pandoc}} \title{Building patient-level predictive models} -\author{Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek} -\date{2020-06-03} +\author{Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. +Rijnbeek} +\date{2024-04-26} \begin{document} \maketitle @@ -178,7 +186,7 @@ \section{Introduction}\label{introduction}} \begin{figure} \centering -\includegraphics{Figure1.png} +\includegraphics{Figure1.webp} \caption{The prediction problem} \end{figure} @@ -192,13 +200,13 @@ \section{Introduction}\label{introduction}} \begin{figure} \centering -\includegraphics{studydesign.png} +\includegraphics{studydesign.webp} \caption{Design choices} \end{figure} \begin{figure} \centering -\includegraphics{problems.png} +\includegraphics{problems.webp} \caption{Examples of prediction problems} \end{figure} @@ -208,7 +216,7 @@ \section{Introduction}\label{introduction}} and model evaluation using data from databases that are translated into the OMOP CDM. In this vignette we assume you have installed the package correctly using the -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/InstallationGuide.pdf}{\texttt{InstallationGuide}}. +\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/InstallationGuide.pdf}{\texttt{InstallationGuide}}. \hypertarget{study-specification}{% \section{Study specification}\label{study-specification}} @@ -374,157 +382,112 @@ \subsection{Model development lunch). In our package we therefore aim to implement many algorithms. Furthermore, we made the system modular so you can add your own custom algorithms as described in more detail in the -\href{Link\%20to\%20be\%20added}{\texttt{AddingCustomAlgorithms}} +\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomModels.pdf}{\texttt{AddingCustomModels}} vignette. 
Our package currently contains the following algorithms to choose from: -\begin{longtable}[]{@{}lll@{}} -\toprule -\begin{minipage}[b]{0.11\columnwidth}\raggedright -Algorihm\strut -\end{minipage} & \begin{minipage}[b]{0.55\columnwidth}\raggedright -Description\strut -\end{minipage} & \begin{minipage}[b]{0.25\columnwidth}\raggedright -Hyper-parameters\strut -\end{minipage}\tabularnewline -\midrule +\begin{longtable}[]{@{} + >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.1190}} + >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.6071}} + >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2738}}@{}} +\toprule\noalign{} +\begin{minipage}[b]{\linewidth}\raggedright +Algorihm +\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright +Description +\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright +Hyper-parameters +\end{minipage} \\ +\midrule\noalign{} \endhead -\begin{minipage}[t]{0.11\columnwidth}\raggedright -Regularized Logistic Regression\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -Lasso logistic regression belongs to the family of generalized linear -models, where a linear combination of the variables is learned and -finally a logistic function maps the linear combination to a value -between 0 and 1. The lasso regularization adds a cost based on model -complexity to the objective function when training the model. This cost -is the sum of the absolute values of the linear combination of the -coefficients. The model automatically performs feature selection by -minimizing this cost. 
We use the Cyclic coordinate descent for logistic, -Poisson and survival analysis (Cyclops) package to perform large-scale -regularized logistic regression: -\url{https://github.com/OHDSI/Cyclops}\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -var (starting variance), seed\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -Gradient boosting machines\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -Gradient boosting machines is a boosting ensemble technique and in our -framework it combines multiple decision trees. Boosting works by -iteratively adding decision trees but adds more weight to the -data-points that are misclassified by prior decision trees in the cost -function when training the next tree. We use Extreme Gradient Boosting, -which is an efficient implementation of the gradient boosting framework -implemented in the xgboost R package available from CRAN.\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -ntree (number of trees), max depth (max levels in tree), min rows -(minimum data points in in node), learning rate, balance (balance class -labels), seed\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -Random forest\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -Random forest is a bagging ensemble technique that combines multiple -decision trees. The idea behind bagging is to reduce the likelihood of -overfitting, by using weak classifiers, but combining multiple diverse -weak classifiers into a strong classifier. Random forest accomplishes -this by training multiple decision trees but only using a subset of the -variables in each tree and the subset of variables differ between trees. 
-Our packages uses the sklearn learn implementation of Random Forest in -python.\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -mtry (number of features in each tree),ntree (number of trees), maxDepth -(max levels in tree), minRows (minimum data points in in node),balance -(balance class labels), seed\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -K-nearest neighbors\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -K-nearest neighbors (KNN) is an algorithm that uses some metric to find -the K closest labelled data-points, given the specified metric, to a new -unlabelled data-point. The prediction of the new data-points is then the -most prevalent class of the K-nearest labelled data-points. There is a -sharing limitation of KNN, as the model requires labelled data to -perform the prediction on new data, and it is often not possible to -share this data across data sites.We included the BigKnn classifier -developed in OHDSI which is a large scale k-nearest neighbor classifier -using the Lucene search engine: -\url{https://github.com/OHDSI/BigKnn}\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -k (number of neighbours),weighted (weight by inverse frequency)\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -Naive Bayes\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -The Naive Bayes algorithm applies the Bayes theorem with the `naive' -assumption of conditional independence between every pair of features -given the value of the class variable. 
Based on the likelihood the data -belongs to a class and the prior distribution of the class, a posterior -distribution is obtained.\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -none\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -AdaBoost\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -AdaBoost is a boosting ensemble technique. Boosting works by iteratively -adding classifiers but adds more weight to the data-points that are -misclassified by prior classifiers in the cost function when training -the next classifier. We use the sklearn `AdaboostClassifier' -implementation in Python.\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -nEstimators (the maximum number of estimators at which boosting is -terminated), learningRate (learning rate shrinks the contribution of -each classifier by learning\_rate. There is a trade-off between -learningRate and nEstimators)\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -Decision Tree\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -A decision tree is a classifier that partitions the variable space using -individual tests selected using a greedy approach. It aims to find -partitions that have the highest information gain to separate the -classes. The decision tree can easily overfit by enabling a large number -of partitions (tree depth) and often needs some regularization (e.g., -pruning or specifying hyper-parameters that limit the complexity of the -model). We use the sklearn `DecisionTreeClassifier' implementation in -Python.\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -maxDepth (the maximum depth of the tree), -minSamplesSplit,minSamplesLeaf, minImpuritySplit (threshold for early -stopping in tree growth. 
A node will split if its impurity is above the -threshold, otherwise it is a leaf.), seed,classWeight (`Balance' or -`None')\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -Multilayer Perception\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -Neural networks contain multiple layers that weight their inputs using a -non-linear function. The first layer is the input layer, the last layer -is the output layer the between are the hidden layers. Neural networks -are generally trained using feed forward back-propagation. This is when -you go through the network with a data-point and calculate the error -between the true label and predicted label, then go backwards through -the network and update the linear function weights based on the error. -This can also be performed as a batch, where multiple data-points are -fee\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -size (the number of hidden nodes), alpha (the l2 regularisation), -seed\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.11\columnwidth}\raggedright -Deep Learning\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright +\bottomrule\noalign{} +\endlastfoot +Regularized Logistic Regression & Lasso logistic regression belongs to +the family of generalized linear models, where a linear combination of +the variables is learned and finally a logistic function maps the linear +combination to a value between 0 and 1. The lasso regularization adds a +cost based on model complexity to the objective function when training +the model. This cost is the sum of the absolute values of the linear +combination of the coefficients. The model automatically performs +feature selection by minimizing this cost. 
We use the Cyclic coordinate +descent for logistic, Poisson and survival analysis (Cyclops) package to +perform large-scale regularized logistic regression: +\url{https://github.com/OHDSI/Cyclops} & var (starting variance), +seed \\ +Gradient boosting machines & Gradient boosting machines is a boosting +ensemble technique and in our framework it combines multiple decision +trees. Boosting works by iteratively adding decision trees but adds more +weight to the data-points that are misclassified by prior decision trees +in the cost function when training the next tree. We use Extreme +Gradient Boosting, which is an efficient implementation of the gradient +boosting framework implemented in the xgboost R package available from +CRAN. & ntree (number of trees), max depth (max levels in tree), min +rows (minimum data points in in node), learning rate, balance (balance +class labels), seed \\ +Random forest & Random forest is a bagging ensemble technique that +combines multiple decision trees. The idea behind bagging is to reduce +the likelihood of overfitting, by using weak classifiers, but combining +multiple diverse weak classifiers into a strong classifier. Random +forest accomplishes this by training multiple decision trees but only +using a subset of the variables in each tree and the subset of variables +differ between trees. Our packages uses the sklearn learn implementation +of Random Forest in python. & mtry (number of features in each +tree),ntree (number of trees), maxDepth (max levels in tree), minRows +(minimum data points in in node),balance (balance class labels), seed \\ +K-nearest neighbors & K-nearest neighbors (KNN) is an algorithm that +uses some metric to find the K closest labelled data-points, given the +specified metric, to a new unlabelled data-point. The prediction of the +new data-points is then the most prevalent class of the K-nearest +labelled data-points. 
There is a sharing limitation of KNN, as the model +requires labelled data to perform the prediction on new data, and it is +often not possible to share this data across data sites.We included the +BigKnn classifier developed in OHDSI which is a large scale k-nearest +neighbor classifier using the Lucene search engine: +\url{https://github.com/OHDSI/BigKnn} & k (number of +neighbours),weighted (weight by inverse frequency) \\ +Naive Bayes & The Naive Bayes algorithm applies the Bayes theorem with +the `naive' assumption of conditional independence between every pair of +features given the value of the class variable. Based on the likelihood +the data belongs to a class and the prior distribution of the class, a +posterior distribution is obtained. & none \\ +AdaBoost & AdaBoost is a boosting ensemble technique. Boosting works by +iteratively adding classifiers but adds more weight to the data-points +that are misclassified by prior classifiers in the cost function when +training the next classifier. We use the sklearn `AdaboostClassifier' +implementation in Python. & nEstimators (the maximum number of +estimators at which boosting is terminated), learningRate (learning rate +shrinks the contribution of each classifier by learning\_rate. There is +a trade-off between learningRate and nEstimators) \\ +Decision Tree & A decision tree is a classifier that partitions the +variable space using individual tests selected using a greedy approach. +It aims to find partitions that have the highest information gain to +separate the classes. The decision tree can easily overfit by enabling a +large number of partitions (tree depth) and often needs some +regularization (e.g., pruning or specifying hyper-parameters that limit +the complexity of the model). We use the sklearn +`DecisionTreeClassifier' implementation in Python. & maxDepth (the +maximum depth of the tree), minSamplesSplit,minSamplesLeaf, +minImpuritySplit (threshold for early stopping in tree growth. 
A node +will split if its impurity is above the threshold, otherwise it is a +leaf.), seed,classWeight (`Balance' or `None') \\ +Multilayer Perception & Neural networks contain multiple layers that +weight their inputs using a non-linear function. The first layer is the +input layer, the last layer is the output layer the between are the +hidden layers. Neural networks are generally trained using feed forward +back-propagation. This is when you go through the network with a +data-point and calculate the error between the true label and predicted +label, then go backwards through the network and update the linear +function weights based on the error. This can also be performed as a +batch, where multiple data-points are fee & size (the number of hidden +nodes), alpha (the l2 regularisation), seed \\ +Deep Learning (now in seperate DeepPatientLevelPrediction R package) & Deep learning such as deep nets, convolutional neural networks or recurrent neural networks are similar to a neural network but have multiple hidden layers that aim to learn latent representations useful for prediction. In the seperate BuildingDeepLearningModels vignette we -describe these models and hyper-parameters in more detail\strut -\end{minipage} & \begin{minipage}[t]{0.25\columnwidth}\raggedright -see BuildingDeepLearningModels vignette\strut -\end{minipage}\tabularnewline -\bottomrule +describe these models and hyper-parameters in more detail & see +OHDSI/DeepPatientLevelPrediction \\ \end{longtable} Furthermore, we have to decide on the \textbf{covariates} that we will @@ -578,113 +541,44 @@ \subsection{Study Specification}\label{study-specification-1}} Logistic Regression and will use the default parameters. We will do a 75\%-25\% split by person. 
-\begin{longtable}[]{@{}ll@{}} -\toprule -\begin{minipage}[b]{0.42\columnwidth}\raggedright -Definition\strut -\end{minipage} & \begin{minipage}[b]{0.52\columnwidth}\raggedright -Value\strut -\end{minipage}\tabularnewline -\midrule +\begin{longtable}[]{@{} + >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.2361}} + >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.7639}}@{}} +\toprule\noalign{} +\begin{minipage}[b]{\linewidth}\raggedright +Definition +\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright +Value +\end{minipage} \\ +\midrule\noalign{} \endhead -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\textbf{Problem Definition}\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Target Cohort (T)\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -`Patients who are newly diagnosed with Atrial Fibrillation' defined as -the first condition record of cardiac arrhythmia, which is followed by -another cardiac arrhythmia condition record, at least two drug records -for a drug used to treat arrhythmias, or a procedure to treat -arrhythmias.\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Outcome Cohort (O)\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -`Ischemic stroke events' defined as ischemic stroke condition records -during an inpatient or ER visit; successive records with \textgreater{} -180 day gap are considered independent episodes.\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Time-at-risk (TAR)\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -1 day till 365 days from cohort start\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\strut -\end{minipage} & 
\begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\textbf{Population Definition}\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Washout Period\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -1095\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Enter the target cohort multiple times?\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -No\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Allow prior outcomes?\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -Yes\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Start of time-at-risk\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -1 day\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -End of time-at-risk\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -365 days\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Require a minimum amount of time-at-risk?\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -Yes (364 days)\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\textbf{Model Development}\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Algorithm\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -Regularized Logistic 
Regression\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Hyper-parameters\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -variance = 0.01 (Default)\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Covariates\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -Gender, Age, Conditions (ever before, \textless365), Drugs Groups (ever -before, \textless365), and Visit Count\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Data split\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -75\% train, 25\% test. Randomly assigned by person\strut -\end{minipage}\tabularnewline -\bottomrule +\bottomrule\noalign{} +\endlastfoot +\textbf{Problem Definition} & \\ +Target Cohort (T) & `Patients who are newly diagnosed with Atrial +Fibrillation' defined as the first condition record of cardiac +arrhythmia, which is followed by another cardiac arrhythmia condition +record, at least two drug records for a drug used to treat arrhythmias, +or a procedure to treat arrhythmias. \\ +Outcome Cohort (O) & `Ischemic stroke events' defined as ischemic stroke +condition records during an inpatient or ER visit; successive records +with \textgreater{} 180 day gap are considered independent episodes. \\ +Time-at-risk (TAR) & 1 day till 365 days from cohort start \\ +& \\ +\textbf{Population Definition} & \\ +Washout Period & 1095 \\ +Enter the target cohort multiple times? & No \\ +Allow prior outcomes? & Yes \\ +Start of time-at-risk & 1 day \\ +End of time-at-risk & 365 days \\ +Require a minimum amount of time-at-risk? 
& Yes (364 days) \\ +& \\ +\textbf{Model Development} & \\ +Algorithm & Regularized Logistic Regression \\ +Hyper-parameters & variance = 0.01 (Default) \\ +Covariates & Gender, Age, Conditions (ever before, \textless365), Drugs +Groups (ever before, \textless365), and Visit Count \\ +Data split & 75\% train, 25\% test. Randomly assigned by person \\ \end{longtable} According to the best practices we need to make a protocol that @@ -748,7 +642,7 @@ \subsubsection{ATLAS cohort builder}\label{atlas-cohort-builder}} \begin{figure} \centering -\includegraphics{example1/ATLAS_T.png} +\includegraphics{example1/ATLAS_T.webp} \caption{Target Cohort Atrial Fibrillation} \end{figure} @@ -765,7 +659,7 @@ \subsubsection{ATLAS cohort builder}\label{atlas-cohort-builder}} \begin{figure} \centering -\includegraphics{example1/ATLAS_O.png} +\includegraphics{example1/ATLAS_O.webp} \caption{Outcome Cohort Stroke} \end{figure} @@ -809,94 +703,94 @@ \subsubsection{Custom cohorts}\label{custom-cohorts}} \CommentTok{Create a table to store the persons in the T and C cohort} \CommentTok{*/} -\ControlFlowTok{IF}\NormalTok{ OBJECT_ID(}\StringTok{'@resultsDatabaseSchema.PLPAFibStrokeCohort'}\NormalTok{, }\StringTok{'U'}\NormalTok{) }\KeywordTok{IS} \KeywordTok{NOT} \KeywordTok{NULL} +\ControlFlowTok{IF}\NormalTok{ OBJECT\_ID(}\StringTok{\textquotesingle{}@resultsDatabaseSchema.PLPAFibStrokeCohort\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}U\textquotesingle{}}\NormalTok{) }\KeywordTok{IS} \KeywordTok{NOT} \KeywordTok{NULL} \KeywordTok{DROP} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.PLPAFibStrokeCohort;} \KeywordTok{CREATE} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.PLPAFibStrokeCohort } \NormalTok{( } -\NormalTok{cohort_definition_id }\DataTypeTok{INT}\NormalTok{, } -\NormalTok{subject_id BIGINT,} -\NormalTok{cohort_start_date }\DataTypeTok{DATE}\NormalTok{, } -\NormalTok{cohort_end_date }\DataTypeTok{DATE} +\NormalTok{cohort\_definition\_id 
}\DataTypeTok{INT}\NormalTok{, } +\NormalTok{subject\_id BIGINT,} +\NormalTok{cohort\_start\_date }\DataTypeTok{DATE}\NormalTok{, } +\NormalTok{cohort\_end\_date }\DataTypeTok{DATE} \NormalTok{);} \CommentTok{/*} \CommentTok{T cohort: [PatientLevelPrediction vignette]: T : patients who are newly } \CommentTok{diagnosed with Atrial fibrillation} -\CommentTok{- persons with a condition occurrence record of 'Atrial fibrillation' or } +\CommentTok{{-} persons with a condition occurrence record of \textquotesingle{}Atrial fibrillation\textquotesingle{} or } \CommentTok{any descendants, indexed at the first diagnosis} -\CommentTok{- who have >1095 days of prior observation before their first diagnosis} -\CommentTok{- and have no warfarin exposure any time prior to first AFib diagnosis} +\CommentTok{{-} who have \textgreater{}1095 days of prior observation before their first diagnosis} +\CommentTok{{-} and have no warfarin exposure any time prior to first AFib diagnosis} \CommentTok{*/} -\KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AFibStrokeCohort (cohort_definition_id, } -\NormalTok{subject_id, } -\NormalTok{cohort_start_date, } -\NormalTok{cohort_end_date)} -\KeywordTok{SELECT} \DecValTok{1} \KeywordTok{AS}\NormalTok{ cohort_definition_id,} -\NormalTok{AFib.person_id }\KeywordTok{AS}\NormalTok{ subject_id,} -\NormalTok{AFib.condition_start_date }\KeywordTok{AS}\NormalTok{ cohort_start_date,} -\NormalTok{observation_period.observation_period_end_date }\KeywordTok{AS}\NormalTok{ cohort_end_date} +\KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AFibStrokeCohort (cohort\_definition\_id, } +\NormalTok{subject\_id, } +\NormalTok{cohort\_start\_date, } +\NormalTok{cohort\_end\_date)} +\KeywordTok{SELECT} \DecValTok{1} \KeywordTok{AS}\NormalTok{ cohort\_definition\_id,} +\NormalTok{AFib.person\_id }\KeywordTok{AS}\NormalTok{ subject\_id,} +\NormalTok{AFib.condition\_start\_date }\KeywordTok{AS}\NormalTok{ cohort\_start\_date,} 
+\NormalTok{observation\_period.observation\_period\_end\_date }\KeywordTok{AS}\NormalTok{ cohort\_end\_date} \KeywordTok{FROM} \NormalTok{(} - \KeywordTok{SELECT}\NormalTok{ person_id, }\FunctionTok{min}\NormalTok{(condition_start_date) }\KeywordTok{as}\NormalTok{ condition_start_date} - \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.condition_occurrence} - \KeywordTok{WHERE}\NormalTok{ condition_concept_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ descendant_concept_id }\KeywordTok{FROM} -\NormalTok{ @cdmDatabaseSchema.concept_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor_concept_id }\KeywordTok{IN} + \KeywordTok{SELECT}\NormalTok{ person\_id, }\FunctionTok{min}\NormalTok{(condition\_start\_date) }\KeywordTok{as}\NormalTok{ condition\_start\_date} + \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.condition\_occurrence} + \KeywordTok{WHERE}\NormalTok{ condition\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ descendant\_concept\_id }\KeywordTok{FROM} +\NormalTok{ @cdmDatabaseSchema.concept\_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor\_concept\_id }\KeywordTok{IN} \NormalTok{ (}\DecValTok{313217} \CommentTok{/*atrial fibrillation*/}\NormalTok{))} - \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ person_id} + \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ person\_id} \NormalTok{) AFib} - \KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ @cdmDatabaseSchema.observation_period} - \KeywordTok{ON}\NormalTok{ AFib.person_id }\OperatorTok{=}\NormalTok{ observation_period.person_id} - \KeywordTok{AND}\NormalTok{ AFib.condition_start_date }\OperatorTok{>=}\NormalTok{ dateadd(dd,}\DecValTok{1095}\NormalTok{, } -\NormalTok{ observation_period.observation_period_start_date)} - \KeywordTok{AND}\NormalTok{ AFib.condition_start_date }\OperatorTok{<=}\NormalTok{ observation_period.observation_period_end_date} + \KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ @cdmDatabaseSchema.observation\_period} + \KeywordTok{ON}\NormalTok{ 
AFib.person\_id }\OperatorTok{=}\NormalTok{ observation\_period.person\_id} + \KeywordTok{AND}\NormalTok{ AFib.condition\_start\_date }\OperatorTok{\textgreater{}=}\NormalTok{ dateadd(dd,}\DecValTok{1095}\NormalTok{, } +\NormalTok{ observation\_period.observation\_period\_start\_date)} + \KeywordTok{AND}\NormalTok{ AFib.condition\_start\_date }\OperatorTok{\textless{}=}\NormalTok{ observation\_period.observation\_period\_end\_date} \KeywordTok{LEFT} \KeywordTok{JOIN} \NormalTok{ (} - \KeywordTok{SELECT}\NormalTok{ person_id, }\FunctionTok{min}\NormalTok{(drug_exposure_start_date) }\KeywordTok{as}\NormalTok{ drug_exposure_start_date} - \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.drug_exposure} - \KeywordTok{WHERE}\NormalTok{ drug_concept_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ descendant_concept_id }\KeywordTok{FROM} -\NormalTok{ @cdmDatabaseSchema.concept_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor_concept_id }\KeywordTok{IN} + \KeywordTok{SELECT}\NormalTok{ person\_id, }\FunctionTok{min}\NormalTok{(drug\_exposure\_start\_date) }\KeywordTok{as}\NormalTok{ drug\_exposure\_start\_date} + \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.drug\_exposure} + \KeywordTok{WHERE}\NormalTok{ drug\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ descendant\_concept\_id }\KeywordTok{FROM} +\NormalTok{ @cdmDatabaseSchema.concept\_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor\_concept\_id }\KeywordTok{IN} \NormalTok{ (}\DecValTok{1310149} \CommentTok{/*warfarin*/}\NormalTok{))} - \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ person_id} + \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ person\_id} \NormalTok{ ) warfarin} - \KeywordTok{ON}\NormalTok{ Afib.person_id }\OperatorTok{=}\NormalTok{ warfarin.person_id} - \KeywordTok{AND}\NormalTok{ Afib.condition_start_date }\OperatorTok{>}\NormalTok{ warfarin.drug_exposure_start_date} - \KeywordTok{WHERE}\NormalTok{ warfarin.person_id }\KeywordTok{IS} \KeywordTok{NULL} + 
\KeywordTok{ON}\NormalTok{ Afib.person\_id }\OperatorTok{=}\NormalTok{ warfarin.person\_id} + \KeywordTok{AND}\NormalTok{ Afib.condition\_start\_date }\OperatorTok{\textgreater{}}\NormalTok{ warfarin.drug\_exposure\_start\_date} + \KeywordTok{WHERE}\NormalTok{ warfarin.person\_id }\KeywordTok{IS} \KeywordTok{NULL} \NormalTok{ ;} \CommentTok{/*} \CommentTok{ C cohort: [PatientLevelPrediction vignette]: O: Ischemic stroke events} -\CommentTok{ - inpatient visits that include a condition occurrence record for } -\CommentTok{ 'cerebral infarction' and descendants, 'cerebral thrombosis', } -\CommentTok{ 'cerebral embolism', 'cerebral artery occlusion' } +\CommentTok{ {-} inpatient visits that include a condition occurrence record for } +\CommentTok{ \textquotesingle{}cerebral infarction\textquotesingle{} and descendants, \textquotesingle{}cerebral thrombosis\textquotesingle{}, } +\CommentTok{ \textquotesingle{}cerebral embolism\textquotesingle{}, \textquotesingle{}cerebral artery occlusion\textquotesingle{} } \CommentTok{ */} - \KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AFibStrokeCohort (cohort_definition_id, } -\NormalTok{ subject_id, } -\NormalTok{ cohort_start_date, } -\NormalTok{ cohort_end_date)} - \KeywordTok{SELECT} \DecValTok{2} \KeywordTok{AS}\NormalTok{ cohort_definition_id,} -\NormalTok{ visit_occurrence.person_id }\KeywordTok{AS}\NormalTok{ subject_id,} -\NormalTok{ visit_occurrence.visit_start_date }\KeywordTok{AS}\NormalTok{ cohort_start_date,} -\NormalTok{ visit_occurrence.visit_end_date }\KeywordTok{AS}\NormalTok{ cohort_end_date} + \KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AFibStrokeCohort (cohort\_definition\_id, } +\NormalTok{ subject\_id, } +\NormalTok{ cohort\_start\_date, } +\NormalTok{ cohort\_end\_date)} + \KeywordTok{SELECT} \DecValTok{2} \KeywordTok{AS}\NormalTok{ cohort\_definition\_id,} +\NormalTok{ visit\_occurrence.person\_id }\KeywordTok{AS}\NormalTok{ subject\_id,} +\NormalTok{ 
visit\_occurrence.visit\_start\_date }\KeywordTok{AS}\NormalTok{ cohort\_start\_date,} +\NormalTok{ visit\_occurrence.visit\_end\_date }\KeywordTok{AS}\NormalTok{ cohort\_end\_date} \KeywordTok{FROM} \NormalTok{ (} - \KeywordTok{SELECT}\NormalTok{ person_id, condition_start_date} - \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.condition_occurrence} - \KeywordTok{WHERE}\NormalTok{ condition_concept_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT} \KeywordTok{DISTINCT}\NormalTok{ descendant_concept_id }\KeywordTok{FROM} -\NormalTok{ @cdmDatabaseSchema.concept_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor_concept_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{443454} \CommentTok{/*cerebral infarction*/}\NormalTok{) }\KeywordTok{OR}\NormalTok{ descendant_concept_id }\KeywordTok{IN} + \KeywordTok{SELECT}\NormalTok{ person\_id, condition\_start\_date} + \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.condition\_occurrence} + \KeywordTok{WHERE}\NormalTok{ condition\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT} \KeywordTok{DISTINCT}\NormalTok{ descendant\_concept\_id }\KeywordTok{FROM} +\NormalTok{ @cdmDatabaseSchema.concept\_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor\_concept\_id }\KeywordTok{IN} +\NormalTok{ (}\DecValTok{443454} \CommentTok{/*cerebral infarction*/}\NormalTok{) }\KeywordTok{OR}\NormalTok{ descendant\_concept\_id }\KeywordTok{IN} \NormalTok{ (}\DecValTok{441874} \CommentTok{/*cerebral thrombosis*/}\NormalTok{, }\DecValTok{375557} \CommentTok{/*cerebral embolism*/}\NormalTok{, } \DecValTok{372924} \CommentTok{/*cerebral artery occlusion*/}\NormalTok{))} \NormalTok{ ) stroke} - \KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ @cdmDatabaseSchema.visit_occurrence} - \KeywordTok{ON}\NormalTok{ stroke.person_id }\OperatorTok{=}\NormalTok{ visit_occurrence.person_id} - \KeywordTok{AND}\NormalTok{ stroke.condition_start_date }\OperatorTok{>=}\NormalTok{ visit_occurrence.visit_start_date} - \KeywordTok{AND}\NormalTok{ 
stroke.condition_start_date }\OperatorTok{<=}\NormalTok{ visit_occurrence.visit_end_date} - \KeywordTok{AND}\NormalTok{ visit_occurrence.visit_concept_id }\KeywordTok{IN}\NormalTok{ (}\DecValTok{9201}\NormalTok{, }\DecValTok{262} \CommentTok{/*'Inpatient Visit' or } -\CommentTok{ 'Emergency Room and Inpatient Visit'*/}\NormalTok{)} - \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ visit_occurrence.person_id, visit_occurrence.visit_start_date, } -\NormalTok{ visit_occurrence.visit_end_date} + \KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ @cdmDatabaseSchema.visit\_occurrence} + \KeywordTok{ON}\NormalTok{ stroke.person\_id }\OperatorTok{=}\NormalTok{ visit\_occurrence.person\_id} + \KeywordTok{AND}\NormalTok{ stroke.condition\_start\_date }\OperatorTok{\textgreater{}=}\NormalTok{ visit\_occurrence.visit\_start\_date} + \KeywordTok{AND}\NormalTok{ stroke.condition\_start\_date }\OperatorTok{\textless{}=}\NormalTok{ visit\_occurrence.visit\_end\_date} + \KeywordTok{AND}\NormalTok{ visit\_occurrence.visit\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\DecValTok{9201}\NormalTok{, }\DecValTok{262} \CommentTok{/*\textquotesingle{}Inpatient Visit\textquotesingle{} or } +\CommentTok{ \textquotesingle{}Emergency Room and Inpatient Visit\textquotesingle{}*/}\NormalTok{)} + \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ visit\_occurrence.person\_id, visit\_occurrence.visit\_start\_date, } +\NormalTok{ visit\_occurrence.visit\_end\_date} \NormalTok{ ;} \end{Highlighting} @@ -922,14 +816,14 @@ \subsubsection{Custom cohorts}\label{custom-cohorts}} \begin{Shaded} \begin{Highlighting}[] -\NormalTok{ connectionDetails <-}\StringTok{ }\KeywordTok{createConnectionDetails}\NormalTok{(}\DataTypeTok{dbms =} \StringTok{"postgresql"}\NormalTok{, } - \DataTypeTok{server =} \StringTok{"localhost/ohdsi"}\NormalTok{, } - \DataTypeTok{user =} \StringTok{"joe"}\NormalTok{, } - \DataTypeTok{password =} \StringTok{"supersecret"}\NormalTok{)} +\NormalTok{ connectionDetails }\OtherTok{\textless{}{-}} 
\FunctionTok{createConnectionDetails}\NormalTok{(}\AttributeTok{dbms =} \StringTok{"postgresql"}\NormalTok{, } + \AttributeTok{server =} \StringTok{"localhost/ohdsi"}\NormalTok{, } + \AttributeTok{user =} \StringTok{"joe"}\NormalTok{, } + \AttributeTok{password =} \StringTok{"supersecret"}\NormalTok{)} -\NormalTok{ cdmDatabaseSchema <-}\StringTok{ "my_cdm_data"} -\NormalTok{ cohortsDatabaseSchema <-}\StringTok{ "my_results"} -\NormalTok{ cdmVersion <-}\StringTok{ "5"} +\NormalTok{ cdmDatabaseSchema }\OtherTok{\textless{}{-}} \StringTok{"my\_cdm\_data"} +\NormalTok{ cohortsDatabaseSchema }\OtherTok{\textless{}{-}} \StringTok{"my\_results"} +\NormalTok{ cdmVersion }\OtherTok{\textless{}{-}} \StringTok{"5"} \end{Highlighting} \end{Shaded} @@ -943,17 +837,17 @@ \subsubsection{Custom cohorts}\label{custom-cohorts}} \begin{Shaded} \begin{Highlighting}[] - \KeywordTok{library}\NormalTok{(SqlRender)} -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{readSql}\NormalTok{(}\StringTok{"AfStrokeCohorts.sql"}\NormalTok{)} -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{renderSql}\NormalTok{(sql,} - \DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} - \DataTypeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema,} - \DataTypeTok{post_time =} \DecValTok{30}\NormalTok{,} - \DataTypeTok{pre_time =} \DecValTok{365}\NormalTok{)}\OperatorTok{$}\NormalTok{sql} -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{translateSql}\NormalTok{(sql, }\DataTypeTok{targetDialect =}\NormalTok{ connectionDetails}\OperatorTok{$}\NormalTok{dbms)}\OperatorTok{$}\NormalTok{sql} + \FunctionTok{library}\NormalTok{(SqlRender)} +\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{readSql}\NormalTok{(}\StringTok{"AfStrokeCohorts.sql"}\NormalTok{)} +\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{renderSql}\NormalTok{(sql,} + \AttributeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} + \AttributeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema,} + 
\AttributeTok{post\_time =} \DecValTok{30}\NormalTok{,} + \AttributeTok{pre\_time =} \DecValTok{365}\NormalTok{)}\SpecialCharTok{$}\NormalTok{sql} +\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{translateSql}\NormalTok{(sql, }\AttributeTok{targetDialect =}\NormalTok{ connectionDetails}\SpecialCharTok{$}\NormalTok{dbms)}\SpecialCharTok{$}\NormalTok{sql} -\NormalTok{ connection <-}\StringTok{ }\KeywordTok{connect}\NormalTok{(connectionDetails)} - \KeywordTok{executeSql}\NormalTok{(connection, sql)} +\NormalTok{ connection }\OtherTok{\textless{}{-}} \FunctionTok{connect}\NormalTok{(connectionDetails)} + \FunctionTok{executeSql}\NormalTok{(connection, sql)} \end{Highlighting} \end{Shaded} @@ -968,13 +862,13 @@ \subsubsection{Custom cohorts}\label{custom-cohorts}} \begin{Shaded} \begin{Highlighting}[] -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{paste}\NormalTok{(}\StringTok{"SELECT cohort_definition_id, COUNT(*) AS count"}\NormalTok{,} +\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{paste}\NormalTok{(}\StringTok{"SELECT cohort\_definition\_id, COUNT(*) AS count"}\NormalTok{,} \StringTok{"FROM @cohortsDatabaseSchema.AFibStrokeCohort"}\NormalTok{,} - \StringTok{"GROUP BY cohort_definition_id"}\NormalTok{)} -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{renderSql}\NormalTok{(sql, }\DataTypeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)}\OperatorTok{$}\NormalTok{sql} -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{translateSql}\NormalTok{(sql, }\DataTypeTok{targetDialect =}\NormalTok{ connectionDetails}\OperatorTok{$}\NormalTok{dbms)}\OperatorTok{$}\NormalTok{sql} + \StringTok{"GROUP BY cohort\_definition\_id"}\NormalTok{)} +\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{renderSql}\NormalTok{(sql, }\AttributeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)}\SpecialCharTok{$}\NormalTok{sql} +\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{translateSql}\NormalTok{(sql, }\AttributeTok{targetDialect =}\NormalTok{ 
connectionDetails}\SpecialCharTok{$}\NormalTok{dbms)}\SpecialCharTok{$}\NormalTok{sql} - \KeywordTok{querySql}\NormalTok{(connection, sql)} + \FunctionTok{querySql}\NormalTok{(connection, sql)} \end{Highlighting} \end{Shaded} @@ -997,25 +891,25 @@ \subsubsection{Data extraction}\label{data-extraction}} Now we can tell \texttt{PatientLevelPrediction} to extract all necessary data for our analysis. This is done using the -\href{https://github.com/OHDSI/FeatureExtration}{\texttt{FeatureExtractionPackage}}. +\href{https://github.com/OHDSI/FeatureExtraction}{\texttt{FeatureExtractionPackage}}. In short the FeatureExtractionPackage allows you to specify which features (covariates) need to be extracted, e.g.~all conditions and drug exposures. It also supports the creation of custom covariates. For more detailed information on the FeatureExtraction package see its -\href{https://github.com/OHDSI/FeatureExtration}{vignettes}. For our +\href{https://github.com/OHDSI/FeatureExtraction}{vignettes}. 
For our example study we decided to use these settings: \begin{Shaded} \begin{Highlighting}[] -\NormalTok{ covariateSettings <-}\StringTok{ }\KeywordTok{createCovariateSettings}\NormalTok{(}\DataTypeTok{useDemographicsGender =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useDemographicsAge =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useConditionGroupEraLongTerm =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useConditionGroupEraAnyTimePrior =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useDrugGroupEraLongTerm =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useDrugGroupEraAnyTimePrior =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useVisitConceptCountLongTerm =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{longTermStartDays =} \DecValTok{-365}\NormalTok{,} - \DataTypeTok{endDays =} \DecValTok{-1}\NormalTok{)} +\NormalTok{ covariateSettings }\OtherTok{\textless{}{-}} \FunctionTok{createCovariateSettings}\NormalTok{(}\AttributeTok{useDemographicsGender =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{useDemographicsAge =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{useConditionGroupEraLongTerm =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{useConditionGroupEraAnyTimePrior =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{useDrugGroupEraLongTerm =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{useDrugGroupEraAnyTimePrior =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{useVisitConceptCountLongTerm =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{longTermStartDays =} \SpecialCharTok{{-}}\DecValTok{365}\NormalTok{,} + \AttributeTok{endDays =} \SpecialCharTok{{-}}\DecValTok{1}\NormalTok{)} \end{Highlighting} \end{Shaded} @@ -1029,37 +923,50 @@ \subsubsection{Data extraction}\label{data-extraction}} \begin{Shaded} \begin{Highlighting}[] -\NormalTok{ plpData <-}\StringTok{ }\KeywordTok{getPlpData}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} - 
\DataTypeTok{cohortDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,} - \DataTypeTok{cohortTable =} \StringTok{'AFibStrokeCohort'}\NormalTok{,} - \DataTypeTok{cohortId =} \DecValTok{1}\NormalTok{,} - \DataTypeTok{covariateSettings =}\NormalTok{ covariateSettings,} - \DataTypeTok{outcomeDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,} - \DataTypeTok{outcomeTable =} \StringTok{'AFibStrokeCohort'}\NormalTok{,} - \DataTypeTok{outcomeIds =} \DecValTok{2}\NormalTok{,} - \DataTypeTok{sampleSize =} \DecValTok{10000} +\NormalTok{databaseDetails }\OtherTok{\textless{}{-}} \FunctionTok{createDatabaseDetails}\NormalTok{(} + \AttributeTok{connectionDetails =}\NormalTok{ connectionDetails,} + \AttributeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} + \AttributeTok{cdmDatabaseName =} \StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{,} + \AttributeTok{cohortDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,} + \AttributeTok{cohortTable =} \StringTok{\textquotesingle{}AFibStrokeCohort\textquotesingle{}}\NormalTok{,} + \AttributeTok{cohortId =} \DecValTok{1}\NormalTok{,} + \AttributeTok{outcomeDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,} + \AttributeTok{outcomeTable =} \StringTok{\textquotesingle{}AFibStrokeCohort\textquotesingle{}}\NormalTok{,} + \AttributeTok{outcomeIds =} \DecValTok{2}\NormalTok{,} + \AttributeTok{cdmVersion =} \DecValTok{5} +\NormalTok{ )} + +\CommentTok{\# here you can define whether you want to sample the target cohort and add any} +\CommentTok{\# restrictions based on minimum prior observation, index date restrictions} +\CommentTok{\# or restricting to first index date (if people can be in target cohort multiple times)} +\NormalTok{restrictPlpDataSettings }\OtherTok{\textless{}{-}} \FunctionTok{createRestrictPlpDataSettings}\NormalTok{(}\AttributeTok{sampleSize =} \DecValTok{10000}\NormalTok{)} + +\NormalTok{ plpData }\OtherTok{\textless{}{-}} \FunctionTok{getPlpData}\NormalTok{(} + \AttributeTok{databaseDetails =}\NormalTok{ 
databaseDetails, } + \AttributeTok{covariateSettings =}\NormalTok{ covariateSettings,} + \AttributeTok{restrictPlpDataSettings =}\NormalTok{ restrictPlpDataSettings} \NormalTok{ )} \end{Highlighting} \end{Shaded} Note that if the cohorts are created in ATLAS its corresponding cohort database schema needs to be selected. There are many additional -parameters for the \texttt{getPlpData} function which are all documented -in the \texttt{PatientLevelPrediction} manual. The resulting -\texttt{plpData} object uses the package \texttt{ff} to store +parameters for the \texttt{createRestrictPlpDataSettings} function which +are all documented in the \texttt{PatientLevelPrediction} manual. The +resulting \texttt{plpData} object uses the package \texttt{Andromeda} +(which uses \href{https://www.sqlite.org/index.html}{SQLite}) to store information in a way that ensures R does not run out of memory, even when the data are large. Creating the \texttt{plpData} object can take considerable computing time, and it is probably a good idea to save it for future sessions. -Because \texttt{plpData} uses \texttt{ff}, we cannot use R's regular -save function. Instead, we'll have to use the \texttt{savePlpData()} -function: +Because \texttt{plpData} uses \texttt{Andromeda}, we cannot use R's +regular save function. 
Instead, we'll have to use the +\texttt{savePlpData()} function: \begin{Shaded} \begin{Highlighting}[] -\KeywordTok{savePlpData}\NormalTok{(plpData, }\StringTok{"stroke_in_af_data"}\NormalTok{)} + \FunctionTok{savePlpData}\NormalTok{(plpData, }\StringTok{"stroke\_in\_af\_data"}\NormalTok{)} \end{Highlighting} \end{Shaded} @@ -1092,24 +999,138 @@ \subsubsection{Additional inclusion \begin{Shaded} \begin{Highlighting}[] -\NormalTok{ population <-}\StringTok{ }\KeywordTok{createStudyPopulation}\NormalTok{(}\DataTypeTok{plpData =}\NormalTok{ plpData,} - \DataTypeTok{outcomeId =} \DecValTok{2}\NormalTok{,} - \DataTypeTok{washoutPeriod =} \DecValTok{1095}\NormalTok{,} - \DataTypeTok{firstExposureOnly =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{removeSubjectsWithPriorOutcome =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{priorOutcomeLookback =} \DecValTok{1}\NormalTok{,} - \DataTypeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,} - \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,} - \DataTypeTok{addExposureDaysToStart =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{addExposureDaysToEnd =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{minTimeAtRisk =} \DecValTok{364}\NormalTok{,} - \DataTypeTok{requireTimeAtRisk =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{includeAllOutcomes =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"DEBUG"} +\NormalTok{ populationSettings }\OtherTok{\textless{}{-}} \FunctionTok{createStudyPopulationSettings}\NormalTok{(} + \AttributeTok{washoutPeriod =} \DecValTok{1095}\NormalTok{,} + \AttributeTok{firstExposureOnly =} \ConstantTok{FALSE}\NormalTok{,} + \AttributeTok{removeSubjectsWithPriorOutcome =} \ConstantTok{FALSE}\NormalTok{,} + \AttributeTok{priorOutcomeLookback =} \DecValTok{1}\NormalTok{,} + \AttributeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,} + \AttributeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,} + \AttributeTok{startAnchor =} \StringTok{\textquotesingle{}cohort 
start\textquotesingle{}}\NormalTok{,} + \AttributeTok{endAnchor =} \StringTok{\textquotesingle{}cohort start\textquotesingle{}}\NormalTok{,} + \AttributeTok{minTimeAtRisk =} \DecValTok{364}\NormalTok{,} + \AttributeTok{requireTimeAtRisk =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{includeAllOutcomes =} \ConstantTok{TRUE} \NormalTok{ )} \end{Highlighting} \end{Shaded} +\hypertarget{spliting-the-data-into-trainingvalidationtesting-datasets}{% +\subsubsection{Spliting the data into training/validation/testing +datasets}\label{spliting-the-data-into-trainingvalidationtesting-datasets}} + +When developing a prediction model using supervised learning (when you +have features paired with labels for a set of patients), the first step +is to design the development/internal validation process. This requires +specifying how to select the model hyper-parameters, how to learn the +model parameters and how to fairly evaluate the model. In general, the +validation set is used to pick hyper-parameters, the training set is +used to learn the model parameters and the test set is used to perform +fair internal validation. However, cross-validation can be implemented +to pick the hyper-parameters on the training data (so a validation data +set is not required). Cross validation can also be used to estimate +internal validation (so a testing data set is not required). + +In small data the best approach for internal validation has been shown +to be boostrapping. However, in big data (many patients and many +features) bootstrapping is generally not feasible. In big data our +research has shown that it is just important to have some form of fair +evaluation (use a test set or cross validation). For full details see +\href{add\%20link}{our BMJ open paper}. + +In the PatientLevelPrediction package, the splitSettings define how the +plpData are partitioned into training/validation/testing data. 
Cross +validation is always done, but using a test set is optional (when the +data are small, it may be optimal to not use a test set). For the +splitSettings we can use the type (stratified/time/subject) and +testFraction parameters to split the data in a 75\%-25\% split and run +the patient-level prediction pipeline: + +\begin{Shaded} +\begin{Highlighting}[] +\NormalTok{ splitSettings }\OtherTok{\textless{}{-}} \FunctionTok{createDefaultSplitSetting}\NormalTok{(} + \AttributeTok{trainFraction =} \FloatTok{0.75}\NormalTok{,} + \AttributeTok{testFraction =} \FloatTok{0.25}\NormalTok{,} + \AttributeTok{type =} \StringTok{\textquotesingle{}stratified\textquotesingle{}}\NormalTok{,} + \AttributeTok{nfold =} \DecValTok{2}\NormalTok{, } + \AttributeTok{splitSeed =} \DecValTok{1234} +\NormalTok{ )} +\end{Highlighting} +\end{Shaded} + +Note: it is possible to add a custom method to specify how the plpData +are partitioned into training/validation/testing data, see +\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSplitting.pdf}{vignette +for custom splitting}. + +\hypertarget{preprocessing-the-training-data}{% +\subsubsection{Preprocessing the training +data}\label{preprocessing-the-training-data}} + +There a numerous data processing settings that a user must specify when +developing a prediction model. 
These are: * Whether to under-sample or +over-sample the training data (this may be useful when there is class +imballance (e.g., the outcome is very rare or very common)) * Whether to +perform feature engineering or feature selection (e.g., create latent +variables that are not observed in the data or reduce the dimensionality +of the data) * Whether to remove redundant features and normalize the +data (this is required for some models) + +The default sample settings does nothing, it simply returns the +trainData as input, see below: + +\begin{Shaded} +\begin{Highlighting}[] +\NormalTok{ sampleSettings }\OtherTok{\textless{}{-}} \FunctionTok{createSampleSettings}\NormalTok{()} +\end{Highlighting} +\end{Shaded} + +However, the current package contains methods of under-sampling the +non-outcome patients. To perform undersampling, the \texttt{type} input +should be `underSample' and \texttt{numberOutcomestoNonOutcomes} must be +specified (an integer specifying the number of non-outcomes per +outcome). It is possible to add any custom function for over/under +sampling, see +\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSamples.pdf}{vignette +for custom sampling}. + +It is possible to specify a combination of feature engineering functions +that take as input the trainData and output a new trainData with +different features. The default feature engineering setting does +nothing: + +\begin{Shaded} +\begin{Highlighting}[] +\NormalTok{ featureEngineeringSettings }\OtherTok{\textless{}{-}} \FunctionTok{createFeatureEngineeringSettings}\NormalTok{()} +\end{Highlighting} +\end{Shaded} + +However, it is possible to add custom feature engineering functions into +the pipeline, see +\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomFeatureEngineering.pdf}{vignette +for custom feature engineering}. + +Finally, the preprocessing setting is required. 
For this setting the +user can define \texttt{minFraction}, this removes any features that is +observed in the training data for less than 0.01 fraction of the +patients. So, if \texttt{minFraction\ =\ 0.01} then any feature that is +seen in less than 1 percent of the target population is removed. The +input \texttt{normalize} specifies whether the features are scaled +between 0 and 1, this is required for certain models (e.g., LASSO +logistic regression). The input \texttt{removeRedundancy} specifies +whether features that are observed in all of the target population are +removed. + +\begin{Shaded} +\begin{Highlighting}[] +\NormalTok{ preprocessSettingsSettings }\OtherTok{\textless{}{-}} \FunctionTok{createPreprocessSettings}\NormalTok{(} + \AttributeTok{minFraction =} \FloatTok{0.01}\NormalTok{, } + \AttributeTok{normalize =}\NormalTok{ T, } + \AttributeTok{removeRedundancy =}\NormalTok{ T} +\NormalTok{ )} +\end{Highlighting} +\end{Shaded} + \hypertarget{model-development}{% \subsubsection{Model Development}\label{model-development}} @@ -1131,19 +1152,41 @@ \subsubsection{Model Development}\label{model-development}} \begin{Shaded} \begin{Highlighting}[] -\NormalTok{lrModel <-}\StringTok{ }\KeywordTok{setLassoLogisticRegression}\NormalTok{()} +\NormalTok{lrModel }\OtherTok{\textless{}{-}} \FunctionTok{setLassoLogisticRegression}\NormalTok{()} \end{Highlighting} \end{Shaded} -The \texttt{runPlP} function uses the population, \texttt{plpData}, and -model settings to train and evaluate the model. 
We can use the testSplit -(person/time) and testFraction parameters to split the data in a -75\%-25\% split and run the patient-level prediction pipeline: +The \texttt{runPlP} function requires the \texttt{plpData}, the +\texttt{outcomeId} specifying the outcome being predicted and the +settings: \texttt{populationSettings}, \texttt{splitSettings}, +\texttt{sampleSettings}, \texttt{featureEngineeringSettings}, +\texttt{preprocessSettings} and \texttt{modelSettings} to train and +evaluate the model. \begin{Shaded} \begin{Highlighting}[] -\NormalTok{ lrResults <-}\StringTok{ }\KeywordTok{runPlp}\NormalTok{(population, plpData, }\DataTypeTok{modelSettings =}\NormalTok{ lrModel, }\DataTypeTok{testSplit=}\StringTok{'stratified'}\NormalTok{, } - \DataTypeTok{testFraction=}\FloatTok{0.25}\NormalTok{, }\DataTypeTok{nfold=}\DecValTok{2}\NormalTok{, }\DataTypeTok{splitSeed =} \DecValTok{1234}\NormalTok{)} +\NormalTok{ lrResults }\OtherTok{\textless{}{-}} \FunctionTok{runPlp}\NormalTok{(} + \AttributeTok{plpData =}\NormalTok{ plpData,} + \AttributeTok{outcomeId =} \DecValTok{2}\NormalTok{, } + \AttributeTok{analysisId =} \StringTok{\textquotesingle{}singleDemo\textquotesingle{}}\NormalTok{,} + \AttributeTok{analysisName =} \StringTok{\textquotesingle{}Demonstration of runPlp for training single PLP models\textquotesingle{}}\NormalTok{,} + \AttributeTok{populationSettings =}\NormalTok{ populationSettings, } + \AttributeTok{splitSettings =}\NormalTok{ splitSettings,} + \AttributeTok{sampleSettings =}\NormalTok{ sampleSettings, } + \AttributeTok{featureEngineeringSettings =}\NormalTok{ featureEngineeringSettings, } + \AttributeTok{preprocessSettings =}\NormalTok{ preprocessSettings,} + \AttributeTok{modelSettings =}\NormalTok{ lrModel,} + \AttributeTok{logSettings =} \FunctionTok{createLogSettings}\NormalTok{(), } + \AttributeTok{executeSettings =} \FunctionTok{createExecuteSettings}\NormalTok{(} + \AttributeTok{runSplitData =}\NormalTok{ T, } + \AttributeTok{runSampleData 
=}\NormalTok{ T, } + \AttributeTok{runfeatureEngineering =}\NormalTok{ T, } + \AttributeTok{runPreprocessData =}\NormalTok{ T, } + \AttributeTok{runModelDevelopment =}\NormalTok{ T, } + \AttributeTok{runCovariateSummary =}\NormalTok{ T} +\NormalTok{ ), } + \AttributeTok{saveDirectory =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(), }\StringTok{\textquotesingle{}singlePlp\textquotesingle{}}\NormalTok{)} +\NormalTok{ )} \end{Highlighting} \end{Shaded} @@ -1153,15 +1196,11 @@ \subsubsection{Model Development}\label{model-development}} evaluate the model on the remaining 25\%. A results data structure is returned containing information about the model, its performance etc. -In the runPlp function there are several parameters to save the plpData, -plpResults, plpPlots, evaluation etc. which are all set to True by -default. However, there is also some functionality to this manually. - You can save the model using: \begin{Shaded} \begin{Highlighting}[] -\KeywordTok{savePlpModel}\NormalTok{(lrResults}\OperatorTok{$}\NormalTok{model, }\DataTypeTok{dirPath =} \KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))} +\FunctionTok{savePlpModel}\NormalTok{(lrResults}\SpecialCharTok{$}\NormalTok{model, }\AttributeTok{dirPath =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))} \end{Highlighting} \end{Shaded} @@ -1169,7 +1208,7 @@ \subsubsection{Model Development}\label{model-development}} \begin{Shaded} \begin{Highlighting}[] -\NormalTok{plpModel <-}\StringTok{ }\KeywordTok{loadPlpModel}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))} +\NormalTok{ plpModel }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpModel}\NormalTok{(}\FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}model\textquotesingle{}}\NormalTok{))} \end{Highlighting} \end{Shaded} @@ 
-1177,7 +1216,7 @@ \subsubsection{Model Development}\label{model-development}} \begin{Shaded} \begin{Highlighting}[] -\KeywordTok{savePlpResult}\NormalTok{(lrResults, }\DataTypeTok{location =} \KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"lr"}\NormalTok{))} + \FunctionTok{savePlpResult}\NormalTok{(lrResults, }\AttributeTok{location =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}lr\textquotesingle{}}\NormalTok{))} \end{Highlighting} \end{Shaded} @@ -1185,7 +1224,7 @@ \subsubsection{Model Development}\label{model-development}} \begin{Shaded} \begin{Highlighting}[] -\NormalTok{lrResults <-}\StringTok{ }\KeywordTok{loadPlpResult}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"lr"}\NormalTok{))} +\NormalTok{ lrResults }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpResult}\NormalTok{(}\FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}lr\textquotesingle{}}\NormalTok{))} \end{Highlighting} \end{Shaded} @@ -1197,110 +1236,41 @@ \section{Example 2: Angioedema in ACE inhibitor users}\label{example2}} \hypertarget{study-specification-2}{% \subsection{Study Specification}\label{study-specification-2}} -\begin{longtable}[]{@{}ll@{}} -\toprule -\begin{minipage}[b]{0.42\columnwidth}\raggedright -Definition\strut -\end{minipage} & \begin{minipage}[b]{0.52\columnwidth}\raggedright -Value\strut -\end{minipage}\tabularnewline -\midrule +\begin{longtable}[]{@{} + >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.3056}} + >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.6944}}@{}} +\toprule\noalign{} +\begin{minipage}[b]{\linewidth}\raggedright +Definition +\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright +Value +\end{minipage} \\ +\midrule\noalign{} \endhead -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\textbf{Problem Definition}\strut 
-\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Target Cohort (T)\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -`Patients who are newly dispensed an ACE inhibitor' defined as the first -drug record of any ACE inhibitor\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Outcome Cohort (O)\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -`Angioedema' defined as an angioedema condition record during an -inpatient or ER visit\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Time-at-risk (TAR)\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -1 day till 365 days from cohort start\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\textbf{Population Definition}\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Washout Period\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -365\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Enter the target cohort multiple times?\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -No\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Allow prior outcomes?\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -No\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Start of time-at-risk\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -1 day\strut 
-\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -End of time-at-risk\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -365 days\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Require a minimum amount of time-at-risk?\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -Yes (364 days)\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -\textbf{Model Development}\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Algorithm\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -Gradient Boosting Machine\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Hyper-parameters\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -ntree:5000, max depth:4 or 7 or 10 and learning rate: 0.001 or 0.01 or -0.1 or 0.9\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Covariates\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -Gender, Age, Conditions (ever before, \textless365), Drugs Groups (ever -before, \textless365), and Visit Count\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.42\columnwidth}\raggedright -Data split\strut -\end{minipage} & \begin{minipage}[t]{0.52\columnwidth}\raggedright -75\% train, 25\% test. 
Randomly assigned by person\strut -\end{minipage}\tabularnewline -\bottomrule +\bottomrule\noalign{} +\endlastfoot +\textbf{Problem Definition} & \\ +Target Cohort (T) & `Patients who are newly dispensed an ACE inhibitor' +defined as the first drug record of any ACE inhibitor \\ +Outcome Cohort (O) & `Angioedema' defined as an angioedema condition +record during an inpatient or ER visit \\ +Time-at-risk (TAR) & 1 day till 365 days from cohort start \\ +& \\ +\textbf{Population Definition} & \\ +Washout Period & 365 \\ +Enter the target cohort multiple times? & No \\ +Allow prior outcomes? & No \\ +Start of time-at-risk & 1 day \\ +End of time-at-risk & 365 days \\ +Require a minimum amount of time-at-risk? & Yes (364 days) \\ +& \\ +\textbf{Model Development} & \\ +Algorithm & Gradient Boosting Machine \\ +Hyper-parameters & ntree:5000, max depth:4 or 7 or 10 and learning rate: +0.001 or 0.01 or 0.1 or 0.9 \\ +Covariates & Gender, Age, Conditions (ever before, \textless365), Drugs +Groups (ever before, \textless365), and Visit Count \\ +Data split & 75\% train, 25\% test. 
Randomly assigned by person \\ \end{longtable} According to the best practices we need to make a protocol that @@ -1364,7 +1334,7 @@ \subsubsection{ATLAS cohort builder}\label{atlas-cohort-builder-1}} \begin{figure} \centering -\includegraphics{example2/aceinhibitors.png} +\includegraphics{example2/aceinhibitors.webp} \caption{Target Cohort ACE inhibitors} \end{figure} @@ -1381,7 +1351,7 @@ \subsubsection{ATLAS cohort builder}\label{atlas-cohort-builder-1}} \begin{figure} \centering -\includegraphics{example2/angioedema.png} +\includegraphics{example2/angioedema.webp} \caption{Outcome Cohort Angioedema} \end{figure} @@ -1425,67 +1395,67 @@ \subsubsection{Custom cohorts}\label{custom-cohorts-1}} \CommentTok{ Create a table to store the persons in the T and C cohort} \CommentTok{ */} - \ControlFlowTok{IF}\NormalTok{ OBJECT_ID(}\StringTok{'@resultsDatabaseSchema.PLPAceAngioCohort'}\NormalTok{, }\StringTok{'U'}\NormalTok{) }\KeywordTok{IS} \KeywordTok{NOT} \KeywordTok{NULL} + \ControlFlowTok{IF}\NormalTok{ OBJECT\_ID(}\StringTok{\textquotesingle{}@resultsDatabaseSchema.PLPAceAngioCohort\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}U\textquotesingle{}}\NormalTok{) }\KeywordTok{IS} \KeywordTok{NOT} \KeywordTok{NULL} \KeywordTok{DROP} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.PLPAceAngioCohort;} \KeywordTok{CREATE} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.PLPAceAngioCohort } \NormalTok{ ( } -\NormalTok{ cohort_definition_id }\DataTypeTok{INT}\NormalTok{, } -\NormalTok{ subject_id BIGINT,} -\NormalTok{ cohort_start_date }\DataTypeTok{DATE}\NormalTok{, } -\NormalTok{ cohort_end_date }\DataTypeTok{DATE} +\NormalTok{ cohort\_definition\_id }\DataTypeTok{INT}\NormalTok{, } +\NormalTok{ subject\_id BIGINT,} +\NormalTok{ cohort\_start\_date }\DataTypeTok{DATE}\NormalTok{, } +\NormalTok{ cohort\_end\_date }\DataTypeTok{DATE} \NormalTok{ );} \CommentTok{/*} \CommentTok{ T cohort: [PatientLevelPrediction vignette]: T : patients who are newly } 
\CommentTok{ dispensed an ACE inhibitor} -\CommentTok{ - persons with a drug exposure record of any 'ACE inhibitor' or } +\CommentTok{ {-} persons with a drug exposure record of any \textquotesingle{}ACE inhibitor\textquotesingle{} or } \CommentTok{ any descendants, indexed at the first diagnosis} -\CommentTok{ - who have >364 days of prior observation before their first dispensing} +\CommentTok{ {-} who have \textgreater{}364 days of prior observation before their first dispensing} \CommentTok{ */} - \KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AceAngioCohort (cohort_definition_id, } -\NormalTok{ subject_id, } -\NormalTok{ cohort_start_date, } -\NormalTok{ cohort_end_date)} - \KeywordTok{SELECT} \DecValTok{1} \KeywordTok{AS}\NormalTok{ cohort_definition_id,} -\NormalTok{ Ace.person_id }\KeywordTok{AS}\NormalTok{ subject_id,} -\NormalTok{ Ace.drug_start_date }\KeywordTok{AS}\NormalTok{ cohort_start_date,} -\NormalTok{ observation_period.observation_period_end_date }\KeywordTok{AS}\NormalTok{ cohort_end_date} + \KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AceAngioCohort (cohort\_definition\_id, } +\NormalTok{ subject\_id, } +\NormalTok{ cohort\_start\_date, } +\NormalTok{ cohort\_end\_date)} + \KeywordTok{SELECT} \DecValTok{1} \KeywordTok{AS}\NormalTok{ cohort\_definition\_id,} +\NormalTok{ Ace.person\_id }\KeywordTok{AS}\NormalTok{ subject\_id,} +\NormalTok{ Ace.drug\_start\_date }\KeywordTok{AS}\NormalTok{ cohort\_start\_date,} +\NormalTok{ observation\_period.observation\_period\_end\_date }\KeywordTok{AS}\NormalTok{ cohort\_end\_date} \KeywordTok{FROM} \NormalTok{ (} - \KeywordTok{SELECT}\NormalTok{ person_id, }\FunctionTok{min}\NormalTok{(drug_exposure_date) }\KeywordTok{as}\NormalTok{ drug_start_date} - \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.drug_exposure} - \KeywordTok{WHERE}\NormalTok{ drug_concept_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ descendant_concept_id 
}\KeywordTok{FROM} -\NormalTok{ @cdmDatabaseSchema.concept_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor_concept_id }\KeywordTok{IN} + \KeywordTok{SELECT}\NormalTok{ person\_id, }\FunctionTok{min}\NormalTok{(drug\_exposure\_date) }\KeywordTok{as}\NormalTok{ drug\_start\_date} + \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.drug\_exposure} + \KeywordTok{WHERE}\NormalTok{ drug\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ descendant\_concept\_id }\KeywordTok{FROM} +\NormalTok{ @cdmDatabaseSchema.concept\_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor\_concept\_id }\KeywordTok{IN} \NormalTok{ (}\DecValTok{1342439}\NormalTok{,}\DecValTok{1334456}\NormalTok{, }\DecValTok{1331235}\NormalTok{, }\DecValTok{1373225}\NormalTok{, }\DecValTok{1310756}\NormalTok{, }\DecValTok{1308216}\NormalTok{, }\DecValTok{1363749}\NormalTok{, }\DecValTok{1341927}\NormalTok{, }\DecValTok{1340128}\NormalTok{, }\DecValTok{1335471} \CommentTok{/*ace inhibitors*/}\NormalTok{))} - \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ person_id} + \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ person\_id} \NormalTok{ ) Ace} - \KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ @cdmDatabaseSchema.observation_period} - \KeywordTok{ON}\NormalTok{ Ace.person_id }\OperatorTok{=}\NormalTok{ observation_period.person_id} - \KeywordTok{AND}\NormalTok{ Ace.drug_start_date }\OperatorTok{>=}\NormalTok{ dateadd(dd,}\DecValTok{364}\NormalTok{, } -\NormalTok{ observation_period.observation_period_start_date)} - \KeywordTok{AND}\NormalTok{ Ace.drug_start_date }\OperatorTok{<=}\NormalTok{ observation_period.observation_period_end_date} + \KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ @cdmDatabaseSchema.observation\_period} + \KeywordTok{ON}\NormalTok{ Ace.person\_id }\OperatorTok{=}\NormalTok{ observation\_period.person\_id} + \KeywordTok{AND}\NormalTok{ Ace.drug\_start\_date }\OperatorTok{\textgreater{}=}\NormalTok{ dateadd(dd,}\DecValTok{364}\NormalTok{, } +\NormalTok{ 
observation\_period.observation\_period\_start\_date)} + \KeywordTok{AND}\NormalTok{ Ace.drug\_start\_date }\OperatorTok{\textless{}=}\NormalTok{ observation\_period.observation\_period\_end\_date} \NormalTok{ ;} \CommentTok{/*} \CommentTok{ C cohort: [PatientLevelPrediction vignette]: O: Angioedema} \CommentTok{ */} - \KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AceAngioCohort (cohort_definition_id, } -\NormalTok{ subject_id, } -\NormalTok{ cohort_start_date, } -\NormalTok{ cohort_end_date)} - \KeywordTok{SELECT} \DecValTok{2} \KeywordTok{AS}\NormalTok{ cohort_definition_id,} -\NormalTok{ angioedema.person_id }\KeywordTok{AS}\NormalTok{ subject_id,} -\NormalTok{ angioedema.condition_start_date }\KeywordTok{AS}\NormalTok{ cohort_start_date,} -\NormalTok{ angioedema.condition_start_date }\KeywordTok{AS}\NormalTok{ cohort_end_date} + \KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AceAngioCohort (cohort\_definition\_id, } +\NormalTok{ subject\_id, } +\NormalTok{ cohort\_start\_date, } +\NormalTok{ cohort\_end\_date)} + \KeywordTok{SELECT} \DecValTok{2} \KeywordTok{AS}\NormalTok{ cohort\_definition\_id,} +\NormalTok{ angioedema.person\_id }\KeywordTok{AS}\NormalTok{ subject\_id,} +\NormalTok{ angioedema.condition\_start\_date }\KeywordTok{AS}\NormalTok{ cohort\_start\_date,} +\NormalTok{ angioedema.condition\_start\_date }\KeywordTok{AS}\NormalTok{ cohort\_end\_date} \KeywordTok{FROM} \NormalTok{ (} - \KeywordTok{SELECT}\NormalTok{ person_id, condition_start_date} - \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.condition_occurrence} - \KeywordTok{WHERE}\NormalTok{ condition_concept_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT} \KeywordTok{DISTINCT}\NormalTok{ descendant_concept_id }\KeywordTok{FROM} -\NormalTok{ @cdmDatabaseSchema.concept_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor_concept_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{432791} \CommentTok{/*angioedema*/}\NormalTok{) 
}\KeywordTok{OR}\NormalTok{ descendant_concept_id }\KeywordTok{IN} + \KeywordTok{SELECT}\NormalTok{ person\_id, condition\_start\_date} + \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.condition\_occurrence} + \KeywordTok{WHERE}\NormalTok{ condition\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT} \KeywordTok{DISTINCT}\NormalTok{ descendant\_concept\_id }\KeywordTok{FROM} +\NormalTok{ @cdmDatabaseSchema.concept\_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor\_concept\_id }\KeywordTok{IN} +\NormalTok{ (}\DecValTok{432791} \CommentTok{/*angioedema*/}\NormalTok{) }\KeywordTok{OR}\NormalTok{ descendant\_concept\_id }\KeywordTok{IN} \NormalTok{ (}\DecValTok{432791} \CommentTok{/*angioedema*/}\NormalTok{)} \NormalTok{ ) angioedema} @@ -1514,14 +1484,14 @@ \subsubsection{Custom cohorts}\label{custom-cohorts-1}} \begin{Shaded} \begin{Highlighting}[] -\NormalTok{ connectionDetails <-}\StringTok{ }\KeywordTok{createConnectionDetails}\NormalTok{(}\DataTypeTok{dbms =} \StringTok{"postgresql"}\NormalTok{, } - \DataTypeTok{server =} \StringTok{"localhost/ohdsi"}\NormalTok{, } - \DataTypeTok{user =} \StringTok{"joe"}\NormalTok{, } - \DataTypeTok{password =} \StringTok{"supersecret"}\NormalTok{)} +\NormalTok{ connectionDetails }\OtherTok{\textless{}{-}} \FunctionTok{createConnectionDetails}\NormalTok{(}\AttributeTok{dbms =} \StringTok{"postgresql"}\NormalTok{, } + \AttributeTok{server =} \StringTok{"localhost/ohdsi"}\NormalTok{, } + \AttributeTok{user =} \StringTok{"joe"}\NormalTok{, } + \AttributeTok{password =} \StringTok{"supersecret"}\NormalTok{)} -\NormalTok{ cdmDatabaseSchema <-}\StringTok{ "my_cdm_data"} -\NormalTok{ cohortsDatabaseSchema <-}\StringTok{ "my_results"} -\NormalTok{ cdmVersion <-}\StringTok{ "5"} +\NormalTok{ cdmDatabaseSchema }\OtherTok{\textless{}{-}} \StringTok{"my\_cdm\_data"} +\NormalTok{ cohortsDatabaseSchema }\OtherTok{\textless{}{-}} \StringTok{"my\_results"} +\NormalTok{ cdmVersion }\OtherTok{\textless{}{-}} \StringTok{"5"} 
\end{Highlighting} \end{Shaded} @@ -1535,15 +1505,15 @@ \subsubsection{Custom cohorts}\label{custom-cohorts-1}} \begin{Shaded} \begin{Highlighting}[] - \KeywordTok{library}\NormalTok{(SqlRender)} -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{readSql}\NormalTok{(}\StringTok{"AceAngioCohorts.sql"}\NormalTok{)} -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{render}\NormalTok{(sql,} - \DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} - \DataTypeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)} -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{translate}\NormalTok{(sql, }\DataTypeTok{targetDialect =}\NormalTok{ connectionDetails}\OperatorTok{$}\NormalTok{dbms)} + \FunctionTok{library}\NormalTok{(SqlRender)} +\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{readSql}\NormalTok{(}\StringTok{"AceAngioCohorts.sql"}\NormalTok{)} +\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{render}\NormalTok{(sql,} + \AttributeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} + \AttributeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)} +\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{translate}\NormalTok{(sql, }\AttributeTok{targetDialect =}\NormalTok{ connectionDetails}\SpecialCharTok{$}\NormalTok{dbms)} -\NormalTok{ connection <-}\StringTok{ }\KeywordTok{connect}\NormalTok{(connectionDetails)} - \KeywordTok{executeSql}\NormalTok{(connection, sql)} +\NormalTok{ connection }\OtherTok{\textless{}{-}} \FunctionTok{connect}\NormalTok{(connectionDetails)} + \FunctionTok{executeSql}\NormalTok{(connection, sql)} \end{Highlighting} \end{Shaded} @@ -1558,13 +1528,13 @@ \subsubsection{Custom cohorts}\label{custom-cohorts-1}} \begin{Shaded} \begin{Highlighting}[] -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{paste}\NormalTok{(}\StringTok{"SELECT cohort_definition_id, COUNT(*) AS count"}\NormalTok{,} +\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{paste}\NormalTok{(}\StringTok{"SELECT cohort\_definition\_id, COUNT(*) AS 
count"}\NormalTok{,} \StringTok{"FROM @cohortsDatabaseSchema.AceAngioCohort"}\NormalTok{,} - \StringTok{"GROUP BY cohort_definition_id"}\NormalTok{)} -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{render}\NormalTok{(sql, }\DataTypeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)} -\NormalTok{ sql <-}\StringTok{ }\KeywordTok{translate}\NormalTok{(sql, }\DataTypeTok{targetDialect =}\NormalTok{ connectionDetails}\OperatorTok{$}\NormalTok{dbms)} + \StringTok{"GROUP BY cohort\_definition\_id"}\NormalTok{)} +\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{render}\NormalTok{(sql, }\AttributeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)} +\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{translate}\NormalTok{(sql, }\AttributeTok{targetDialect =}\NormalTok{ connectionDetails}\SpecialCharTok{$}\NormalTok{dbms)} - \KeywordTok{querySql}\NormalTok{(connection, sql)} + \FunctionTok{querySql}\NormalTok{(connection, sql)} \end{Highlighting} \end{Shaded} @@ -1587,25 +1557,25 @@ \subsubsection{Data extraction}\label{data-extraction-1}} Now we can tell \texttt{PatientLevelPrediction} to extract all necessary data for our analysis. This is done using the -\href{https://github.com/OHDSI/FeatureExtration}{\texttt{FeatureExtractionPackage}}. +\href{https://github.com/OHDSI/FeatureExtraction}{\texttt{FeatureExtractionPackage}}. In short the FeatureExtractionPackage allows you to specify which features (covariates) need to be extracted, e.g.~all conditions and drug exposures. It also supports the creation of custom covariates. For more detailed information on the FeatureExtraction package see its -\href{https://github.com/OHDSI/FeatureExtration}{vignettes}. For our +\href{https://github.com/OHDSI/FeatureExtraction}{vignettes}. 
For our example study we decided to use these settings: \begin{Shaded} \begin{Highlighting}[] -\NormalTok{ covariateSettings <-}\StringTok{ }\KeywordTok{createCovariateSettings}\NormalTok{(}\DataTypeTok{useDemographicsGender =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useDemographicsAge =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useConditionGroupEraLongTerm =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useConditionGroupEraAnyTimePrior =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useDrugGroupEraLongTerm =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useDrugGroupEraAnyTimePrior =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{useVisitConceptCountLongTerm =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{longTermStartDays =} \DecValTok{-365}\NormalTok{,} - \DataTypeTok{endDays =} \DecValTok{-1}\NormalTok{)} +\NormalTok{ covariateSettings }\OtherTok{\textless{}{-}} \FunctionTok{createCovariateSettings}\NormalTok{(}\AttributeTok{useDemographicsGender =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{useDemographicsAge =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{useConditionGroupEraLongTerm =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{useConditionGroupEraAnyTimePrior =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{useDrugGroupEraLongTerm =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{useDrugGroupEraAnyTimePrior =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{useVisitConceptCountLongTerm =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{longTermStartDays =} \SpecialCharTok{{-}}\DecValTok{365}\NormalTok{,} + \AttributeTok{endDays =} \SpecialCharTok{{-}}\DecValTok{1}\NormalTok{)} \end{Highlighting} \end{Shaded} @@ -1619,17 +1589,26 @@ \subsubsection{Data extraction}\label{data-extraction-1}} \begin{Shaded} \begin{Highlighting}[] -\NormalTok{ plpData <-}\StringTok{ }\KeywordTok{getPlpData}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} - 
\DataTypeTok{cohortDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,} - \DataTypeTok{cohortTable =} \StringTok{'AceAngioCohort'}\NormalTok{,} - \DataTypeTok{cohortId =} \DecValTok{1}\NormalTok{,} - \DataTypeTok{covariateSettings =}\NormalTok{ covariateSettings,} - \DataTypeTok{outcomeDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,} - \DataTypeTok{outcomeTable =} \StringTok{'AceAngioCohort'}\NormalTok{,} - \DataTypeTok{outcomeIds =} \DecValTok{2}\NormalTok{,} - \DataTypeTok{sampleSize =} \DecValTok{10000} -\NormalTok{ )} +\NormalTok{databaseDetails }\OtherTok{\textless{}{-}} \FunctionTok{createDatabaseDetails}\NormalTok{(} + \AttributeTok{connectionDetails =}\NormalTok{ connectionDetails,} + \AttributeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} + \AttributeTok{cohortDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,} + \AttributeTok{cohortTable =} \StringTok{\textquotesingle{}AceAngioCohort\textquotesingle{}}\NormalTok{,} + \AttributeTok{cohortId =} \DecValTok{1}\NormalTok{,} + \AttributeTok{outcomeDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,} + \AttributeTok{outcomeTable =} \StringTok{\textquotesingle{}AceAngioCohort\textquotesingle{}}\NormalTok{,} + \AttributeTok{outcomeIds =} \DecValTok{2} +\NormalTok{ )} + +\NormalTok{restrictPlpDataSettings }\OtherTok{\textless{}{-}} \FunctionTok{createRestrictPlpDataSettings}\NormalTok{(} + \AttributeTok{sampleSize =} \DecValTok{10000} +\NormalTok{ )} + +\NormalTok{plpData }\OtherTok{\textless{}{-}} \FunctionTok{getPlpData}\NormalTok{(} + \AttributeTok{databaseDetails =}\NormalTok{ databaseDetails, } + \AttributeTok{covariateSettings =}\NormalTok{ covariateSettings, } + \AttributeTok{restrictPlpDataSettings =}\NormalTok{ restrictPlpDataSettings} +\NormalTok{ )} \end{Highlighting} \end{Shaded} @@ -1649,7 +1628,7 @@ \subsubsection{Data extraction}\label{data-extraction-1}} \begin{Shaded} \begin{Highlighting}[] -\KeywordTok{savePlpData}\NormalTok{(plpData, }\StringTok{"angio_in_ace_data"}\NormalTok{)} + 
\FunctionTok{savePlpData}\NormalTok{(plpData, }\StringTok{"angio\_in\_ace\_data"}\NormalTok{)} \end{Highlighting} \end{Shaded} @@ -1682,24 +1661,138 @@ \subsubsection{Additional inclusion \begin{Shaded} \begin{Highlighting}[] -\NormalTok{ population <-}\StringTok{ }\KeywordTok{createStudyPopulation}\NormalTok{(}\DataTypeTok{plpData =}\NormalTok{ plpData,} - \DataTypeTok{outcomeId =} \DecValTok{2}\NormalTok{,} - \DataTypeTok{washoutPeriod =} \DecValTok{364}\NormalTok{,} - \DataTypeTok{firstExposureOnly =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{removeSubjectsWithPriorOutcome =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{priorOutcomeLookback =} \DecValTok{9999}\NormalTok{,} - \DataTypeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,} - \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,} - \DataTypeTok{addExposureDaysToStart =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{addExposureDaysToEnd =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{minTimeAtRisk =} \DecValTok{364}\NormalTok{,} - \DataTypeTok{requireTimeAtRisk =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{includeAllOutcomes =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"DEBUG"} +\NormalTok{ populationSettings }\OtherTok{\textless{}{-}} \FunctionTok{createStudyPopulationSettings}\NormalTok{(} + \AttributeTok{washoutPeriod =} \DecValTok{364}\NormalTok{,} + \AttributeTok{firstExposureOnly =} \ConstantTok{FALSE}\NormalTok{,} + \AttributeTok{removeSubjectsWithPriorOutcome =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{priorOutcomeLookback =} \DecValTok{9999}\NormalTok{,} + \AttributeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,} + \AttributeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{, } + \AttributeTok{minTimeAtRisk =} \DecValTok{364}\NormalTok{,} + \AttributeTok{startAnchor =} \StringTok{\textquotesingle{}cohort start\textquotesingle{}}\NormalTok{,} + \AttributeTok{endAnchor =} \StringTok{\textquotesingle{}cohort start\textquotesingle{}}\NormalTok{,} + 
\AttributeTok{requireTimeAtRisk =} \ConstantTok{TRUE}\NormalTok{,} + \AttributeTok{includeAllOutcomes =} \ConstantTok{TRUE} \NormalTok{ )} \end{Highlighting} \end{Shaded} +\hypertarget{spliting-the-data-into-trainingvalidationtesting-datasets-1}{% +\subsubsection{Spliting the data into training/validation/testing +datasets}\label{spliting-the-data-into-trainingvalidationtesting-datasets-1}} + +When developing a prediction model using supervised learning (when you +have features paired with labels for a set of patients), the first step +is to design the development/internal validation process. This requires +specifying how to select the model hyper-parameters, how to learn the +model parameters and how to fairly evaluate the model. In general, the +validation set is used to pick hyper-parameters, the training set is +used to learn the model parameters and the test set is used to perform +fair internal validation. However, cross-validation can be implemented +to pick the hyper-parameters on the training data (so a validation data +set is not required). Cross validation can also be used to estimate +internal validation (so a testing data set is not required). + +In small data the best approach for internal validation has been shown +to be boostrapping. However, in big data (many patients and many +features) bootstrapping is generally not feasible. In big data our +research has shown that it is just important to have some form of fair +evaluation (use a test set or cross validation). For full details see +\href{add\%20link}{our BMJ open paper}. + +In the PatientLevelPrediction package, the splitSettings define how the +plpData are partitioned into training/validation/testing data. Cross +validation is always done, but using a test set is optional (when the +data are small, it may be optimal to not use a test set). 
For the +splitSettings we can use the type (stratified/time/subject) and +testFraction parameters to split the data in a 75\%-25\% split and run +the patient-level prediction pipeline: + +\begin{Shaded} +\begin{Highlighting}[] +\NormalTok{ splitSettings }\OtherTok{\textless{}{-}} \FunctionTok{createDefaultSplitSetting}\NormalTok{(} + \AttributeTok{trainFraction =} \FloatTok{0.75}\NormalTok{,} + \AttributeTok{testFraction =} \FloatTok{0.25}\NormalTok{,} + \AttributeTok{type =} \StringTok{\textquotesingle{}stratified\textquotesingle{}}\NormalTok{,} + \AttributeTok{nfold =} \DecValTok{2}\NormalTok{, } + \AttributeTok{splitSeed =} \DecValTok{1234} +\NormalTok{ )} +\end{Highlighting} +\end{Shaded} + +Note: it is possible to add a custom method to specify how the plpData +are partitioned into training/validation/testing data, see +\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSplitting.pdf}{vignette +for custom splitting}. + +\hypertarget{preprocessing-the-training-data-1}{% +\subsubsection{Preprocessing the training +data}\label{preprocessing-the-training-data-1}} + +There a numerous data processing settings that a user must specify when +developing a prediction model. 
These are: * Whether to under-sample or +over-sample the training data (this may be useful when there is class +imballance (e.g., the outcome is very rare or very common)) * Whether to +perform feature engineering or feature selection (e.g., create latent +variables that are not observed in the data or reduce the dimensionality +of the data) * Whether to remove redundant features and normalize the +data (this is required for some models) + +The default sample settings does nothing, it simply returns the +trainData as input, see below: + +\begin{Shaded} +\begin{Highlighting}[] +\NormalTok{ sampleSettings }\OtherTok{\textless{}{-}} \FunctionTok{createSampleSettings}\NormalTok{()} +\end{Highlighting} +\end{Shaded} + +However, the current package contains methods of under-sampling the +non-outcome patients. To perform undersampling, the \texttt{type} input +should be `underSample' and \texttt{numberOutcomestoNonOutcomes} must be +specified (an integer specifying the number of non-outcomes per +outcome). It is possible to add any custom function for over/under +sampling, see +\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSamples.pdf}{vignette +for custom sampling}. + +It is possible to specify a combination of feature engineering functions +that take as input the trainData and output a new trainData with +different features. The default feature engineering setting does +nothing: + +\begin{Shaded} +\begin{Highlighting}[] +\NormalTok{ featureEngineeringSettings }\OtherTok{\textless{}{-}} \FunctionTok{createFeatureEngineeringSettings}\NormalTok{()} +\end{Highlighting} +\end{Shaded} + +However, it is possible to add custom feature engineering functions into +the pipeline, see +\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomfeatureEngineering.pdf}{vignette +for custom feature engineering}. + +Finally, the preprocessing setting is required. 
For this setting the +user can define \texttt{minFraction}, this removes any features that is +observed in the training data for less than 0.01 fraction of the +patients. So, if \texttt{minFraction\ =\ 0.01} then any feature that is +seen in less than 1 percent of the target population is removed. The +input \texttt{normalize} specifies whether the features are scaled +between 0 and 1, this is required for certain models (e.g., LASSO +logistic regression). The input \texttt{removeRedundancy} specifies +whether features that are observed in all of the target population are +removed. + +\begin{Shaded} +\begin{Highlighting}[] +\NormalTok{ preprocessSettingsSettings }\OtherTok{\textless{}{-}} \FunctionTok{createPreprocessSettings}\NormalTok{(} + \AttributeTok{minFraction =} \FloatTok{0.01}\NormalTok{, } + \AttributeTok{normalize =}\NormalTok{ T, } + \AttributeTok{removeRedundancy =}\NormalTok{ T} +\NormalTok{ )} +\end{Highlighting} +\end{Shaded} + \hypertarget{model-development-1}{% \subsubsection{Model Development}\label{model-development-1}} @@ -1721,20 +1814,45 @@ \subsubsection{Model Development}\label{model-development-1}} \begin{Shaded} \begin{Highlighting}[] -\NormalTok{gbmModel <-}\StringTok{ }\KeywordTok{setGradientBoostingMachine}\NormalTok{(}\DataTypeTok{ntrees =} \DecValTok{5000}\NormalTok{, }\DataTypeTok{maxDepth =} \KeywordTok{c}\NormalTok{(}\DecValTok{4}\NormalTok{, }\DecValTok{7}\NormalTok{, }\DecValTok{10}\NormalTok{), }\DataTypeTok{learnRate =} \KeywordTok{c}\NormalTok{(}\FloatTok{0.001}\NormalTok{, } - \FloatTok{0.01}\NormalTok{, }\FloatTok{0.1}\NormalTok{, }\FloatTok{0.9}\NormalTok{))} +\NormalTok{ gbmModel }\OtherTok{\textless{}{-}} \FunctionTok{setGradientBoostingMachine}\NormalTok{(} + \AttributeTok{ntrees =} \DecValTok{5000}\NormalTok{, } + \AttributeTok{maxDepth =} \FunctionTok{c}\NormalTok{(}\DecValTok{4}\NormalTok{,}\DecValTok{7}\NormalTok{,}\DecValTok{10}\NormalTok{), } + \AttributeTok{learnRate =} 
\FunctionTok{c}\NormalTok{(}\FloatTok{0.001}\NormalTok{,}\FloatTok{0.01}\NormalTok{,}\FloatTok{0.1}\NormalTok{,}\FloatTok{0.9}\NormalTok{)} +\NormalTok{ )} \end{Highlighting} \end{Shaded} -The \texttt{runPlP} function uses the population, \texttt{plpData}, and -model settings to train and evaluate the model. We can use the testSplit -(person/time) and testFraction parameters to split the data in a -75\%-25\% split and run the patient-level prediction pipeline: +The \texttt{runPlP} function requires the \texttt{plpData}, the +\texttt{outcomeId} specifying the outcome being predicted and the +settings: \texttt{populationSettings}, \texttt{splitSettings}, +\texttt{sampleSettings}, \texttt{featureEngineeringSettings}, +\texttt{preprocessSettings} and \texttt{modelSettings} to train and +evaluate the model. \begin{Shaded} \begin{Highlighting}[] -\NormalTok{ gbmResults <-}\StringTok{ }\KeywordTok{runPlp}\NormalTok{(population, plpData, }\DataTypeTok{modelSettings =}\NormalTok{ gbmModel, }\DataTypeTok{testSplit=}\StringTok{'stratified'}\NormalTok{, } - \DataTypeTok{testFraction=}\FloatTok{0.25}\NormalTok{, }\DataTypeTok{nfold=}\DecValTok{2}\NormalTok{, }\DataTypeTok{splitSeed =} \DecValTok{1234}\NormalTok{)} +\NormalTok{ gbmResults }\OtherTok{\textless{}{-}} \FunctionTok{runPlp}\NormalTok{(} + \AttributeTok{plpData =}\NormalTok{ plpData,} + \AttributeTok{outcomeId =} \DecValTok{2}\NormalTok{, } + \AttributeTok{analysisId =} \StringTok{\textquotesingle{}singleDemo2\textquotesingle{}}\NormalTok{,} + \AttributeTok{analysisName =} \StringTok{\textquotesingle{}Demonstration of runPlp for training single PLP models\textquotesingle{}}\NormalTok{,} + \AttributeTok{populationSettings =}\NormalTok{ populationSettings, } + \AttributeTok{splitSettings =}\NormalTok{ splitSettings,} + \AttributeTok{sampleSettings =}\NormalTok{ sampleSettings, } + \AttributeTok{featureEngineeringSettings =}\NormalTok{ featureEngineeringSettings, } + \AttributeTok{preprocessSettings =}\NormalTok{ 
preprocessSettings,} + \AttributeTok{modelSettings =}\NormalTok{ gbmModel,} + \AttributeTok{logSettings =} \FunctionTok{createLogSettings}\NormalTok{(), } + \AttributeTok{executeSettings =} \FunctionTok{createExecuteSettings}\NormalTok{(} + \AttributeTok{runSplitData =}\NormalTok{ T, } + \AttributeTok{runSampleData =}\NormalTok{ T, } + \AttributeTok{runfeatureEngineering =}\NormalTok{ T, } + \AttributeTok{runPreprocessData =}\NormalTok{ T, } + \AttributeTok{runModelDevelopment =}\NormalTok{ T, } + \AttributeTok{runCovariateSummary =}\NormalTok{ T} +\NormalTok{ ), } + \AttributeTok{saveDirectory =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(), }\StringTok{\textquotesingle{}singlePlpExample2\textquotesingle{}}\NormalTok{)} +\NormalTok{ )} \end{Highlighting} \end{Shaded} @@ -1743,15 +1861,11 @@ \subsubsection{Model Development}\label{model-development-1}} the model on the remaining 25\%. A results data structure is returned containing information about the model, its performance etc. -In the runPlp function there are several parameters to save the plpData, -plpResults, plpPlots, evaluation etc. which are all set to True by -default. However, there is also some functionality to this manually. 
- You can save the model using: \begin{Shaded} \begin{Highlighting}[] -\KeywordTok{savePlpModel}\NormalTok{(gbmResults}\OperatorTok{$}\NormalTok{model, }\DataTypeTok{dirPath =} \KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))} + \FunctionTok{savePlpModel}\NormalTok{(gbmResults}\SpecialCharTok{$}\NormalTok{model, }\AttributeTok{dirPath =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))} \end{Highlighting} \end{Shaded} @@ -1759,7 +1873,7 @@ \subsubsection{Model Development}\label{model-development-1}} \begin{Shaded} \begin{Highlighting}[] -\NormalTok{plpModel <-}\StringTok{ }\KeywordTok{loadPlpModel}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))} +\NormalTok{ plpModel }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpModel}\NormalTok{(}\FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}model\textquotesingle{}}\NormalTok{))} \end{Highlighting} \end{Shaded} @@ -1767,7 +1881,7 @@ \subsubsection{Model Development}\label{model-development-1}} \begin{Shaded} \begin{Highlighting}[] -\KeywordTok{savePlpResult}\NormalTok{(gbmResults, }\DataTypeTok{location =} \KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"gbm"}\NormalTok{))} + \FunctionTok{savePlpResult}\NormalTok{(gbmResults, }\AttributeTok{location =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}gbm\textquotesingle{}}\NormalTok{))} \end{Highlighting} \end{Shaded} @@ -1775,7 +1889,7 @@ \subsubsection{Model Development}\label{model-development-1}} \begin{Shaded} \begin{Highlighting}[] -\NormalTok{gbmResults <-}\StringTok{ }\KeywordTok{loadPlpResult}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(), }\StringTok{"gbm"}\NormalTok{))} +\NormalTok{ gbmResults }\OtherTok{\textless{}{-}} 
\FunctionTok{loadPlpResult}\NormalTok{(}\FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}gbm\textquotesingle{}}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

@@ -1799,24 +1913,24 @@ \section{Study package creation}\label{study-package-creation}}

Create a new prediction study and select your target and outcome
cohorts.

-  \includegraphics{atlasplp1.png}
+  \includegraphics{atlasplp1.webp}
\item
  Specify one or more analysis settings.

-  \includegraphics{atlasplp2.png}
+  \includegraphics{atlasplp2.webp}

  \newpage
\item
  Specify the trainings settigns

-  \includegraphics{atlasplp3.png}
+  \includegraphics{atlasplp3.webp}
\item
  Specify the execution settings

-  \includegraphics{atlasplp4.png}
-\end{enumerate}
+  \includegraphics{atlasplp4.webp}

-\newpage
+  \newpage
+\end{enumerate}

ATLAS can build a R package for you that will execute the full study
against you CDM. Below the steps are explained how to do this in ATLAS.
@@ -1829,7 +1943,7 @@ \section{Study package creation}\label{study-package-creation}}

\begin{figure}
\centering
-  \includegraphics{atlasdownload1.png}
+  \includegraphics{atlasdownload1.webp}
\caption{R package download functionality in ATLAS}
\end{figure}
\item
@@ -1838,7 +1952,7 @@ \section{Study package creation}\label{study-package-creation}}

\begin{figure}
\centering
-  \includegraphics{atlasdownload2.png}
+  \includegraphics{atlasdownload2.webp}
\caption{R package download functionality in ATLAS}
\end{figure}
\item
@@ -1862,20 +1976,28 @@ \section{Internal validation}\label{internal-validation}}
your browser in which you can view all performance measures created by
the framework as shown in the figure below.
-\includegraphics{shinysummary.png} +\begin{figure} +\centering +\includegraphics{shinysummary.webp} +\caption{Summary of all the performance measures of the analyses} +\end{figure} Furthermore, many interactive plots are available in the Shiny App, for example the ROC curve in which you can move over the plot to see the threshold and the corresponding sensitivity and specificity values. -\includegraphics{shinyroc.png} +\begin{figure} +\centering +\includegraphics{shinyroc.webp} +\caption{Example of the interactive ROC curve} +\end{figure} To generate and save all the evaluation plots to a folder run the following code: \begin{Shaded} \begin{Highlighting}[] -\KeywordTok{plotPlp}\NormalTok{(lrResults, }\DataTypeTok{dirPath =} \KeywordTok{getwd}\NormalTok{())} +\FunctionTok{plotPlp}\NormalTok{(lrResults, }\AttributeTok{dirPath=}\FunctionTok{getwd}\NormalTok{())} \end{Highlighting} \end{Shaded} @@ -1894,9 +2016,15 @@ \subsection{Discrimination}\label{discrimination}} plot the better the discrimination of the model. The plot is created by changing the probability threshold to assign the positive class. -\includegraphics{sparseRoc.png} +\begin{figure} +\centering +\includegraphics{sparseRoc.webp} +\caption{Receiver Operating Characteristic Plot} +\end{figure} + +\newpage -\newpage \#\# Calibration +\#\# Calibration The calibration plot shows how close the predicted risk is to the observed risk. The diagonal dashed line thus indicates a perfectly @@ -1908,7 +2036,11 @@ \subsection{Discrimination}\label{discrimination}} represented the 95\% lower and upper confidence intervals of the slope of the fitted line. 
-\includegraphics{sparseCalibration.png} +\begin{figure} +\centering +\includegraphics{sparseCalibration.webp} +\caption{Calibration Plot} +\end{figure} \newpage @@ -1932,7 +2064,7 @@ \subsection{Smooth Calibration}\label{smooth-calibration}} \begin{Shaded} \begin{Highlighting}[] -\KeywordTok{plotSmoothCalibration}\NormalTok{(lrResults)} +\FunctionTok{plotSmoothCalibration}\NormalTok{(lrResults)} \end{Highlighting} \end{Shaded} @@ -1944,17 +2076,29 @@ \subsection{Smooth Calibration}\label{smooth-calibration}} not highlight the miss-calibration at the lower predicted probability levels that well. +\begin{figure} +\centering \includegraphics{smoothCalibration.jpeg} +\caption{Smooth Calibration plot} +\end{figure} + +\newpage -\newpage \#\# Preference distribution +\#\# Preference distribution The preference distribution plots are the preference score distributions corresponding to i) people in the test set with the outcome (red) and ii) people in the test set without the outcome (blue). -\includegraphics{preferencePDF.png} +\begin{figure} +\centering +\includegraphics{preferencePDF.webp} +\caption{Preference Plot} +\end{figure} -\newpage \#\# Predicted probability distribution +\newpage + +\#\# Predicted probability distribution The prediction distribution box plots are for the predicted risks of the people in the test set with the outcome (class 1: blue) and without the @@ -1965,9 +2109,15 @@ \subsection{Smooth Calibration}\label{smooth-calibration}} overlap between the two distribution which lead to an imperfect discrimination. 
-\includegraphics{predictionDistribution.png}
+\begin{figure}
+\centering
+\includegraphics{predictionDistribution.webp}
+\caption{Prediction Distribution Box Plot}
+\end{figure}
+
+\newpage

-\newpage \#\# Test-Train similarity
+\#\# Test-Train similarity

The test-train similarity is assessed by plotting the mean covariate
values in the train set against those in the test set for people with
@@ -1976,9 +2126,15 @@ \subsection{Smooth Calibration}\label{smooth-calibration}}
The results for our example of look very promising since the mean values
of the covariates are on the diagonal.

-\includegraphics{generalizability.png}
+\begin{figure}
+\centering
+\includegraphics{generalizability.webp}
+\caption{Similarity plots of train and test set}
+\end{figure}

-\newpage \#\# Variable scatter plot
+\newpage
+
+\#\# Variable scatter plot

The variable scatter plot shows the mean covariate value for the people
with the outcome against the mean covariate value for the people without
@@ -1990,16 +2146,22 @@ \subsection{Smooth Calibration}\label{smooth-calibration}}
The plot shows that the mean of most of the covariates is higher for
subjects with the outcome compared to those without.

-\includegraphics{variableScatterplot.png}
+\begin{figure}
+\centering
+\includegraphics{variableScatterplot.webp}
+\caption{Variable Scatter Plot}
+\end{figure}
+
+\newpage

-\newpage \#\# Precision recall
+\#\# Precision recall

Precision (P) is defined as the number of true positives (Tp) over the
number of true positives plus the number of false positives (Fp).
\begin{Shaded} \begin{Highlighting}[] -\NormalTok{P <-}\StringTok{ }\NormalTok{Tp}\OperatorTok{/}\NormalTok{(Tp }\OperatorTok{+}\StringTok{ }\NormalTok{Fp)} +\NormalTok{P }\OtherTok{\textless{}{-}}\NormalTok{ Tp}\SpecialCharTok{/}\NormalTok{(Tp}\SpecialCharTok{+}\NormalTok{Fp)} \end{Highlighting} \end{Shaded} @@ -2008,7 +2170,7 @@ \subsection{Smooth Calibration}\label{smooth-calibration}} \begin{Shaded} \begin{Highlighting}[] -\NormalTok{R <-}\StringTok{ }\NormalTok{Tp}\OperatorTok{/}\NormalTok{(Tp }\OperatorTok{+}\StringTok{ }\NormalTok{Fn)} +\NormalTok{R }\OtherTok{\textless{}{-}}\NormalTok{ Tp}\SpecialCharTok{/}\NormalTok{(Tp }\SpecialCharTok{+}\NormalTok{ Fn)} \end{Highlighting} \end{Shaded} @@ -2017,7 +2179,7 @@ \subsection{Smooth Calibration}\label{smooth-calibration}} \begin{Shaded} \begin{Highlighting}[] -\NormalTok{F1 <-}\StringTok{ }\DecValTok{2} \OperatorTok{*}\StringTok{ }\NormalTok{P }\OperatorTok{*}\StringTok{ }\NormalTok{R}\OperatorTok{/}\NormalTok{(P }\OperatorTok{+}\StringTok{ }\NormalTok{R)} +\NormalTok{F1 }\OtherTok{\textless{}{-}} \DecValTok{2}\SpecialCharTok{*}\NormalTok{P}\SpecialCharTok{*}\NormalTok{R}\SpecialCharTok{/}\NormalTok{(P}\SpecialCharTok{+}\NormalTok{R)} \end{Highlighting} \end{Shaded} @@ -2035,9 +2197,15 @@ \subsection{Smooth Calibration}\label{smooth-calibration}} It is also possible that lowering the threshold may leave recall unchanged, while the precision fluctuates. -\includegraphics{precisionRecall.png} +\begin{figure} +\centering +\includegraphics{precisionRecall.webp} +\caption{Precision Recall Plot} +\end{figure} + +\newpage -\newpage \#\# Demographic summary +\#\# Demographic summary This plot shows for females and males the expected and observed risk in different age groups together with a confidence area. @@ -2045,60 +2213,36 @@ \subsection{Smooth Calibration}\label{smooth-calibration}} The results show that our model is well calibrated across gender and age groups. 
-\includegraphics{demographicSummary.png} +\begin{figure} +\centering +\includegraphics{demographicSummary.webp} +\caption{Demographic Summary Plot} +\end{figure} -\newpage \# External validation +\newpage + +\# External validation We recommend to always perform external validation, i.e.~apply the final model on as much new datasets as feasible and evaluate its performance. \begin{Shaded} \begin{Highlighting}[] -\CommentTok{# load the trained model} -\NormalTok{plpModel <-}\StringTok{ }\KeywordTok{loadPlpModel}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(),}\StringTok{'model'}\NormalTok{)} - -\CommentTok{#load the new plpData and create the population} -\NormalTok{plpData <-}\StringTok{ }\KeywordTok{loadPlpData}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(),}\StringTok{'data'}\NormalTok{)} -\NormalTok{population <-}\StringTok{ }\KeywordTok{createStudyPopulation}\NormalTok{(plpData, } - \DataTypeTok{outcomeId =} \DecValTok{2}\NormalTok{, } - \DataTypeTok{includeAllOutcomes =} \OtherTok{TRUE}\NormalTok{, } - \DataTypeTok{firstExposureOnly =} \OtherTok{TRUE}\NormalTok{, } - \DataTypeTok{washoutPeriod =} \DecValTok{365}\NormalTok{, } - \DataTypeTok{removeSubjectsWithPriorOutcome =} \OtherTok{TRUE}\NormalTok{, } - \DataTypeTok{priorOutcomeLookback =} \DecValTok{365}\NormalTok{,} - \DataTypeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,} - \DataTypeTok{requireTimeAtRisk =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{riskWindowEnd =} \DecValTok{365} -\NormalTok{)} - -\CommentTok{# apply the trained model on the new data} -\NormalTok{validationResults <-}\StringTok{ }\KeywordTok{applyModel}\NormalTok{(population,plpData,plpModel)} -\end{Highlighting} -\end{Shaded} - -To make things easier we also provide a function for performing external -validation of a model across one or multiple datasets: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# load the trained model} -\NormalTok{plpResult <-}\StringTok{ 
}\KeywordTok{loadPlpResult}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(),}\StringTok{'plpResult'}\NormalTok{)} - -\NormalTok{connectionDetails <-}\StringTok{ }\KeywordTok{createConnectionDetails}\NormalTok{(}\DataTypeTok{dbms =} \StringTok{"postgresql"}\NormalTok{, } - \DataTypeTok{server =} \StringTok{"localhost/ohdsi"}\NormalTok{, } - \DataTypeTok{user =} \StringTok{"joe"}\NormalTok{, } - \DataTypeTok{password =} \StringTok{"supersecret"}\NormalTok{)} - -\NormalTok{validation <-}\StringTok{ }\KeywordTok{externalValidatePlp}\NormalTok{(}\DataTypeTok{plpResult =}\NormalTok{ plpResult, } - \DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \DataTypeTok{validationSchemaTarget =} \StringTok{'new_cohort_schema'}\NormalTok{,} - \DataTypeTok{validationSchemaOutcome =} \StringTok{'new_cohort_schema'}\NormalTok{,} - \DataTypeTok{validationSchemaCdm =} \StringTok{'new_cdm_schema'}\NormalTok{, } - \DataTypeTok{validationTableTarget =} \StringTok{'cohort_table'}\NormalTok{,} - \DataTypeTok{validationTableOutcome =} \StringTok{'cohort_table'}\NormalTok{, } - \DataTypeTok{validationIdTarget =} \StringTok{'cohort_id'}\NormalTok{, } - \DataTypeTok{validationIdOutcome =} \StringTok{'outcome_id'}\NormalTok{, } - \DataTypeTok{keepPrediction =}\NormalTok{ T} +\CommentTok{\# load the trained model} +\NormalTok{plpModel }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpModel}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}model\textquotesingle{}}\NormalTok{)} + +\CommentTok{\# add details of new database} +\NormalTok{validationDatabaseDetails }\OtherTok{\textless{}{-}} \FunctionTok{createDatabaseDetails}\NormalTok{()} + +\CommentTok{\# to externally validate the model and perform recalibration run:} +\FunctionTok{externalValidateDbPlp}\NormalTok{(} + \AttributeTok{plpModel =}\NormalTok{ plpModel,} + \AttributeTok{validationDatabaseDetails =}\NormalTok{ validationDatabaseDetails,} + \AttributeTok{validationRestrictPlpDataSettings =}\NormalTok{ 
plpModel}\SpecialCharTok{$}\NormalTok{settings}\SpecialCharTok{$}\NormalTok{plpDataSettings,} + \AttributeTok{settings =} \FunctionTok{createValidationSettings}\NormalTok{(} + \AttributeTok{recalibrate =} \StringTok{\textquotesingle{}weakRecalibration\textquotesingle{}} +\NormalTok{ ),} + \AttributeTok{outputFolder =} \FunctionTok{getwd}\NormalTok{()} \NormalTok{)} \end{Highlighting} \end{Shaded} @@ -2106,75 +2250,12 @@ \subsection{Smooth Calibration}\label{smooth-calibration}} This will extract the new plpData from the specified schemas and cohort tables. It will then apply the same population settings and the trained plp model. Finally, it will evaluate the performance and return the -standard output as \texttt{validation\$performance} and if -keepPrediction is TRUE then it will also return the prediction on the -population as \texttt{validation\$prediction}. They can be inserted into -the shiny app for viewing the model and validation by running: +standard output as \texttt{validation\$performanceEvaluation} and it +will also return the prediction on the population as +\texttt{validation\$prediction}. They can be inserted into the shiny app +for viewing the model and validation by running: \texttt{viewPlp(runPlp=plpResult,\ validatePlp=validation\ )}. 
-If you want to validate on multiple databases available you can insert -the new schemas and cohort tables as a list: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# load the trained model} -\NormalTok{plpResult <-}\StringTok{ }\KeywordTok{loadPlpResult}\NormalTok{(}\KeywordTok{getwd}\NormalTok{(),}\StringTok{'plpResult'}\NormalTok{)} - -\NormalTok{connectionDetails <-}\StringTok{ }\KeywordTok{createConnectionDetails}\NormalTok{(}\DataTypeTok{dbms =} \StringTok{"postgresql"}\NormalTok{, } - \DataTypeTok{server =} \StringTok{"localhost/ohdsi"}\NormalTok{, } - \DataTypeTok{user =} \StringTok{"joe"}\NormalTok{, } - \DataTypeTok{password =} \StringTok{"supersecret"}\NormalTok{)} - -\NormalTok{validation <-}\StringTok{ }\KeywordTok{externalValidatePlp}\NormalTok{(}\DataTypeTok{plpResult =}\NormalTok{ plpResult, } - \DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \DataTypeTok{validationSchemaTarget =} \KeywordTok{list}\NormalTok{(}\StringTok{'new_cohort_schema1'}\NormalTok{,} - \StringTok{'new_cohort_schema2'}\NormalTok{),} - \DataTypeTok{validationSchemaOutcome =} \KeywordTok{list}\NormalTok{(}\StringTok{'new_cohort_schema1'}\NormalTok{,} - \StringTok{'new_cohort_schema2'}\NormalTok{),} - \DataTypeTok{validationSchemaCdm =} \KeywordTok{list}\NormalTok{(}\StringTok{'new_cdm_schema1'}\NormalTok{,} - \StringTok{'new_cdm_schema2'}\NormalTok{), } - \DataTypeTok{validationTableTarget =} \KeywordTok{list}\NormalTok{(}\StringTok{'new_cohort_table1'}\NormalTok{,} - \StringTok{'new_cohort_table2'}\NormalTok{),} - \DataTypeTok{validationTableOutcome =} \KeywordTok{list}\NormalTok{(}\StringTok{'new_cohort_table1'}\NormalTok{,} - \StringTok{'new_cohort_table2'}\NormalTok{),} - \DataTypeTok{validationIdTarget =} \StringTok{'cohort_id'}\NormalTok{, } - \DataTypeTok{validationIdOutcome =} \StringTok{'outcome_id'}\NormalTok{, } - \DataTypeTok{keepPrediction =}\NormalTok{ T} -\NormalTok{)} -\end{Highlighting} -\end{Shaded} - 
-\hypertarget{journal-paper-generation}{% -\section{Journal paper generation}\label{journal-paper-generation}} - -We have added functionality to automatically generate a word document -you can use as start of a journal paper. It contains many of the -generated study details and results. If you have performed external -validation these results will can be added as well. Optionally, you can -add a ``Table 1'' that contains data on many covariates for the target -population. - -You can create the draft journal paper by running this function: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{createPlpJournalDocument}\NormalTok{(}\DataTypeTok{plpResult =} \OperatorTok{<}\NormalTok{your plp results}\OperatorTok{>}\NormalTok{, } - \DataTypeTok{plpValidation =} \OperatorTok{<}\NormalTok{your validation results}\OperatorTok{>}\NormalTok{,} - \DataTypeTok{plpData =} \OperatorTok{<}\NormalTok{your plp data}\OperatorTok{>}\NormalTok{, } - \DataTypeTok{targetName =} \StringTok{""}\NormalTok{,} - \DataTypeTok{outcomeName =} \StringTok{""}\NormalTok{, } - \DataTypeTok{table1 =}\NormalTok{ F, } - \DataTypeTok{connectionDetails =} \OtherTok{NULL}\NormalTok{,} - \DataTypeTok{includeTrain =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{includeTest =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{includePredictionPicture =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{includeAttritionPlot =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{outputLocation =} \StringTok{""}\NormalTok{)}\ErrorTok{)} -\end{Highlighting} -\end{Shaded} - -For more details see the help page of the function. - \newpage \hypertarget{other-functionality}{% @@ -2184,69 +2265,49 @@ \section{Other functionality}\label{other-functionality}} and contributions have been made my many persons in the OHDSI community. 
The table below provides an overview: -\begin{longtable}[]{@{}lll@{}} -\toprule -\begin{minipage}[b]{0.18\columnwidth}\raggedright -Functionality\strut -\end{minipage} & \begin{minipage}[b]{0.55\columnwidth}\raggedright -Description\strut -\end{minipage} & \begin{minipage}[b]{0.18\columnwidth}\raggedright -Vignette\strut -\end{minipage}\tabularnewline -\midrule +\begin{longtable}[]{@{} + >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2361}} + >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.5278}} + >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2361}}@{}} +\toprule\noalign{} +\begin{minipage}[b]{\linewidth}\raggedright +Functionality +\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright +Description +\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright +Vignette +\end{minipage} \\ +\midrule\noalign{} \endhead -\begin{minipage}[t]{0.18\columnwidth}\raggedright -Builing Multiple Models\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -This vignette describes how you can run multiple models -automatically\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\raggedright -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingMultiplePredictiveModels.pdf}{\texttt{Vignette}}\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.18\columnwidth}\raggedright -Custom algorithms\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -This vignette describes how you can add your own custom algorithms in -the framework\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\raggedright -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomAlgorithms.pdf}{\texttt{Vignette}}\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.18\columnwidth}\raggedright -Ensemble models\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -This vignette 
describes how you can use the framework to build ensemble -models, i.e combine multiple models in a super learner\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\raggedright -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingEnsembleModels.pdf}{\texttt{Vignette}}\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.18\columnwidth}\raggedright -Deep Learning Models\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -We have added extensive functionality for Deep Learning including -several architectures in both pyTorch and Keras. These algorithms can be -trained using GPU power\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\raggedright -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingDeepLearningModels.pdf}{\texttt{Vignette}}\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.18\columnwidth}\raggedright -Learning curves\strut -\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright -Learning curves assess the effect of training set size on model -performance by training a sequence of prediction models on successively -larger subsets of the training set. 
A learning curve plot can also help
-in diagnosing a bias or variance problem as explained below.\strut
-\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\raggedright
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/GeneratingLearningCurves.pdf}{\texttt{Vignette}}\strut
-\end{minipage}\tabularnewline
-\begin{minipage}[t]{0.18\columnwidth}\raggedright
-Implementing existing models\strut
-\end{minipage} & \begin{minipage}[t]{0.55\columnwidth}\raggedright
-This vignette describes how you can implement existing logistic
-regression models in the framework, e.g.~as found in literature\strut
-\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\raggedright
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/ImplementingExistingModels.pdf}{\texttt{Vignette}}\strut
-\end{minipage}\tabularnewline
-\bottomrule
+\bottomrule\noalign{}
+\endlastfoot
+Building Multiple Models & This vignette describes how you can run
+multiple models automatically &
+\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingMultiplePredictiveModels.pdf}{\texttt{Vignette}} \\
+Custom Models & This vignette describes how you can add your own custom
+algorithms in the framework &
+\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomModels.pdf}{\texttt{Vignette}} \\
+Custom Splitting Functions & This vignette describes how you can add
+your own custom training/validation/testing splitting functions in the
+framework &
+\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSplitting.pdf}{\texttt{Vignette}} \\
+Custom Sampling Functions & This vignette describes how you can add your
+own custom sampling functions in the framework &
+\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSamples.pdf}{\texttt{Vignette}} \\
+Custom Feature Engineering/Selection & This vignette describes how you
+can add your own custom feature engineering and selection functions in
+the framework &
+\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomFeatureEngineering.pdf}{\texttt{Vignette}} \\
+Ensemble models & This vignette describes how you can use the framework
+to build ensemble models, i.e combine multiple models in a super learner
+&
+\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingEnsembleModels.pdf}{\texttt{Vignette}} \\
+Learning curves & Learning curves assess the effect of training set size
+on model performance by training a sequence of prediction models on
+successively larger subsets of the training set. A learning curve plot
+can also help in diagnosing a bias or variance problem as explained
+below. &
+\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/GeneratingLearningCurves.pdf}{\texttt{Vignette}} \\
\end{longtable}

\hypertarget{demos}{%
@@ -2256,11 +2317,11 @@ \section{Demos}\label{demos}}

\begin{Shaded}
\begin{Highlighting}[]
-\CommentTok{# Show all demos in our package: }
-\KeywordTok{demo}\NormalTok{(}\DataTypeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)}
+\CommentTok{\# Show all demos in our package: }
+\FunctionTok{demo}\NormalTok{(}\AttributeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)}

-\CommentTok{# For example, to run the SingleModelDemo that runs Lasso and shows you how to run the Shiny App use this call}
-\KeywordTok{demo}\NormalTok{(}\StringTok{"SingleModelDemo"}\NormalTok{, }\DataTypeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)}
+\CommentTok{\# For example, to run the SingleModelDemo that runs Lasso and shows you how to run the Shiny App use this call}
+\FunctionTok{demo}\NormalTok{(}\StringTok{"SingleModelDemo"}\NormalTok{, }\AttributeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

@@ -2274,19 +2335,18 @@ \section{Acknowledgments}\label{acknowledgments}}

\begin{Shaded}
\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} +\FunctionTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} \end{Highlighting} \end{Shaded} \begin{verbatim} -## ## To cite PatientLevelPrediction in publications use: ## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and -## implementation of a standardized framework to generate and evaluate patient-level -## prediction models using observational healthcare data." _Journal of the American -## Medical Informatics Association_, *25*(8), 969-975. . +## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and +## implementation of a standardized framework to generate and evaluate patient-level +## prediction models using observational healthcare data." _Journal of the American +## Medical Informatics Association_, *25*(8), 969-975. +## . ## ## A BibTeX entry for LaTeX users is ## @@ -2307,18 +2367,17 @@ \section{Acknowledgments}\label{acknowledgments}} \begin{Shaded} \begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"Cyclops"}\NormalTok{)} +\FunctionTok{citation}\NormalTok{(}\StringTok{"Cyclops"}\NormalTok{)} \end{Highlighting} \end{Shaded} \begin{verbatim} -## ## To cite Cyclops in publications use: ## -## Suchard MA, Simpson SE, Zorych I, Ryan P, Madigan D (2013). "Massive -## parallelization of serial inference algorithms for complex generalized linear -## models." _ACM Transactions on Modeling and Computer Simulation_, *23*, 10. . +## Suchard MA, Simpson SE, Zorych I, Ryan P, Madigan D (2013). "Massive parallelization of +## serial inference algorithms for complex generalized linear models." _ACM Transactions +## on Modeling and Computer Simulation_, *23*, 10. +## . 
## ## A BibTeX entry for LaTeX users is ## @@ -2329,7 +2388,7 @@ \section{Acknowledgments}\label{acknowledgments}} ## volume = {23}, ## pages = {10}, ## year = {2013}, -## url = {http://dl.acm.org/citation.cfm?id=2414791}, +## url = {https://dl.acm.org/doi/10.1145/2414416.2414791}, ## } \end{verbatim} @@ -2365,20 +2424,23 @@ \section*{Appendix 1: Study population settings \item Require minimum time-at-risk for all person in the target cohort - \includegraphics{popdef1.png} + \includegraphics{popdef1.webp} \item Require minumum time-at-risk for target cohort, except for persons with outcomes during time-at-risk. - \includegraphics{popdef2.png} + \includegraphics{popdef2.webp} \end{enumerate} -\newpage 3) +\newpage +3 + +) Include all persons in the target cohort exclude persons with prior outcomes -\includegraphics{popdef3.png} +\includegraphics{popdef3.webp} \begin{enumerate} \def\labelenumi{\arabic{enumi})} @@ -2387,23 +2449,25 @@ \section*{Appendix 1: Study population settings Require minimum time-at-risk for target cohort, except for persons with outcomes during time-at-risk, exclude persons with prior outcomes - \includegraphics{popdef4.png} + \includegraphics{popdef4.webp} \end{enumerate} \newpage +5 + +) + +Include all persons in target cohort exclude persons with prior outcomes + +\includegraphics{popdef5.webp} \begin{enumerate} \def\labelenumi{\arabic{enumi})} -\setcounter{enumi}{4} -\item - Include all persons in target cohort exclude persons with prior - outcomes - - \includegraphics{popdef5.png} +\setcounter{enumi}{5} \item Include all persons in target cohort - \includegraphics{popdef6.png} + \includegraphics{popdef6.webp} \end{enumerate} \end{document} diff --git a/inst/doc/CreatingLearningCurves.tex b/inst/doc/CreatingLearningCurves.tex deleted file mode 100644 index 79283a598..000000000 --- a/inst/doc/CreatingLearningCurves.tex +++ /dev/null @@ -1,408 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} 
-\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Creating Learning Curves}, - pdfauthor={Luis H. John, Jenna M. Reps, Peter R. 
Rijnbeek}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} 
-\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother -\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{-\maxdimen} % remove section numbering -\usepackage{fancyhdr} -\pagestyle{fancy} -\fancyhead{} -\fancyhead[CO,CE]{Generating Learning Curves} -\fancyfoot[LE,RO]{\thepage} -\renewcommand{\headrulewidth}{0.4pt} -\renewcommand{\footrulewidth}{0.4pt} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 4.0.6} - -\title{Creating Learning Curves} -\author{Luis H. John, Jenna M. Reps, Peter R. 
Rijnbeek} -\date{2020-08-19} - -\begin{document} -\maketitle - -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -This vignette describes how you can use the Observational Health Data -Sciences and Informatics (OHDSI) -\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}} -package to create learning curves. This vignette assumes you have read -and are comfortable with building patient level prediction models as -described in the -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels} -vignette}. - -Prediction models will show overly-optimistic performance when -predicting on the same data as used for training. Therefore, -best-practice is to partition our data into a training set and testing -set. We then train our prediction model on the training set portion and -asses its ability to generalize to unseen data by measuring its -performance on the testing set. - -Learning curves assess the effect of training set size on model -performance by training a sequence of prediction models on successively -larger subsets of the training set. A learning curve plot can also help -in diagnosing a bias or variance problem as explained below. - -\begin{figure} -\centering -\includegraphics{learningCurve.png} -\caption{Learning curve example.} -\end{figure} - -Figure 1, shows an example of learning curve plot in which the vertical -axis represents the model performance and the horizontal axis the -training set size. If training set size is small, the performance on the -training set is high, because a model can often be fitted well to a -limited number of training examples. At the same time, the performance -on the testing set will be poor, because the model trained on such a -limited number of training examples will not generalize well to unseen -data in the testing set. 
As the training set size increases, the -performance of the model on the training set will decrease. It becomes -more difficult for the model to find a good fit through all the training -examples. Also, the model will be trained on a more representative -portion of training examples, making it generalize better to unseen -data. This can be observed by the increasin testing set performance. - -The learning curve can help us in diagnosing bias and variance problems -with our classifier which will provide guidance on how to further -improve our model. We can observe high variance (overfitting) in a -prediction model if it performs well on the training set, but poorly on -the testing set (Figure 2). Adding additional data is a common approach -to counteract high variance. From the learning curve it becomes -apparent, that adding additional data may improve performance on the -testing set a little further, as the learning curve has not yet -plateaued and, thus, the model is not saturated yet. Therefore, adding -more data will decrease the gap between training set and testing set, -which is the main indicator for a high variance problem. - -\begin{figure} -\centering -\includegraphics{learningCurveVariance.png} -\caption{Prediction model suffering from high variance.} -\end{figure} - -Furthermore, we can observe high bias (underfitting) if a prediction -model performs poorly on the training set as well as on the testing set -(Figure 3). The learning curves of training set and testing set have -flattened on a low performance with only a small gap in between them. -Adding additional data will in this case have little to no impact on the -model performance. Choosing another prediction algorithm that can find -more complex (for example non-linear) relationships in the data may be -an alternative approach to consider in this high bias situation. 
- -\begin{figure} -\centering -\includegraphics{learningCurveBias.png} -\caption{Prediction model suffering from high bias.} -\end{figure} - -\hypertarget{creating-the-learning-curve}{% -\section{Creating the learning -curve}\label{creating-the-learning-curve}} - -Use the -\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}} -package to create a \texttt{population} and \texttt{plpData} object. -Alternatively, you can make use of the data simulator. The following -code snippet creates a population of 12000 patients. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{)} -\KeywordTok{data}\NormalTok{(plpDataSimulationProfile)} -\NormalTok{sampleSize <-}\StringTok{ }\DecValTok{12000} -\NormalTok{plpData <-}\StringTok{ }\KeywordTok{simulatePlpData}\NormalTok{(} -\NormalTok{ plpDataSimulationProfile,} - \DataTypeTok{n =}\NormalTok{ sampleSize} -\NormalTok{)} - -\NormalTok{population <-}\StringTok{ }\KeywordTok{createStudyPopulation}\NormalTok{(} -\NormalTok{ plpData,} - \DataTypeTok{outcomeId =} \DecValTok{2}\NormalTok{,} - \DataTypeTok{binary =} \OtherTok{TRUE}\NormalTok{,} - \DataTypeTok{firstExposureOnly =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{washoutPeriod =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{removeSubjectsWithPriorOutcome =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{priorOutcomeLookback =} \DecValTok{99999}\NormalTok{,} - \DataTypeTok{requireTimeAtRisk =} \OtherTok{FALSE}\NormalTok{,} - \DataTypeTok{minTimeAtRisk =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{riskWindowStart =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"INFO"} -\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Specify the prediction algorithm to be used. 
- -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# Use LASSO logistic regression} -\NormalTok{modelSettings <-}\StringTok{ }\KeywordTok{setLassoLogisticRegression}\NormalTok{()} -\end{Highlighting} -\end{Shaded} - -Specify a test fraction and a sequence of training set fractions. -Alternatively, you can provide a sequence of training events instead of -the training set fractions. This is recommended, because events are -determinant of model performance. Make sure that your training set can -provide the number of events specified. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{testFraction <-}\StringTok{ }\FloatTok{0.2} -\NormalTok{trainFractions <-}\StringTok{ }\KeywordTok{seq}\NormalTok{(}\FloatTok{0.1}\NormalTok{, }\FloatTok{0.8}\NormalTok{, }\FloatTok{0.1}\NormalTok{) }\CommentTok{# Create eight training set fractions} - -\CommentTok{# alternatively use a sequence of training events by uncommenting the line below.} -\CommentTok{# trainEvents <- seq(100, 5000, 100)} -\end{Highlighting} -\end{Shaded} - -Specify the test split to be used. - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# Use a split by person, alternatively a time split is possible} -\NormalTok{testSplit <-}\StringTok{ 'stratified'} -\end{Highlighting} -\end{Shaded} - -Create the learning curve object. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{learningCurve <-}\StringTok{ }\KeywordTok{createLearningCurve}\NormalTok{(population,} - \DataTypeTok{plpData =}\NormalTok{ plpData,} - \DataTypeTok{modelSettings =}\NormalTok{ modelSettings,} - \DataTypeTok{testFraction =} \FloatTok{0.2}\NormalTok{,} - \DataTypeTok{verbosity =} \StringTok{"TRACE"}\NormalTok{,} - \DataTypeTok{trainFractions =}\NormalTok{ trainFractions,} - \CommentTok{# trainEvents = trainEvents,} - \DataTypeTok{splitSeed =} \DecValTok{1000}\NormalTok{,} - \DataTypeTok{saveModel =} \OtherTok{TRUE}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Plot the learning curve object (Figure 4). 
Specify one of the available -metrics: \texttt{AUROC}, \texttt{AUPRC}, \texttt{sBrier}. Moreover, you -can specify what metric to put on the abscissa, number of -\texttt{observations} or number of \texttt{events}. We recommend the -latter, because \texttt{events} are determinant of model performance and -allow you to better compare learning curves across different prediction -problems and databases. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{plotLearningCurve}\NormalTok{(} -\NormalTok{ learningCurve,} - \DataTypeTok{metric =} \StringTok{'AUROC'}\NormalTok{,} - \DataTypeTok{abscissa =} \StringTok{'events'}\NormalTok{,} - \DataTypeTok{plotTitle =} \StringTok{'Learning Curve'}\NormalTok{,} - \DataTypeTok{plotSubtitle =} \StringTok{'AUROC performance'} -\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{figure} -\centering -\includegraphics{learningCurvePlot.png} -\caption{Learning curve plot.} -\end{figure} - -\hypertarget{parallel-processing}{% -\section{Parallel processing}\label{parallel-processing}} - -The learning curve object can be created in parallel, which can reduce -computation time significantly. Currently this functionality is only -available for LASSO logistic regression and gradient boosting machines. -Depending on the number of parallel workers it may require a significant -amount of memory. We advise to use the parallelized learning curve -function for parameter search and exploratory data analysis. - -Use the parallelized version of the learning curve function to create -the learning curve object in parallel. R will find the number of -available processing cores automatically and register the required -parallel backend. Alternatively, you can provide the number of cores you -wish to use. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{learningCurvePar <-}\StringTok{ }\KeywordTok{createLearningCurvePar}\NormalTok{(} -\NormalTok{ population,} - \DataTypeTok{plpData =}\NormalTok{ plpData,} - \DataTypeTok{modelSettings =}\NormalTok{ modelSettings,} - \DataTypeTok{testSplit =}\NormalTok{ testSplit,} - \DataTypeTok{testFraction =}\NormalTok{ testFraction,} - \DataTypeTok{trainEvents =}\NormalTok{ trainEvents,} - \DataTypeTok{cores =} \DecValTok{4}\NormalTok{,} - \DataTypeTok{splitSeed =} \DecValTok{1000} -\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{demo}{% -\section{Demo}\label{demo}} - -We have added a demo of the learningcurve: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{# Show all demos in our package: } - \KeywordTok{demo}\NormalTok{(}\DataTypeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)} - -\CommentTok{# Run the learning curve} - \KeywordTok{demo}\NormalTok{(}\StringTok{"LearningCurveDemo"}\NormalTok{, }\DataTypeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Do note that running this demo can take a considerable amount of time -(15 min on Quad core running in parallel)! - -\hypertarget{publication}{% -\section{Publication}\label{publication}} - -A publication titled `How little data do we need for patient-level -prediction?' uses the learning curve functionality in this package and -can be accessed as preprint in the arXiv archives at -\url{https://arxiv.org/abs/2008.07361}. - -\hypertarget{acknowledgments}{% -\section{Acknowledgments}\label{acknowledgments}} - -Considerable work has been dedicated to provide the -\texttt{PatientLevelPrediction} package. 
- -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## -## To cite PatientLevelPrediction in publications use: -## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and implementation of a standardized framework -## to generate and evaluate patient-level prediction models using observational healthcare data." _Journal of the -## American Medical Informatics Association_, *25*(8), 969-975. . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. 
-2018;25(8):969-975.} - -\end{document} diff --git a/inst/doc/CreatingNetworkstudies.tex b/inst/doc/CreatingNetworkstudies.tex deleted file mode 100644 index 62ec2f7e2..000000000 --- a/inst/doc/CreatingNetworkstudies.tex +++ /dev/null @@ -1,420 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Making patient-level predictive network study packages}, - pdfauthor={Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. 
Rijnbeek}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} 
-\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{longtable,booktabs} -% Correct order of tables after \paragraph or \subparagraph -\usepackage{etoolbox} -\makeatletter -\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{} -\makeatother -% Allow footnotes in longtable head/foot -\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}} -\makesavenoteenv{longtable} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother -\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{5} - -\title{Making patient-level predictive network study packages} -\author{Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. 
Rijnbeek} -\date{2020-06-03} - -\begin{document} -\maketitle - -{ -\setcounter{tocdepth}{2} -\tableofcontents -} -\newpage - -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -The OHDSI Patient Level Prediction (PLP) package provides the framework -to implement prediction models at scale. This can range from developing -a large number of models across sites (methodology and study design -insight) to extensive external validation of existing models in the -OHDSI PLP framework (model insight). This vignette describes how you can -use the \texttt{PatientLevelPrediction} package to create a network -study package. - -\hypertarget{running-a-network-study-process}{% -\section{Running a Network study -Process}\label{running-a-network-study-process}} - -\hypertarget{step-1-developing-the-study}{% -\subsection{Step 1 -- developing the -study}\label{step-1-developing-the-study}} - -\begin{itemize} -\tightlist -\item - Design the study: target/outcome cohort logic, concept sets for - medical definitions, settings for developing new model or validation - of adding existing models to framework. Suggestion: look in literature - for validated definitions. -\item - Write a protocol that motivates the study and provides full details - (sufficient for people to replicate the study in the future). -\item - Write an R package for implementing the study across diverse - computational environments {[}see guidance below for structure of - package and use the skeleton github package here: \ldots{} {]} -\end{itemize} - -\hypertarget{step-2-implementing-the-study-part-1}{% -\subsection{Step 2 -- implementing the study part -1}\label{step-2-implementing-the-study-part-1}} - -\begin{itemize} -\tightlist -\item - Get contributors to install the package and dependencies. Ensure the - package is installed correctly by running the checkInstall functions. -\item - Get contributors to run the createCohort function to inspect the - target/outcome definitions. 
If the definitions are not suitable for a - site, go back to step 1 and revise the cohort definitions. -\end{itemize} - -\hypertarget{step-3-implementing-the-study-part-2-make-sure-package-checks-outputs-the-package-is-functioning-as-planned-and-the-definitions-are-valid-across-sites}{% -\subsection{Step 3 -- implementing the study part 2 {[}make sure package -checks outputs the package is functioning as planned and the definitions -are valid across -sites{]}}\label{step-3-implementing-the-study-part-2-make-sure-package-checks-outputs-the-package-is-functioning-as-planned-and-the-definitions-are-valid-across-sites}} - -\begin{itemize} -\tightlist -\item - Get contributors to run the main.R with the settings configured to - their environment -\item - Get the contributors to submit the results -\end{itemize} - -\hypertarget{step-4-publication}{% -\subsection{Step 4 -- Publication}\label{step-4-publication}} - -\begin{itemize} -\tightlist -\item - The study creator has the first option to be first author, if he/she - does not wish to be first author then he/she can pick the most - suitable person from the contributors. All contributors will be listed - as authors on the paper. The last author will be the person who - lead/managed the study, if this was the first author then the first - author can pick the most suitable last author. All authors between the - first and last author will be alphabetical by last name. 
-\end{itemize} - -\hypertarget{package-skeleton---file-structure}{% -\section{Package Skeleton - File -Structure}\label{package-skeleton---file-structure}} - -\begin{itemize} -\tightlist -\item - DESCRIPTION -- This file describes the R package and the dependencies -\item - NAMESPACE -- This file is created automatically by Roxygen -\item - Readme.md -- This file should provide the step by step guidance on - implementing the package -\item - R -\item - helpers.r -- all the custom functions used by the package should be in - this file (e.g., checkInstall) -\item - main.r -- this file will call the functions in helpers.r to execute - the full study -\item - submit.r -- this file will be called at the end the submit the - compressed folder to the study creator/manager. -\item - Man -- this folder will contain the documentation for the functions in - helpers.r (this should be automatically generated by roxygen) -\item - Inst -\item - sql/sql\_sever - - \begin{itemize} - \tightlist - \item - targetCohort -- the target cohort parameterised sql code - \item - outcomeCohort -- the outcome cohort parameterised sql code - \end{itemize} -\item - extdata -- place any data required for the package here -\item - plp\_models -- place any PLP models here -\item - existing\_models -- place the files for existing models here -\item - Extras -\end{itemize} - -\hypertarget{package-skeleton---output-of-running-package}{% -\section{Package Skeleton - Output of Running -Package}\label{package-skeleton---output-of-running-package}} - -The output should contain three folders inside the study directory such -as -\texttt{outputLoc\ \textless{}-\ (file.path(getwd(),\ paste0(studyName\_database\_date)))}: -* Plots -- containing the test/train or validation ROC plot, calibration -plot, precision recall plot and optionally the demographic calibration -plot. 
* Results -- The output of running savePlpResult * Summary -- a -summary csv of performance and the table 1 csv - -Then there should also be a zip file of the folder in the working -directory containing the same folders and files but with sensitive -results removed (this will be created using the packageResults -function). Once the contributor has inspected the zipped file and is -happy with the content being shared, he/she can then finally run the -submit function with the details provided in the readme.md. - -\hypertarget{example-code-to-make-package-for-external-validation-of-plp-model}{% -\section{Example Code To Make Package For External Validation of PLP -Model}\label{example-code-to-make-package-for-external-validation-of-plp-model}} - -First you need to make a copy of the PatientLevelPrediciton skeleton -package found here: - -Assuming you ran a sucessful PatientLevelPrediction model development -and saved the output of \texttt{runPlp()} to to location `goodModel' in -your working directory then: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{library}\NormalTok{(PatientLevelPrediction)} -\NormalTok{plpResult <-}\StringTok{ }\KeywordTok{loadPlpResult}\NormalTok{(}\StringTok{"goodModel"}\NormalTok{)} - -\CommentTok{# add the model to the skeleton package with sensitive information removed} -\KeywordTok{exportPlpResult}\NormalTok{(}\DataTypeTok{plpResult =}\NormalTok{ plpResult, }\DataTypeTok{modelName =} \StringTok{"Model Name"}\NormalTok{, }\DataTypeTok{packageName =} \StringTok{"Your Package Name"}\NormalTok{, } - \DataTypeTok{gitHubLocation =} \StringTok{"location/of/github"}\NormalTok{, }\DataTypeTok{includeEvaluationStatistics =}\NormalTok{ T, }\DataTypeTok{includeThresholdSummary =}\NormalTok{ T, } - \DataTypeTok{includeDemographicSummary =}\NormalTok{ T, }\DataTypeTok{includeCalibrationSummary =}\NormalTok{ T, }\DataTypeTok{includePredictionDistribution =}\NormalTok{ T, } - \DataTypeTok{includeCovariateSummary =}\NormalTok{ F)} 
-\end{Highlighting} -\end{Shaded} - -Now you want to add the cohorts (generally the parameterized sql -required to create one or more target and outcome cohorts). This should -be added into the inst/sql/sql\_server directory of your package. If you -are using atlas to create the cohorts then you can use: -\texttt{OhdsiRTools::insertCirceDefinitionInPackage()}. The settings for -the cohort creation are defined in the inst/extdata directory in the -file cohort\_details.csv. this file contains two columns: cohortName and -cohortId. The cohortName should contain the name of the sql file of the -cohort in inst/sql/sql\_server (e.g., a file called ``targetCohort.sql'' -has the name ``targetCohort'') and the cohortId is the default -cohort\_definition\_id that will be used when people run the study -corresponding to this cohort. The main.R file in the extras directory -contains the vanilla code to run a study with the model eported into the -package and the cohort files added. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{library}\NormalTok{(PatientLevelPrediction)} -\CommentTok{# input settings for person running the study} -\NormalTok{connectionDetails <-}\StringTok{ " "} -\NormalTok{cdmDatabaseSchema <-}\StringTok{ "their_cdm_database"} -\NormalTok{databaseName <-}\StringTok{ "Name for database"} -\NormalTok{cohortDatabaseSchema <-}\StringTok{ "a_database_with_write_priv"} -\NormalTok{cohortTable <-}\StringTok{ "package_table"} -\NormalTok{outputLocation <-}\StringTok{ "location to save results"} - -\NormalTok{cohortDetails <-}\StringTok{ }\KeywordTok{createCohort}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails, }\DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema, } - \DataTypeTok{cohortDatabaseSchema =}\NormalTok{ cohortDatabaseSchema, }\DataTypeTok{cohortTable =}\NormalTok{ cohortTable, }\DataTypeTok{package =} \StringTok{"Your Package Name"}\NormalTok{)} - -\NormalTok{plpResult <-}\StringTok{ 
}\KeywordTok{loadPlpResult}\NormalTok{(}\KeywordTok{system.file}\NormalTok{(}\StringTok{"model"}\NormalTok{, }\DataTypeTok{package =} \StringTok{"Your Package Name"}\NormalTok{))} -\NormalTok{result <-}\StringTok{ }\KeywordTok{externalValidatePlp}\NormalTok{(}\DataTypeTok{plpResult =}\NormalTok{ plpResult, }\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails, }\DataTypeTok{validationSchemaTarget =}\NormalTok{ cohortDatabaseSchema, } - \DataTypeTok{validationSchemaOutcome =}\NormalTok{ cohortDatabaseSchema, }\DataTypeTok{validationSchemaCdm =}\NormalTok{ cdmDatabaseSchema, }\DataTypeTok{validationTableTarget =}\NormalTok{ cohortTable, } - \DataTypeTok{validationTableOutcome =}\NormalTok{ cohortTable, }\DataTypeTok{validationIdTarget =}\NormalTok{ target_cohort_id, }\DataTypeTok{validationIdOutcome =}\NormalTok{ outcome_cohort_id)} - -\CommentTok{# save results to standard output} -\NormalTok{resultLoc <-}\StringTok{ }\KeywordTok{standardOutput}\NormalTok{(}\DataTypeTok{result =}\NormalTok{ result, }\DataTypeTok{outputLocation =}\NormalTok{ outputLocation, }\DataTypeTok{studyName =} \StringTok{"external validation of ... 
model"}\NormalTok{, } - \DataTypeTok{databaseName =}\NormalTok{ databaseName, }\DataTypeTok{cohortName =} \StringTok{"your cohortName"}\NormalTok{, }\DataTypeTok{outcomeName =} \StringTok{"your outcomeName"}\NormalTok{)} - -\CommentTok{# package results ready to submit} -\KeywordTok{packageResults}\NormalTok{(}\DataTypeTok{mainFolder =}\NormalTok{ resultLoc, }\DataTypeTok{includeROCplot =}\NormalTok{ T, }\DataTypeTok{includeCalibrationPlot =}\NormalTok{ T, }\DataTypeTok{includePRPlot =}\NormalTok{ T, } - \DataTypeTok{includeTable1 =}\NormalTok{ F, }\DataTypeTok{includeThresholdSummary =}\NormalTok{ T, }\DataTypeTok{includeDemographicSummary =}\NormalTok{ T, }\DataTypeTok{includeCalibrationSummary =}\NormalTok{ T, } - \DataTypeTok{includePredictionDistribution =}\NormalTok{ T, }\DataTypeTok{includeCovariateSummary =}\NormalTok{ F, }\DataTypeTok{removeLessThanN =}\NormalTok{ F, }\DataTypeTok{N =} \DecValTok{10}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Where the target\_cohort\_id and outcome\_cohort\_id should correspond -to the cohort\_details.csv file. - -We recommend getting the network implementors to submit their results of -\texttt{createCohort()} before continuing with the study to ensure -definitions run across the network. After running the rest of main.R the -implementor should inspect the files in the export folder created by the -package to ensure there isn't sensitive data remaining. Once checked the -implementor can run submit.R to send the results to the study organisor. 
-The submit.R file is: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{submitResults}\NormalTok{(}\DataTypeTok{exportFolder =}\NormalTok{ outputLocation, }\DataTypeTok{dbName =}\NormalTok{ databaseName, key, secret)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{useful-patientlevelprediction-functions}{% -\section{Useful PatientLevelPrediction -Functions}\label{useful-patientlevelprediction-functions}} - -The functions to aid the creation of a network study are: - -\begin{longtable}[]{@{}lll@{}} -\toprule -\begin{minipage}[b]{0.26\columnwidth}\raggedright -Function\strut -\end{minipage} & \begin{minipage}[b]{0.42\columnwidth}\raggedright -Description\strut -\end{minipage} & \begin{minipage}[b]{0.23\columnwidth}\raggedright -Usage\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.26\columnwidth}\raggedright -\texttt{checkPlpInstall()}\strut -\end{minipage} & \begin{minipage}[t]{0.42\columnwidth}\raggedright -This function checks the connection, and various aspects of the PLP -package to check it is set up correctly\strut -\end{minipage} & \begin{minipage}[t]{0.23\columnwidth}\raggedright -This should be run with the appropriate settings to check the -contributor is set up correctly for the study\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -\texttt{getPlpData()}\strut -\end{minipage} & \begin{minipage}[t]{0.42\columnwidth}\raggedright -This function extracts the data from the cdm for model development\strut -\end{minipage} & \begin{minipage}[t]{0.23\columnwidth}\raggedright -This should be used if developing new models\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -\texttt{runPlp()}\strut -\end{minipage} & \begin{minipage}[t]{0.42\columnwidth}\raggedright -This function trains and tests a new PLP model\strut -\end{minipage} & \begin{minipage}[t]{0.23\columnwidth}\raggedright -This should be used if developing new models\strut 
-\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -\texttt{transportPlp()}\strut -\end{minipage} & \begin{minipage}[t]{0.42\columnwidth}\raggedright -This function exports the output of runPlp into an R package while -removing sensitive objects\strut -\end{minipage} & \begin{minipage}[t]{0.23\columnwidth}\raggedright -This should be used when saving a model into a study package to validate -the model\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.26\columnwidth}\raggedright -\texttt{externalValidatePlp()}\strut -\end{minipage} & \begin{minipage}[t]{0.42\columnwidth}\raggedright -This function requires the user to inpute an existing model and then -extracts the required data on a new database and applies/evaluates the -model.\strut -\end{minipage} & \begin{minipage}[t]{0.23\columnwidth}\raggedright -This should be used if validating a PLP model\strut -\end{minipage}\tabularnewline -\bottomrule -\end{longtable} - -\end{document} diff --git a/inst/doc/CreatingShinyApp.tex b/inst/doc/CreatingShinyApp.tex deleted file mode 100644 index 1e6884a8a..000000000 --- a/inst/doc/CreatingShinyApp.tex +++ /dev/null @@ -1,502 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % 
disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Creating Shiny App}, - pdfauthor={Jenna Reps}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} 
-\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother -\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{5} -\usepackage{fancyhdr} -\pagestyle{fancy} -\fancyhead{} 
-\fancyhead[CO,CE]{Installation Guide} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 3.1.0} -\fancyfoot[LE,RO]{\thepage} -\renewcommand{\headrulewidth}{0.4pt} -\renewcommand{\footrulewidth}{0.4pt} - -\title{Creating Shiny App} -\author{Jenna Reps} -\date{2020-06-03} - -\begin{document} -\maketitle - -{ -\setcounter{tocdepth}{2} -\tableofcontents -} -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -In this vignette we will show with example code how to create a shiny -app and add the shiny app online for other researcher around the whole -to explore. - -There are two ways to create the shiny app: 1) Using the atlas R -generated prediction package 2) Manually using the -PatientLevelPrediction functions in a script - -We assume you have experience with using the OHDSI -PatientLevelPrediction package to develop and externally validate -prediction models using data in the OMOP CDM. If you do not have -experience with this then please first read our general vignette -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf}{\texttt{BuildingPredictiveModels} -vignette}. 
- -\hypertarget{atlas-development-shiny-app}{% -\section{Atlas Development Shiny -App}\label{atlas-development-shiny-app}} - -\hypertarget{step-1-run-the-model-development-package-to-get-results}{% -\subsection{Step 1: Run the model development package to get -results}\label{step-1-run-the-model-development-package-to-get-results}} - -To create a shiny app project via the Atlas auto-generated prediction R -package you named `myPackage' you need to run the execute function: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{library}\NormalTok{(myPackage)} -\NormalTok{myPackage}\OperatorTok{::}\KeywordTok{execute}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \DataTypeTok{cdmDatabaseSchema =} \StringTok{'myDatabaseSchema.dbo'}\NormalTok{,} - \DataTypeTok{cdmDatabaseName =} \StringTok{'MyDatabase'}\NormalTok{,} - \DataTypeTok{cohortDatabaseSchema =} \StringTok{'myDatabaseSchema.ohdsi_results'}\NormalTok{,} - \DataTypeTok{cohortTable =} \StringTok{'cohort'}\NormalTok{,} - \DataTypeTok{outputFolder =} \StringTok{'C:/myResults'}\NormalTok{,} - \DataTypeTok{createProtocol =}\NormalTok{ F,} - \DataTypeTok{createCohorts =}\NormalTok{ F,} - \DataTypeTok{runAnalyses =}\NormalTok{ T,} - \DataTypeTok{createResultsDoc =}\NormalTok{ F,} - \DataTypeTok{packageResults =}\NormalTok{ F,} - \DataTypeTok{createValidationPackage =}\NormalTok{ F, } - \DataTypeTok{minCellCount=} \DecValTok{5}\NormalTok{,} - \DataTypeTok{createShiny =}\NormalTok{ F,} - \DataTypeTok{createJournalDocument =}\NormalTok{ F,} - \DataTypeTok{analysisIdDocument =} \DecValTok{1}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -This will extract data based on the settings you supplied in the Atlas -prediction design from cohort tables already generated in your CDM -database schema. The PatientLevelPrediction framework will then run and -develop/evaluate models saving the results to the location specified by -outputFolder (e.g., `C:/myResults'). 
- -\hypertarget{step-2-create-the-shiny-app}{% -\subsection{Step 2: Create the shiny -app}\label{step-2-create-the-shiny-app}} - -To create a shiny app project with these results you can then simply -run: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{myPackage}\OperatorTok{::}\KeywordTok{execute}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \DataTypeTok{cdmDatabaseSchema =} \StringTok{'myDatabaseSchema.dbo'}\NormalTok{,} - \DataTypeTok{cdmDatabaseName =} \StringTok{'MyDatabase'}\NormalTok{,} - \DataTypeTok{cohortDatabaseSchema =} \StringTok{'myDatabaseSchema.ohdsi_results'}\NormalTok{,} - \DataTypeTok{cohortTable =} \StringTok{'cohort'}\NormalTok{,} - \DataTypeTok{outputFolder =} \StringTok{'C:/myResults'}\NormalTok{,} - \DataTypeTok{minCellCount=} \DecValTok{5}\NormalTok{,} - \DataTypeTok{createShiny =}\NormalTok{ T)} -\end{Highlighting} -\end{Shaded} - -making sure the outputFolder is the same location used when you ran the -analysis. This code populates a shiny app project with the results but -removes sensitive date such as connection settings, the -cdmDatabaseSchema setting, the predicton matrix and any sensitive counts -less than `minCellCount' from the covariate summary and performance -evalaution. - -The shiny app project populated with the model development results can -then be found at `{[}outputFolder{]}/ShinyApp' e.g., -`C:/myResults/ShinyApp'. - -\hypertarget{testing-optional-but-recommended}{% -\subsubsection{Testing (Optional but -recommended)}\label{testing-optional-but-recommended}} - -You can test the app by opening the shiny project within the -{[}outputFolder{]}/ShinyApp' folder, double click on the file named -`PLPViewer.Rproj'. This will open an R studio session with the shiny app -project loaded. Now load the `ui.R' files within this R studio session -and you will see a green arrow with the words `Run App' at the top right -of the script. Click on this and the shiny app with open. 
Note: You may -need to install some R pacakge dependancies for the shiny app to work. - -\hypertarget{step-3-sharing-the-shiny-app}{% -\subsection{Step 3: Sharing the shiny -app}\label{step-3-sharing-the-shiny-app}} - -Once you are happy with your app, you can publish it onto -\url{https://data.ohdsi.org} by adding the folder `ShinyApp' to the -OHDSI githib ShinyDeploy (\url{https://github.com/OHDSI/ShinyDeploy/}). -Continuing the example, we would copy the folder -`{[}outputFolder{]}/ShinyApp' and paste it to the local github clone of -ShinyDeploy. We recommend renaming the folder from `ShinyApp' to a name -that describes your prediction, e.g., `StrokeinAF'. Then commit the -changes and make a pull request to ShinyDeploy. Once accepted your shiny -app will be viewable at `\url{https://data.ohdsi.org}'. If you commited -the folder named `StrokeInAF' then the shiny app will be hosted at -`\url{https://data.ohdsi.org/StrokeInAF}'. - -\hypertarget{atlas-external-validation}{% -\section{Atlas External Validation}\label{atlas-external-validation}} - -To include external validation results you can use the Atlas generated R -study package to create the external validation package: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{myPackage}\OperatorTok{::}\KeywordTok{execute}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \DataTypeTok{cdmDatabaseSchema =} \StringTok{'myDatabaseSchema.dbo'}\NormalTok{,} - \DataTypeTok{cdmDatabaseName =} \StringTok{'MyDatabase'}\NormalTok{,} - \DataTypeTok{cohortDatabaseSchema =} \StringTok{'myDatabaseSchema.ohdsi_results'}\NormalTok{,} - \DataTypeTok{cohortTable =} \StringTok{'cohort'}\NormalTok{,} - \DataTypeTok{outputFolder =} \StringTok{'C:/myResults'}\NormalTok{,} - \DataTypeTok{createValidationPackage =}\NormalTok{ T)} -\end{Highlighting} -\end{Shaded} - -This will create a new R package inside the `outputFolder' location with -the word `Validation' appended the name of your development package. 
For -example, if your `outputFolder' was `C:/myResults' and your development -package was named `myPackage' then the validation package will be found -at: `C:/myResults/myPackageValidation'. When running the valdiation -package make sure to set the `outputFolder' to the Validation folder -within your model development outputFolder location: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{myPackageValidation}\OperatorTok{::}\KeywordTok{execute}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \DataTypeTok{databaseName =}\NormalTok{ databaseName,} - \DataTypeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} - \DataTypeTok{cohortDatabaseSchema =}\NormalTok{ cohortDatabaseSchema,} - \DataTypeTok{oracleTempSchema =}\NormalTok{ oracleTempSchema,} - \DataTypeTok{cohortTable =}\NormalTok{ cohortTable,} - \DataTypeTok{outputFolder =} \StringTok{'C:/myResults/Validation'}\NormalTok{,} - \DataTypeTok{createCohorts =}\NormalTok{ T,} - \DataTypeTok{runValidation =}\NormalTok{ T,} - \DataTypeTok{packageResults =}\NormalTok{ F,} - \DataTypeTok{minCellCount =} \DecValTok{5}\NormalTok{,} - \DataTypeTok{sampleSize =} \OtherTok{NULL}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Now you can rerun Steps 2-3 to populate the shiny app project that will -also include the validation results (as long as the validation results -are in the Validation folder found in the Step 1 outputFolder location -e.g., in `C:/myResults/Validation'). 
- -\hypertarget{combining-multiple-atlas-results-into-one-shiny-app}{% -\section{Combining multiple atlas results into one shiny -app:}\label{combining-multiple-atlas-results-into-one-shiny-app}} - -The code below can be used to combine multiple Atlas packages' results -into one shiny app: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{populateMultipleShinyApp <-}\StringTok{ }\ControlFlowTok{function}\NormalTok{(shinyDirectory,} -\NormalTok{ resultDirectory,} - \DataTypeTok{minCellCount =} \DecValTok{10}\NormalTok{,} - \DataTypeTok{databaseName =} \StringTok{'sharable name of development data'}\NormalTok{)\{} - - \CommentTok{#check inputs} - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{missing}\NormalTok{(shinyDirectory))\{} -\NormalTok{ shinyDirectory <-}\StringTok{ }\KeywordTok{system.file}\NormalTok{(}\StringTok{"shiny"}\NormalTok{, }\StringTok{"PLPViewer"}\NormalTok{, }\DataTypeTok{package =} \StringTok{"SkeletonPredictionStudy"}\NormalTok{)} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{missing}\NormalTok{(resultDirectory))\{} - \KeywordTok{stop}\NormalTok{(}\StringTok{'Need to enter the resultDirectory'}\NormalTok{)} -\NormalTok{ \}} - - - \ControlFlowTok{for}\NormalTok{(i }\ControlFlowTok{in} \DecValTok{1}\OperatorTok{:}\KeywordTok{length}\NormalTok{(resultDirectory))\{} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{dir.exists}\NormalTok{(resultDirectory[i]))\{} - \KeywordTok{stop}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}\StringTok{'resultDirectory '}\NormalTok{,i,}\StringTok{' does not exist'}\NormalTok{))} -\NormalTok{ \}} -\NormalTok{ \}} - -\NormalTok{ outputDirectory <-}\StringTok{ }\KeywordTok{file.path}\NormalTok{(shinyDirectory,}\StringTok{'data'}\NormalTok{)} - - \CommentTok{# create the shiny data folder} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{dir.exists}\NormalTok{(outputDirectory))\{} - \KeywordTok{dir.create}\NormalTok{(outputDirectory, }\DataTypeTok{recursive =}\NormalTok{ T)} 
-\NormalTok{ \}} - - - \CommentTok{# need to edit settings ...} -\NormalTok{ files <-}\StringTok{ }\KeywordTok{c}\NormalTok{()} - \ControlFlowTok{for}\NormalTok{(i }\ControlFlowTok{in} \DecValTok{1}\OperatorTok{:}\KeywordTok{length}\NormalTok{(resultDirectory))\{} - \CommentTok{# copy the settings csv} -\NormalTok{ file <-}\StringTok{ }\NormalTok{utils}\OperatorTok{::}\KeywordTok{read.csv}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],}\StringTok{'settings.csv'}\NormalTok{))} -\NormalTok{ file}\OperatorTok{$}\NormalTok{analysisId <-}\StringTok{ }\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(file}\OperatorTok{$}\NormalTok{analysisId)}\OperatorTok{+}\NormalTok{i} -\NormalTok{ files <-}\StringTok{ }\KeywordTok{rbind}\NormalTok{(files, file)} -\NormalTok{ \}} -\NormalTok{ utils}\OperatorTok{::}\KeywordTok{write.csv}\NormalTok{(files, }\KeywordTok{file.path}\NormalTok{(outputDirectory,}\StringTok{'settings.csv'}\NormalTok{), }\DataTypeTok{row.names =}\NormalTok{ F)} - - \ControlFlowTok{for}\NormalTok{(i }\ControlFlowTok{in} \DecValTok{1}\OperatorTok{:}\KeywordTok{length}\NormalTok{(resultDirectory))\{} - \CommentTok{# copy each analysis as a rds file and copy the log} -\NormalTok{ files <-}\StringTok{ }\KeywordTok{dir}\NormalTok{(resultDirectory[i], }\DataTypeTok{full.names =}\NormalTok{ F)} -\NormalTok{ files <-}\StringTok{ }\NormalTok{files[}\KeywordTok{grep}\NormalTok{(}\StringTok{'Analysis'}\NormalTok{, files)]} - \ControlFlowTok{for}\NormalTok{(file }\ControlFlowTok{in}\NormalTok{ files)\{} - - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{dir.exists}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(outputDirectory,}\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i))))\{} - 
\KeywordTok{dir.create}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(outputDirectory,}\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i)))} -\NormalTok{ \}} - - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{dir.exists}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],file, }\StringTok{'plpResult'}\NormalTok{)))\{} -\NormalTok{ res <-}\StringTok{ }\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{loadPlpResult}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],file, }\StringTok{'plpResult'}\NormalTok{))} -\NormalTok{ res <-}\StringTok{ }\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{transportPlp}\NormalTok{(res, }\DataTypeTok{n=}\NormalTok{ minCellCount, } - \DataTypeTok{save =}\NormalTok{ F, }\DataTypeTok{dataName =}\NormalTok{ databaseName[i])} - -\NormalTok{ res}\OperatorTok{$}\NormalTok{covariateSummary <-}\StringTok{ }\NormalTok{res}\OperatorTok{$}\NormalTok{covariateSummary[res}\OperatorTok{$}\NormalTok{covariateSummary}\OperatorTok{$}\NormalTok{covariateValue}\OperatorTok{!=}\DecValTok{0}\NormalTok{,]} -\NormalTok{ covSet <-}\StringTok{ }\NormalTok{res}\OperatorTok{$}\NormalTok{model}\OperatorTok{$}\NormalTok{metaData}\OperatorTok{$}\NormalTok{call}\OperatorTok{$}\NormalTok{covariateSettings} -\NormalTok{ res}\OperatorTok{$}\NormalTok{model}\OperatorTok{$}\NormalTok{metaData <-}\StringTok{ }\OtherTok{NULL} -\NormalTok{ res}\OperatorTok{$}\NormalTok{model}\OperatorTok{$}\NormalTok{metaData}\OperatorTok{$}\NormalTok{call}\OperatorTok{$}\NormalTok{covariateSettings <-}\StringTok{ }\NormalTok{covSet} -\NormalTok{ res}\OperatorTok{$}\NormalTok{model}\OperatorTok{$}\NormalTok{predict <-}\StringTok{ }\OtherTok{NULL} - 
\ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{is.null}\NormalTok{(res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{evaluationStatistics))\{} -\NormalTok{ res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{evaluationStatistics[,}\DecValTok{1}\NormalTok{] <-}\StringTok{ }\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i)} -\NormalTok{ \} }\ControlFlowTok{else}\NormalTok{\{} - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste0}\NormalTok{(resultDirectory[i],file, }\StringTok{'-ev'}\NormalTok{))} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{is.null}\NormalTok{(res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{thresholdSummary))\{} -\NormalTok{ res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{thresholdSummary[,}\DecValTok{1}\NormalTok{] <-}\StringTok{ }\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i)} -\NormalTok{ \}}\ControlFlowTok{else}\NormalTok{\{} - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste0}\NormalTok{(resultDirectory[i],file, }\StringTok{'-thres'}\NormalTok{))} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{is.null}\NormalTok{(res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{demographicSummary))\{} -\NormalTok{ res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{demographicSummary[,}\DecValTok{1}\NormalTok{] <-}\StringTok{ 
}\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i)} -\NormalTok{ \} }\ControlFlowTok{else}\NormalTok{\{} - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste0}\NormalTok{(resultDirectory[i],file, }\StringTok{'-dem'}\NormalTok{))} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{is.null}\NormalTok{(res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{calibrationSummary))\{} -\NormalTok{ res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{calibrationSummary[,}\DecValTok{1}\NormalTok{] <-}\StringTok{ }\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i)} -\NormalTok{ \}}\ControlFlowTok{else}\NormalTok{\{} - \KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste0}\NormalTok{(resultDirectory[i],file, }\StringTok{'-cal'}\NormalTok{))} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{is.null}\NormalTok{(res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{predictionDistribution))\{} -\NormalTok{ res}\OperatorTok{$}\NormalTok{performanceEvaluation}\OperatorTok{$}\NormalTok{predictionDistribution[,}\DecValTok{1}\NormalTok{] <-}\StringTok{ }\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i)} -\NormalTok{ \}}\ControlFlowTok{else}\NormalTok{\{} - 
\KeywordTok{writeLines}\NormalTok{(}\KeywordTok{paste0}\NormalTok{(resultDirectory[i],file, }\StringTok{'-dist'}\NormalTok{))} -\NormalTok{ \}} - \KeywordTok{saveRDS}\NormalTok{(res, }\KeywordTok{file.path}\NormalTok{(outputDirectory,}\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i), }\StringTok{'plpResult.rds'}\NormalTok{))} -\NormalTok{ \}} - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{file.exists}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],file, }\StringTok{'plpLog.txt'}\NormalTok{)))\{} - \KeywordTok{file.copy}\NormalTok{(}\DataTypeTok{from =} \KeywordTok{file.path}\NormalTok{(resultDirectory[i],file, }\StringTok{'plpLog.txt'}\NormalTok{), } - \DataTypeTok{to =} \KeywordTok{file.path}\NormalTok{(outputDirectory,}\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\DecValTok{1000}\OperatorTok{*}\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{,file))}\OperatorTok{+}\NormalTok{i), }\StringTok{'plpLog.txt'}\NormalTok{))} -\NormalTok{ \}} -\NormalTok{ \}} -\NormalTok{ \}} - - - - \ControlFlowTok{for}\NormalTok{(i }\ControlFlowTok{in} \DecValTok{1}\OperatorTok{:}\KeywordTok{length}\NormalTok{(resultDirectory))\{} - \CommentTok{# copy any validation results} - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{dir.exists}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],}\StringTok{'Validation'}\NormalTok{)))\{} -\NormalTok{ valFolders <-}\StringTok{ }\KeywordTok{dir}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],}\StringTok{'Validation'}\NormalTok{), }\DataTypeTok{full.names =}\NormalTok{ F)} - - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{length}\NormalTok{(valFolders)}\OperatorTok{>}\DecValTok{0}\NormalTok{)\{} - 
\CommentTok{# move each of the validation rds} - \ControlFlowTok{for}\NormalTok{(valFolder }\ControlFlowTok{in}\NormalTok{ valFolders)\{} - - \CommentTok{# get the analysisIds} -\NormalTok{ valSubfolders <-}\StringTok{ }\KeywordTok{dir}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],}\StringTok{'Validation'}\NormalTok{,valFolder), }\DataTypeTok{full.names =}\NormalTok{ F)} - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{length}\NormalTok{(valSubfolders)}\OperatorTok{!=}\DecValTok{0}\NormalTok{)\{} - \ControlFlowTok{for}\NormalTok{(valSubfolder }\ControlFlowTok{in}\NormalTok{ valSubfolders )\{} -\NormalTok{ valSubfolderUpdate <-}\StringTok{ }\KeywordTok{paste0}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{, }\KeywordTok{as.double}\NormalTok{(}\KeywordTok{gsub}\NormalTok{(}\StringTok{'Analysis_'}\NormalTok{,}\StringTok{''}\NormalTok{, valSubfolder))}\OperatorTok{*}\DecValTok{1000}\OperatorTok{+}\NormalTok{i)} -\NormalTok{ valOut <-}\StringTok{ }\KeywordTok{file.path}\NormalTok{(valFolder,valSubfolderUpdate)} -\NormalTok{ valOutOld <-}\StringTok{ }\KeywordTok{file.path}\NormalTok{(valFolder,valSubfolder)} - \ControlFlowTok{if}\NormalTok{(}\OperatorTok{!}\KeywordTok{dir.exists}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(outputDirectory,}\StringTok{'Validation'}\NormalTok{,valOut)))\{} - \KeywordTok{dir.create}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(outputDirectory,}\StringTok{'Validation'}\NormalTok{,valOut), }\DataTypeTok{recursive =}\NormalTok{ T)} -\NormalTok{ \}} - - - \ControlFlowTok{if}\NormalTok{(}\KeywordTok{file.exists}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],}\StringTok{'Validation'}\NormalTok{,valOutOld, }\StringTok{'validationResult.rds'}\NormalTok{)))\{} -\NormalTok{ res <-}\StringTok{ }\KeywordTok{readRDS}\NormalTok{(}\KeywordTok{file.path}\NormalTok{(resultDirectory[i],}\StringTok{'Validation'}\NormalTok{,valOutOld, }\StringTok{'validationResult.rds'}\NormalTok{))} -\NormalTok{ res <-}\StringTok{ 
}\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{transportPlp}\NormalTok{(res, }\DataTypeTok{n=}\NormalTok{ minCellCount, } - \DataTypeTok{save =}\NormalTok{ F, }\DataTypeTok{dataName =}\NormalTok{ databaseName[i])} -\NormalTok{ res}\OperatorTok{$}\NormalTok{covariateSummary <-}\StringTok{ }\NormalTok{res}\OperatorTok{$}\NormalTok{covariateSummary[res}\OperatorTok{$}\NormalTok{covariateSummary}\OperatorTok{$}\NormalTok{covariateValue}\OperatorTok{!=}\DecValTok{0}\NormalTok{,]} - \KeywordTok{saveRDS}\NormalTok{(res, }\KeywordTok{file.path}\NormalTok{(outputDirectory,}\StringTok{'Validation'}\NormalTok{,valOut, }\StringTok{'validationResult.rds'}\NormalTok{))} -\NormalTok{ \}} -\NormalTok{ \}} -\NormalTok{ \}} - -\NormalTok{ \}} - -\NormalTok{ \}} - -\NormalTok{ \}} -\NormalTok{ \}} - - \KeywordTok{return}\NormalTok{(outputDirectory)} - -\NormalTok{\}} -\end{Highlighting} -\end{Shaded} - -\hypertarget{example-code-to-combine-multiple-results}{% -\subsection{Example code to combine multiple -results}\label{example-code-to-combine-multiple-results}} - -The following code will combine the results found in `C:/myResults', -`C:/myResults2' and `C:/myResults3' into the shiny project at -`C:/R/library/myPackage/shiny/PLPViewer': - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{populateMultipleShinyApp}\NormalTok{(}\DataTypeTok{shinyDirectory =} \StringTok{'C:/R/library/myPackage/shiny/PLPViewer'}\NormalTok{,} - \DataTypeTok{resultDirectory =} \KeywordTok{c}\NormalTok{(}\StringTok{'C:/myResults'}\NormalTok{,} - \StringTok{'C:/myResults2'}\NormalTok{,} - \StringTok{'C:/myResults3'}\NormalTok{),} - \DataTypeTok{minCellCount =} \DecValTok{0}\NormalTok{,} - \DataTypeTok{databaseName =} \KeywordTok{c}\NormalTok{(}\StringTok{'database1'}\NormalTok{,}\StringTok{'database2'}\NormalTok{,}\StringTok{'database3'}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -\hypertarget{manual-app-creation}{% -\section{Manual App Creation}\label{manual-app-creation}} - 
-{[}instructions coming soon{]} - -\hypertarget{acknowledgments}{% -\section{Acknowledgments}\label{acknowledgments}} - -Considerable work has been dedicated to provide the -\texttt{PatientLevelPrediction} package. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## -## To cite PatientLevelPrediction in publications use: -## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and -## implementation of a standardized framework to generate and evaluate patient-level -## prediction models using observational healthcare data." _Journal of the American -## Medical Informatics Association_, *25*(8), 969-975. . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. 
-2018;25(8):969-975.} - -\end{document} diff --git a/inst/doc/Figure1.png b/inst/doc/Figure1.png deleted file mode 100644 index 878a509e2..000000000 Binary files a/inst/doc/Figure1.png and /dev/null differ diff --git a/inst/doc/InstallationGuide.pdf b/inst/doc/InstallationGuide.pdf index 4734063a1..605054361 100644 Binary files a/inst/doc/InstallationGuide.pdf and b/inst/doc/InstallationGuide.pdf differ diff --git a/inst/doc/InstallationGuide.tex b/inst/doc/InstallationGuide.tex deleted file mode 100644 index 937f4c3e7..000000000 --- a/inst/doc/InstallationGuide.tex +++ /dev/null @@ -1,369 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{lmodern} -\usepackage{amssymb,amsmath} -\usepackage{ifxetex,ifluatex} -\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\hypersetup{ - pdftitle={Patient-Level Prediction Installation Guide}, - pdfauthor={Jenna 
Reps, Peter R. Rijnbeek}, - hidelinks, - pdfcreator={LaTeX via pandoc}} -\urlstyle{same} % disable monospaced font for URLs -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} 
-\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother -\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{5} -\usepackage{fancyhdr} -\pagestyle{fancy} -\fancyhead{} -\fancyhead[CO,CE]{Installation Guide} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 3.1.0} -\fancyfoot[LE,RO]{\thepage} -\renewcommand{\headrulewidth}{0.4pt} -\renewcommand{\footrulewidth}{0.4pt} - -\title{Patient-Level Prediction Installation Guide} -\author{Jenna Reps, Peter R. 
Rijnbeek} -\date{2020-06-03} - -\begin{document} -\maketitle - -{ -\setcounter{tocdepth}{2} -\tableofcontents -} -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -This vignette describes how you need to install the Observational Health -Data Sciencs and Informatics (OHDSI) -\href{http://github.com/OHDSI/PatientLevelPrediction}{\texttt{PatientLevelPrediction}} -package under Windows, Mac, and Linux. - -\hypertarget{software-prerequisites}{% -\section{Software Prerequisites}\label{software-prerequisites}} - -\hypertarget{windows-users}{% -\subsection{Windows Users}\label{windows-users}} - -Under Windows the OHDSI Patient Level Prediction (PLP) package requires -installing: - -\begin{itemize} -\tightlist -\item - R (\url{https://cran.cnr.berkeley.edu/} ) - (R \textgreater= 3.3.0, - but latest is recommended) -\item - Rstudio (\url{https://www.rstudio.com/} ) -\item - Java (\url{http://www.java.com} ) -\item - RTools (\url{https://cran.r-project.org/bin/windows/Rtools/}) -\end{itemize} - -\hypertarget{maclinux-users}{% -\subsection{Mac/Linux Users}\label{maclinux-users}} - -Under Mac and Linux the OHDSI Patient Level Prediction (PLP) package -requires installing: - -\begin{itemize} -\tightlist -\item - R (\url{https://cran.cnr.berkeley.edu/} ) - (R \textgreater= 3.3.0, - but latest is recommended) -\item - Rstudio (\url{https://www.rstudio.com/} ) -\item - Java (\url{http://www.java.com} ) -\item - Xcode command line tools(run in terminal: xcode-select --install) - {[}MAC USERS ONLY{]} -\end{itemize} - -\hypertarget{installing-the-package}{% -\section{Installing the Package}\label{installing-the-package}} - -The preferred way to install the package is by using drat, which will -automatically install the latest release and all the latest -dependencies. If the drat code fails or you do not want the official -release you could use devtools to install the bleading edge version of -the package (latest master). 
Note that the latest master could contain -bugs, please report them to us if you experience problems. - -\hypertarget{installing-patientlevelprediction-using-drat}{% -\subsection{Installing PatientLevelPrediction using -drat}\label{installing-patientlevelprediction-using-drat}} - -To install using drat run: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{install.packages}\NormalTok{(}\StringTok{"drat"}\NormalTok{)} -\NormalTok{drat}\OperatorTok{::}\KeywordTok{addRepo}\NormalTok{(}\StringTok{"OHDSI"}\NormalTok{)} -\KeywordTok{install.packages}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{installing-patientlevelprediction-using-devtools}{% -\subsection{Installing PatientLevelPrediction using -devtools}\label{installing-patientlevelprediction-using-devtools}} - -To install using devtools run: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{install.packages}\NormalTok{(}\StringTok{'devtools'}\NormalTok{)} -\NormalTok{devtools}\OperatorTok{::}\KeywordTok{install_github}\NormalTok{(}\StringTok{"OHDSI/FeatureExtraction"}\NormalTok{)} -\NormalTok{devtools}\OperatorTok{::}\KeywordTok{install_github}\NormalTok{(}\StringTok{'ohdsi/PatientLevelPrediction'}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -When installing using devtools make sure to close any other Rstudio -sessions that are using PatientLevelPrediction or any dependency. -Keeping Rstudio sessions open can cause locks that prevent the package -installing. - -\hypertarget{creating-python-reticulate-environment}{% -\section{Creating Python Reticulate -Environment}\label{creating-python-reticulate-environment}} - -Many of the classifiers in the PatientLevelPrediction use a Python back -end. 
To set up a python environment run: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{library}\NormalTok{(PatientLevelPrediction)} -\NormalTok{reticulate}\OperatorTok{::}\KeywordTok{install_miniconda}\NormalTok{()} -\KeywordTok{configurePython}\NormalTok{(}\DataTypeTok{envname=}\StringTok{'r-reticulate'}\NormalTok{, }\DataTypeTok{envtype=}\StringTok{'conda'}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -To add the R keras interface, in Rstudio run: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{devtools}\OperatorTok{::}\KeywordTok{install_github}\NormalTok{(}\StringTok{"rstudio/keras"}\NormalTok{)} -\KeywordTok{library}\NormalTok{(keras)} -\KeywordTok{install_keras}\NormalTok{()} -\end{Highlighting} -\end{Shaded} - -Some of the less frequently used classifiers are not installed during -this set-up to add them run: - -For GBM survival: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{reticulate}\OperatorTok{::}\KeywordTok{conda_install}\NormalTok{(}\DataTypeTok{envname=}\StringTok{'r-reticulate'}\NormalTok{, }\DataTypeTok{packages =} \KeywordTok{c}\NormalTok{(}\StringTok{'scikit-survival'}\NormalTok{), }\DataTypeTok{forge =} \OtherTok{TRUE}\NormalTok{, }\DataTypeTok{pip =} \OtherTok{FALSE}\NormalTok{, }\DataTypeTok{pip_ignore_installed =} \OtherTok{TRUE}\NormalTok{, }\DataTypeTok{conda =} \StringTok{"auto"}\NormalTok{, }\DataTypeTok{channel =} \StringTok{'sebp'}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -For any of the torch models: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{reticulate}\OperatorTok{::}\KeywordTok{conda_install}\NormalTok{(}\DataTypeTok{envname=}\StringTok{'r-reticulate'}\NormalTok{, }\DataTypeTok{packages =} \KeywordTok{c}\NormalTok{(}\StringTok{'pytorch'}\NormalTok{, }\StringTok{'torchvision'}\NormalTok{, }\StringTok{'cpuonly'}\NormalTok{), }\DataTypeTok{forge =} \OtherTok{TRUE}\NormalTok{, }\DataTypeTok{pip =} \OtherTok{FALSE}\NormalTok{, }\DataTypeTok{channel =} \StringTok{'pytorch'}\NormalTok{, 
}\DataTypeTok{pip_ignore_installed =} \OtherTok{TRUE}\NormalTok{, }\DataTypeTok{conda =} \StringTok{'auto'}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\hypertarget{testing-installation}{% -\section{Testing installation}\label{testing-installation}} - -To test whether the package is installed correctly run: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{library}\NormalTok{(DatabaseConnector)} -\NormalTok{connectionDetails <-}\StringTok{ }\KeywordTok{createConnectionDetails}\NormalTok{(}\DataTypeTok{dbms =} \StringTok{'sql_server'}\NormalTok{, } - \DataTypeTok{user =} \StringTok{'username'}\NormalTok{, } - \DataTypeTok{password =} \StringTok{'hidden'}\NormalTok{, } - \DataTypeTok{server =} \StringTok{'your server'}\NormalTok{, } - \DataTypeTok{port =} \StringTok{'your port'}\NormalTok{)} -\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{checkPlpInstallation}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails, } - \DataTypeTok{python =}\NormalTok{ T)} -\end{Highlighting} -\end{Shaded} - -To test the installation (excluding python) run: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{library}\NormalTok{(DatabaseConnector)} -\NormalTok{connectionDetails <-}\StringTok{ }\KeywordTok{createConnectionDetails}\NormalTok{(}\DataTypeTok{dbms =} \StringTok{'sql_server'}\NormalTok{, } - \DataTypeTok{user =} \StringTok{'username'}\NormalTok{, } - \DataTypeTok{password =} \StringTok{'hidden'}\NormalTok{, } - \DataTypeTok{server =} \StringTok{'your server'}\NormalTok{, } - \DataTypeTok{port =} \StringTok{'your port'}\NormalTok{)} -\NormalTok{PatientLevelPrediction}\OperatorTok{::}\KeywordTok{checkPlpInstallation}\NormalTok{(}\DataTypeTok{connectionDetails =}\NormalTok{ connectionDetails, } - \DataTypeTok{python =}\NormalTok{ F)} -\end{Highlighting} -\end{Shaded} - -The check can take a while to run since it will build the following -models in sequence on simulated \url{data:Logistic} Regression, -RandomForest, MLP, 
AdaBoost, Decision Tree, Naive Bayes, KNN, Gradient -Boosting. Moreover, it will test the database connection. - -\hypertarget{installation-issues}{% -\section{Installation issues}\label{installation-issues}} - -Installation issues need to be posted in our issue tracker: -\url{http://github.com/OHDSI/PatientLevelPrediction/issues} - -The list below provides solutions for some common issues: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi}.} -\item - If you have an error when trying to install a package in R saying - \textbf{`Dependancy X not available \ldots{}'} then this can sometimes - be fixed by running - \texttt{install.packages(\textquotesingle{}X\textquotesingle{})} and - then once that completes trying to reinstall the package that had the - error. -\item - I have found that using the github devtools to install packages can be - impacted if you have \textbf{multiple R sessions} open as one session - with a library open can causethe library to be locked and this can - prevent an install of a package that depends on that library. -\end{enumerate} - -\hypertarget{acknowledgments}{% -\section{Acknowledgments}\label{acknowledgments}} - -Considerable work has been dedicated to provide the -\texttt{PatientLevelPrediction} package. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## -## To cite PatientLevelPrediction in publications use: -## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and -## implementation of a standardized framework to generate and evaluate patient-level -## prediction models using observational healthcare data." _Journal of the American -## Medical Informatics Association_, *25*(8), 969-975. . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. 
Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. -2018;25(8):969-975.} - -This work is supported in part through the National Science Foundation -grant IIS 1251151. - -\end{document} diff --git a/inst/settings/rEnvironmentSnapshot.csv b/inst/settings/rEnvironmentSnapshot.csv deleted file mode 100644 index 269bb3316..000000000 --- a/inst/settings/rEnvironmentSnapshot.csv +++ /dev/null @@ -1,115 +0,0 @@ -"package","version" -"R","3.5.0" -"grDevices","3.5.0" -"graphics","3.5.0" -"utils","3.5.0" -"stats","3.5.0" -"methods","3.5.0" -"tools","3.5.0" -"assertthat","0.2.0" -"crayon","1.3.4" -"cli","1.0.0" -"rlang","0.2.0" -"utf8","1.1.3" -"grid","3.5.0" -"pillar","1.2.2" -"Rcpp","0.12.19" -"colorspace","1.3-2" -"glue","1.2.0" -"later","0.7.2" -"lattice","0.20-35" -"magrittr","1.5" -"R6","2.2.2" -"stringi","1.1.7" -"tibble","1.4.2" -"bindr","0.1.1" -"bit","1.1-14" -"dichromat","2.0-0" -"digest","0.6.15" -"jsonlite","1.5" -"labeling","0.3" -"Matrix","1.2-14" -"munsell","0.4.3" -"plyr","1.8.4" -"promises","1.0.1" -"purrr","0.2.4" -"RColorBrewer","1.1-2" -"stringr","1.3.1" -"viridisLite","0.3.0" -"yaml","2.1.19" -"base64enc","0.1-3" -"bindrcpp","0.2.2" -"config","0.3" -"debugme","1.1.0" -"fastmatch","1.1-0" -"ff","2.2-14" -"gtable","0.2.0" -"htmltools","0.3.6" -"httpuv","1.4.3" 
-"lazyeval","0.2.1" -"MASS","7.3-49" -"mime","0.5" -"pkgconfig","2.0.1" -"reshape2","1.4.3" -"reticulate","1.7" -"rJava","0.9-10" -"rstudioapi","0.7" -"scales","0.5.0" -"sourcetools","0.1.7" -"tidyselect","0.2.4" -"triebeard","0.3.0" -"whisker","0.3-2" -"xtable","1.8-2" -"codetools","0.2-15" -"compiler","3.5.0" -"curl","3.2" -"DatabaseConnectorJars","1.0.0" -"DBI","1.0.0" -"dplyr","0.7.4" -"ffbase","0.12.7" -"ggplot2","2.2.1" -"iterators","1.0.10" -"openssl","1.0.1" -"processx","3.0.3" -"shiny","1.0.5" -"splines","3.5.0" -"SqlRender","1.6.0" -"tfruns","1.3" -"urltools","1.7.0" -"bitops","1.0-6" -"crosstalk","1.0.0" -"data.table","1.11.2" -"DatabaseConnector","2.2.1" -"foreach","1.4.4" -"hexbin","1.27.2" -"htmlwidgets","1.2" -"httr","1.3.1" -"parallel","3.5.0" -"RcppParallel","4.4.0" -"shape","1.4.4" -"snow","0.4-2" -"survival","2.41-3" -"tensorflow","1.5" -"tidyr","0.8.0" -"uuid","0.1-2" -"XML","3.98-1.13" -"xml2","1.2.0" -"zeallot","0.1.0" -"zip","1.0.0" -"AUC","0.3.0" -"Cyclops","1.3.2" -"diagram","1.6.4" -"doParallel","1.0.11" -"FeatureExtraction","2.2.1" -"gridExtra","2.3" -"keras","2.1.6" -"officer","0.3.1" -"ParallelLogger","1.1.0" -"plotly","4.7.1" -"PRROC","1.3.1" -"RCurl","1.95-4.10" -"RJSONIO","1.3-0" -"slam","0.1-43" -"survAUC","1.0-5" -"xgboost","0.6.4.1" -"PatientLevelPrediction","3.0.0" diff --git a/inst/settings/resultsDataModelSpecification.csv b/inst/settings/resultsDataModelSpecification.csv new file mode 100644 index 000000000..21f196bb2 --- /dev/null +++ b/inst/settings/resultsDataModelSpecification.csv @@ -0,0 +1,200 @@ +table_name,column_name,data_type,is_required,primary_key,empty_is_na,mincellcount,description +cohorts,cohort_id,int,Yes,Yes,No,No,a unique identifier for the cohort in the plp results database +cohorts,cohort_definition_id,bigint,Yes,No,No,No,the identifier in ATLAS for the cohort +cohorts,cohort_name,varchar,Yes,No,No,No,the name of the cohort +cohort_definition,cohort_definition_id,bigint,Yes,No,No,No,The ATLAS cohort 
definition id +cohort_definition,cohort_name,varchar,Yes,No,No,No,The name of the cohort +cohort_definition,description,text,No,No,Yes,No,A description of the cohort +cohort_definition,json,text,Yes,No,No,No,The json spec for the cohort +cohort_definition,sql_command,text,No,No,Yes,No,The SQL used to create the cohort +database_meta_data,database_id,varchar,Yes,Yes,Yes,No,The shared databaseId +database_meta_data,cdm_source_name,varchar,Yes,No,Yes,No,The name of the database +database_meta_data,cdm_source_abbreviation,varchar,Yes,No,Yes,No,The abbreviated name of the database +database_meta_data,cdm_holder,varchar,No,No,Yes,No,The owner of the CDM +database_meta_data,source_description,text,No,No,Yes,No,The full description of the database +database_meta_data,source_documentation_reference,varchar,No,No,Yes,No,The link to the documentation +database_meta_data,cdm_etl_reference,varchar,No,No,Yes,No,The link to the ETL document +database_meta_data,source_release_date,varchar,No,No,Yes,No,The release date for the data +database_meta_data,cdm_release_date,varchar,No,No,Yes,No,The release date for the CDM data +database_meta_data,cdm_version,varchar,No,No,Yes,No,The vocabulary version +database_meta_data,vocabulary_version,varchar,No,No,Yes,No,The max date in the database +database_meta_data,max_obs_period_end_date,varchar,No,No,Yes,No, +database_details,database_id,int,Yes,Yes,No,No,a unique identifier for the database +database_details,database_meta_data_id,varchar,Yes,No,No,No,The shared databaseId +tars,tar_id,int,Yes,Yes,No,No,a unique identifier for the tar +tars,tar_start_day,int,Yes,No,No,No,the number of days offset the tar_start_anchor for the time-at-risk to start +tars,tar_start_anchor,varchar,Yes,No,No,No,whether to use the cohort start or cohort end +tars,tar_end_day,int,Yes,No,No,No,the number of days offset the tar_end_anchor for the time-at-risk to end +tars,tar_end_anchor,varchar,Yes,No,No,No,whether to use the cohort start or cohort end 
+population_settings,population_setting_id,int,Yes,Yes,No,No,a unique identifier for the population settings +population_settings,population_settings_json,text,Yes,No,No,No,the json with the settings +covariate_settings,covariate_setting_id,int,Yes,Yes,No,No,a unique identifier for the covaraite settings +covariate_settings,covariate_settings_json,text,Yes,No,No,No,the json with the settings +model_settings,model_setting_id,int,Yes,Yes,No,No,a unique identifier for the model settings +model_settings,model_type,varchar,No,No,No,No,the type of model +model_settings,model_settings_json,varchar,Yes,No,No,No,the json with the settings +split_settings,split_setting_id,int,Yes,Yes,No,No,a unique identifier for the split settings +split_settings,split_settings_json,text,Yes,No,No,No,the json with the settings +plp_data_settings,plp_data_setting_id,int,Yes,Yes,No,No,a unique identifier for the plp data settings +plp_data_settings,plp_data_settings_json,text,Yes,No,No,No,the json with the settings +feature_engineering_settings,feature_engineering_setting_id,int,Yes,Yes,No,No,a unique identifier for the feature engineering settings +feature_engineering_settings,feature_engineering_settings_json,text,Yes,No,No,No,the json with the settings +tidy_covariates_settings,tidy_covariates_setting_id,int,Yes,Yes,No,No,a unique identifier for the tidy covariates settings +tidy_covariates_settings,tidy_covariates_settings_json,text,Yes,No,No,No,the json with the settings +sample_settings,sample_setting_id,int,Yes,Yes,No,No,a unique identifier for the sample settings +sample_settings,sample_settings_json,text,Yes,No,No,No,the json with the settings +model_designs,model_design_id,int,Yes,Yes,No,No,a unique identifier for the model design settings +model_designs,target_id,int,Yes,No,No,No,the identifier for the target cohort id +model_designs,outcome_id,int,Yes,No,No,No,the identifier for the outcome cohort id +model_designs,tar_id,int,Yes,No,No,No,the identifier for the time at risk 
setting +model_designs,plp_data_setting_id,int,Yes,No,No,No,the identifier for the plp data setting +model_designs,population_setting_id,int,Yes,No,No,No,the identifier for the population setting +model_designs,model_setting_id,int,Yes,No,No,No,the identifier for the model setting +model_designs,covariate_setting_id,int,Yes,No,No,No,the identifier for the covaraite setting +model_designs,sample_setting_id,int,Yes,No,No,No,the identifier for the sample setting +model_designs,split_setting_id,int,Yes,No,No,No,the identifier for the split setting +model_designs,feature_engineering_setting_id,int,Yes,No,No,No,the identifier for the feature engineering setting +model_designs,tidy_covariates_setting_id,int,Yes,No,No,No,the identifier for the tidy covariate setting +diagnostics,diagnostic_id,int,Yes,Yes,No,No,the unique identifier for the diagnostic results +diagnostics,model_design_id,int,Yes,No,No,No,the identifier for the model design +diagnostics,database_id,int,Yes,No,No,No,the identifier for the database +diagnostics,execution_date_time,varchar,No,No,No,No,the date/time the diagnostic was run +diagnostic_summary,diagnostic_id,int,Yes,No,No,No,the identifier for the diagnostics +diagnostic_summary,probast_id,varchar,Yes,No,No,No,the probast id being diagnosed +diagnostic_summary,result_value,varchar,Yes,No,No,No,the diagnostic result +diagnostic_predictors,diagnostic_id,int,Yes,No,No,No,the identifier for the diagnostics +diagnostic_predictors,days_to_event,int,Yes,No,No,No,the time between index to the day of interest +diagnostic_predictors,outcome_at_time,int,Yes,No,No,Yes,the number of outcomes on the day of interest +diagnostic_predictors,observed_at_start_of_day,bigint,Yes,No,No,Yes,the number of people observed up to the day of interest +diagnostic_predictors,input_type,varchar,Yes,No,No,No,the setting id the results are for +diagnostic_participants,diagnostic_id,int,Yes,No,No,No,the identifier for the diagnostics 
+diagnostic_participants,design,varchar,Yes,No,No,No,the inclusion criteria of interest +diagnostic_participants,metric,varchar,Yes,No,No,No,the metric calculated +diagnostic_participants,value,float,Yes,No,No,No,the value calculated +diagnostic_participants,probast_id,varchar,Yes,No,No,No,the corresponding probast id +diagnostic_outcomes,diagnostic_id,int,Yes,No,No,No,the identifier for the diagnostics +diagnostic_outcomes,xvalue,int,Yes,No,No,No,the value for the x-axis +diagnostic_outcomes,outcome_percent,float,Yes,No,No,No,the percentage of people with the outcome +diagnostic_outcomes,aggregation,varchar,Yes,No,No,No,"the type of aggregation (age,sex, year)" +diagnostic_outcomes,probast_id,varchar,Yes,No,No,No,the corresponding probast id +diagnostic_outcomes,input_type,varchar,Yes,No,No,No,the inclusion criteria of interest +diagnostic_designs,diagnostic_id,int,Yes,Yes,No,No,the identifier for the diagnostics +diagnostic_designs,probast_id,varchar,Yes,No,No,No,not used +diagnostic_designs,value,varchar,Yes,No,No,No,not used +models,model_id,int,Yes,Yes,No,No,A unique identifier for the model +models,analysis_id,varchar,No,No,No,No,The analysis id from the model +models,model_design_id,int,Yes,No,No,No,The corresponding model design id +models,database_id,int,Yes,No,No,No,The corresponding database id +models,model_type,varchar,No,No,No,No,The type of model +models,plp_model_file,text,Yes,No,No,No,A directory where the model is saved +models,train_details,text,Yes,No,No,No,json containing the training details +models,preprocessing,text,No,No,No,No,json containing the preprocessing details +models,execution_date_time,varchar,No,No,No,No,the date/time the model was trained +models,training_time,varchar,No,No,No,No,the time it took to develop the model +models,intercept,float,No,No,No,No,the intercept (if the model is a GLM) +recalibrations,recalibration_id,int,Yes,Yes,No,No,A unique identifier for the recalibration 
+recalibrations,original_model_id,int,Yes,No,No,No,The corresponding uncalibrated model id +recalibrations,recalibrated_model_id,int,Yes,No,No,No,The model id for the recalibrated model +recalibrations,recalibration_type,varchar,Yes,No,No,No,The type of recalibration +recalibrations,recalibration_json,varchar,Yes,No,No,No,The recalibration details +performances,performance_id,int,Yes,Yes,No,No,A unique identifier for the performance +performances,model_design_id,int,Yes,No,No,No,The corresponding model design id for development +performances,development_database_id,int,Yes,No,No,No,The corresponding development database is +performances,validation_database_id,int,Yes,No,No,No,The corresponding validation database is +performances,target_id,int,Yes,No,No,No,The corresponding validation target cohort id +performances,outcome_id,int,Yes,No,No,No,The corresponding validation outcome cohort id +performances,tar_id,int,Yes,No,No,No,The corresponding validation time at risk id +performances,plp_data_setting_id,int,Yes,No,No,No,The corresponding validation data settings id +performances,population_setting_id,int,Yes,No,No,No,The corresponding validation population settings id +performances,model_development,int,No,No,No,No,flag whether the performage is development or validation +performances,execution_date_time,varchar,Yes,No,No,No,The date/time the validation was executed +performances,plp_version,varchar,Yes,No,No,No,The PLP version for the validation execution +attrition,performance_id,int,Yes,No,No,No,The corresponding performance id +attrition,outcome_id,int,Yes,No,No,No,The corresponding outcome id +attrition,description,varchar,Yes,No,No,No,A description of the inclusions/exclusion +attrition,target_count,int,Yes,No,No,No,The number of target patients remaining +attrition,unique_people,int,Yes,No,No,No,The number of distinct target patients remaining +attrition,outcomes,int,Yes,No,No,No,The number of target patients with the outcome remaining 
+prediction_distribution,performance_id,int,Yes,No,No,No,The corresponding performance id +prediction_distribution,evaluation,varchar,Yes,No,No,No,The type of evalaution (test/train/CV) +prediction_distribution,class_label,int,Yes,No,No,No,whether the group is the with outcome or without outcome +prediction_distribution,person_count,int,Yes,No,No,No,the number of patients +prediction_distribution,average_predicted_probability,float,Yes,No,No,No,the mean predicted risk +prediction_distribution,st_dev_predicted_probability,float,Yes,No,No,No,the standard deviation of the predicted risk +prediction_distribution,min_predicted_probability,float,Yes,No,No,No,the min predicted risk +prediction_distribution,p_05_predicted_probability,float,Yes,No,No,No,the 5% quantile of predicted risk +prediction_distribution,p_25_predicted_probability,float,Yes,No,No,No,the 25% quantile of predicted risk +prediction_distribution,median_predicted_probability,float,Yes,No,No,No,The median predicted risk +prediction_distribution,p_75_predicted_probability,float,Yes,No,No,No,the 75% quantile of predicted risk +prediction_distribution,p_95_predicted_probability,float,Yes,No,No,No,the 95% quantile of predicted risk +prediction_distribution,max_predicted_probability,float,Yes,No,No,No,the max predicted risk +covariate_summary,performance_id,int,Yes,No,No,No,The corresponding performance id +covariate_summary,covariate_id,bigint,Yes,No,No,No,The id for the covariate +covariate_summary,covariate_name,varchar,Yes,No,No,No,the name for the covariate +covariate_summary,concept_id,float,Yes,No,No,No,the concept id used to construct the covariate +covariate_summary,covariate_value,float,Yes,No,No,No,the coefficient or covariate importance +covariate_summary,covariate_count,int,Yes,No,No,Yes,the number of people with the covariate +covariate_summary,covariate_mean,float,Yes,No,No,No,the mean value +covariate_summary,covariate_st_dev,float,Yes,No,No,No,the standard deviation of the values 
+covariate_summary,with_no_outcome_covariate_count,int,Yes,No,No,Yes,the number of people without the outcome with the covariate +covariate_summary,with_no_outcome_covariate_mean,float,Yes,No,No,No,the mean value for people without the outcome +covariate_summary,with_no_outcome_covariate_st_dev,float,Yes,No,No,No,the standard deviation of the values for people without the outcome +covariate_summary,with_outcome_covariate_count,int,Yes,No,No,Yes,the number of people with the outcome with the covariate +covariate_summary,with_outcome_covariate_mean,float,Yes,No,No,No,the mean value for people with the outcome +covariate_summary,with_outcome_covariate_st_dev,float,Yes,No,No,No,the standard deviation of the values for people with the outcome +covariate_summary,standardized_mean_diff,float,Yes,No,No,No,The standardized mean difference for those with and without the outcome +threshold_summary,performance_id,int,Yes,No,No,No,The corresponding performance id +threshold_summary,evaluation,varchar,Yes,No,No,No,The type of evalaution (test/train/CV) +threshold_summary,prediction_threshold,float,Yes,No,No,No,The cut-off value being summarised +threshold_summary,preference_threshold,float,Yes,No,No,No,the preference score of the cut-off value +threshold_summary,positive_count,int,Yes,No,No,No,the number of patients predicted to have the outcome at the cut-off +threshold_summary,negative_count,int,Yes,No,No,No,the number of patients predicted to not have the outcome at the cut-off +threshold_summary,true_count,int,Yes,No,No,No,the number of patients with the outcome +threshold_summary,false_count,int,Yes,No,No,No,the number of patients without the outcome +threshold_summary,true_positive_count,int,Yes,No,No,No,the number of patients correctly predicted to have the outcome at the cut-off +threshold_summary,true_negative_count,int,Yes,No,No,No,the number of patients correctly predicted to not have the outcome at the cut-off 
+threshold_summary,false_positive_count,int,Yes,No,No,No,the number of patients incorrectly predicted to have the outcome at the cut-off +threshold_summary,false_negative_count,int,Yes,No,No,No,the number of patients incorrectly predicted to not have the outcome at the cut-off +threshold_summary,f_1_score,float,No,No,Yes,No,the named metric at the cut-off +threshold_summary,accuracy,float,No,No,Yes,No,the named metric at the cut-off +threshold_summary,sensitivity,float,No,No,Yes,No,the named metric at the cut-off +threshold_summary,false_negative_rate,float,No,No,Yes,No,the named metric at the cut-off +threshold_summary,false_positive_rate,float,No,No,Yes,No,the named metric at the cut-off +threshold_summary,specificity ,float,No,No,Yes,No,the named metric at the cut-off +threshold_summary,positive_predictive_value,float,No,No,Yes,No,the named metric at the cut-off +threshold_summary,false_discovery_rate,float,No,No,Yes,No,the named metric at the cut-off +threshold_summary,negative_predictive_value,float,No,No,Yes,No,the named metric at the cut-off +threshold_summary,false_omission_rate,float,No,No,Yes,No,the named metric at the cut-off +threshold_summary,positive_likelihood_ratio,float,No,No,Yes,No,the named metric at the cut-off +threshold_summary,negative_likelihood_ratio,float,No,No,Yes,No,the named metric at the cut-off +threshold_summary,diagnostic_odds_ratio,float,No,No,Yes,No,the named metric at the cut-off +calibration_summary,performance_id,int,Yes,No,No,No,The corresponding performance id +calibration_summary,evaluation,varchar,Yes,No,No,No,The type of evalaution (test/train/CV) +calibration_summary,prediction_threshold,float,Yes,No,No,No,The cut-off value being summarised +calibration_summary,person_count_at_risk,int,Yes,No,No,Yes,The number of people in the target population +calibration_summary,person_count_with_outcome,int,Yes,No,No,Yes,The number of target patients with the outcome during TAR 
+calibration_summary,average_predicted_probability,float,Yes,No,No,No,the mean predicted risk +calibration_summary,st_dev_predicted_probability,float,Yes,No,No,No,the standard deviation of the predicted risk +calibration_summary,min_predicted_probability,float,Yes,No,No,No,the min predicted risk +calibration_summary,p_25_predicted_probability,float,Yes,No,No,No,the 25% quantile of predicted risk +calibration_summary,median_predicted_probability,float,Yes,No,No,No,The median predicted risk +calibration_summary,p_75_predicted_probability,float,Yes,No,No,No,the 75% quantile of predicted risk +calibration_summary,max_predicted_probability,float,Yes,No,No,No,the max predicted risk +calibration_summary,observed_incidence,float,Yes,No,No,No,The true incidence (outcome %) +evaluation_statistics,performance_id,int,Yes,No,No,No,The corresponding performance id +evaluation_statistics,evaluation,varchar,Yes,No,No,No,The type of evaluation (test/train/CV) +evaluation_statistics,metric,varchar,Yes,No,No,No,The metric of interest +evaluation_statistics,value,float,Yes,No,No,No,The value for the metric of interest +demographic_summary,performance_id,int,Yes,No,No,No,The corresponding performance id +demographic_summary,evaluation,varchar,Yes,No,No,No,The type of evaluation (test/train/CV) +demographic_summary,age_group,varchar,Yes,No,No,No,The age group of interest +demographic_summary,gen_group,varchar,Yes,No,No,No,The gender of interest +demographic_summary,person_count_at_risk,int,Yes,No,No,Yes,The number of target patients with the age/gender of interest +demographic_summary,person_count_with_outcome,int,Yes,No,No,Yes,The number of target patients with the age/gender of interest who also have the outcome during TAR +demographic_summary,average_predicted_probability,float,Yes,No,No,No,the mean predicted risk +demographic_summary,st_dev_predicted_probability,float,Yes,No,No,No,the standard deviation of the predicted risk 
+demographic_summary,min_predicted_probability,float,Yes,No,No,No,the min predicted risk +demographic_summary,p_25_predicted_probability,float,Yes,No,No,No,the 25% quantile of predicted risk +demographic_summary,p_50_predicted_probability,float,Yes,No,No,No,The median predicted risk +demographic_summary,p_75_predicted_probability,float,Yes,No,No,No,the 75% quantile of predicted risk +demographic_summary,max_predicted_probability,float,Yes,No,No,No,the max predicted risk \ No newline at end of file diff --git a/inst/shiny/DiagnosticsExplorer/DiagnosticsExplorer.Rproj b/inst/shiny/DiagnosticsExplorer/DiagnosticsExplorer.Rproj deleted file mode 100644 index 8e3c2ebc9..000000000 --- a/inst/shiny/DiagnosticsExplorer/DiagnosticsExplorer.Rproj +++ /dev/null @@ -1,13 +0,0 @@ -Version: 1.0 - -RestoreWorkspace: Default -SaveWorkspace: Default -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX diff --git a/inst/shiny/DiagnosticsExplorer/PlotsAndTables.R b/inst/shiny/DiagnosticsExplorer/PlotsAndTables.R deleted file mode 100644 index 007038fc4..000000000 --- a/inst/shiny/DiagnosticsExplorer/PlotsAndTables.R +++ /dev/null @@ -1,66 +0,0 @@ -getPercentileTable <- function(table,mytargetid,myoutcomeid,mydatabasename,columnName){ - myVar <- dplyr::sym(columnName) # giving error on my setup - - result <- table %>% - filter(targetid==mytargetid & - outcomeid==myoutcomeid) - result<- result %>% filter(type %in% c('N','0%','1%','5%','25%','50%','75%','95%','99%','100%')) - - if (!is.null(mydatabasename)){ - result <- result %>% filter(databasename==mydatabasename) %>% - select(!!c(year = 'year',myVar = columnName, type ='type')) %>% - arrange(year) %>% - spread(key=type,value=myVar,fill='NA') %>% - select(year,N,'0%','1%','5%','25%','50%','75%','95%','99%','100%') - } else { - result <- result %>% select(!!c(databasename = 'databasename', year = 'year', myVar = columnName, type ='type')) 
%>% - arrange(year) %>% - spread(key=type,value=myVar,fill='NA') %>% - select(databasename,year,N,'0%','1%','5%','25%','50%','75%','95%','99%','100%') - } - - return(result) -} - - -# box plots -myBoxplot <- function(myDatabasename,targetId,outcomeId,variable){ - - plotdata <- getPercentileTable(distribution, - targetId, - outcomeId, - NULL, - tolower(variable)) %>% - filter(databasename == myDatabasename) - - if(nrow(plotdata)==0){ - return(ggplot2::ggplot()) + - ggplot2::labs(title=paste('Database:', myDatabasename)) - } else { - - plotdata[plotdata=='NA'] <- 0 - colnames(plotdata) <- c('databasename','year','N','p0','p1','p5','p25','p50','p75','p95','p99','p100') - - plotResult <- ggplot2::ggplot(plotdata, ggplot2::aes(x=as.factor(year), - ymin= ifelse(is.na(p0), 0, as.double(as.character(p0))), - lower= ifelse(is.na(p25), 0, as.double(as.character(p25))), - middle= ifelse(is.na(p50), 0, as.double(as.character(p50))), - upper= ifelse(is.na(p75), 0, as.double(as.character(p75))), - ymax= ifelse(is.na(p100), 0, as.double(as.character(p100))), - color= as.factor(databasename))) + - ggplot2::geom_hline(yintercept=365, linetype="dashed", color = "black") + - ggplot2::geom_hline(yintercept=365*2, linetype="dashed", color = "black") + - ggplot2::geom_hline(yintercept=365*3, linetype="dashed", color = "black") + - ggplot2::geom_hline(yintercept=365*4, linetype="dashed", color = "black") + - ggplot2::geom_hline(yintercept=365*5, linetype="dashed", color = "black") + - #ggplot2::coord_flip() + - ggplot2::geom_boxplot(stat="identity") + - ggplot2::scale_x_discrete("Year") + - ggplot2::scale_y_continuous("Time in Days") + - ggplot2::theme(legend.position="none") + - ggplot2::labs(title=paste('Database:', myDatabasename)) - - return(plotResult) - } -} - diff --git a/inst/shiny/DiagnosticsExplorer/global.R b/inst/shiny/DiagnosticsExplorer/global.R deleted file mode 100644 index 4cebc84a5..000000000 --- a/inst/shiny/DiagnosticsExplorer/global.R +++ /dev/null @@ -1,120 +0,0 @@ 
-# check package dependancies - will move to launcher after study-a-thon -stop <- F -for(pkg in c('shiny', 'shinydashboard', 'DT', 'plotly', 'dplyr', 'tidyr','ggplot2', 'shinycssloaders')){ - if(is.na(tryCatch({utils::packageVersion(pkg)}, error = function(e) NA))){ - warning(paste0('Package ', pkg, ' not installed - please install')) - stop <- T - } -} -if(stop){stop('Need to install dependancies')} - -library(shiny) -library(shinydashboard) -library(DT) -library(plotly) -library(dplyr) -library(tidyr) -#library(scales) -#library(ggiraph) - - -rm(list=ls()[ls()%in%c('settings', 'namesdetails', 'characterization','distribution','proportion')]) -source("PlotsAndTables.R") - - -if (!exists("shinySettings")) { - if (file.exists("diagnostics")) { - shinySettings <- list(dataFolder = "diagnostics") - } else { - shinySettings <- list(dataFolder = "./diagnostics") - } - -} - -dataFolder <- shinySettings$dataFolder - -if (file.exists(file.path(dataFolder, "PreMerged.RData"))) { - writeLines("Using merged data detected in data folder") - load(file.path(dataFolder, "PreMerged.RData")) -} else { - zipFiles <- list.files(dataFolder, pattern = ".zip", full.names = TRUE) - - loadFile <- function(file, folder, overwrite) { - print(file) - tableName <- gsub(".csv$", "", file) - camelCaseName <- SqlRender::snakeCaseToCamelCase(tableName) - #data <- readr::read_csv(file.path(folder, file), col_types = readr::cols(), guess_max = 1e7, locale = readr::locale(encoding = "UTF-8")) - data <- utils::read.csv(file.path(folder, file)) - - colnames(data) <- SqlRender::snakeCaseToCamelCase(colnames(data)) - - if (!overwrite && exists(camelCaseName, envir = .GlobalEnv)) { - existingData <- get(camelCaseName, envir = .GlobalEnv) - if (nrow(existingData) > 0) { - if (nrow(data) > 0 && - all(colnames(existingData) %in% colnames(data)) && - all(colnames(data) %in% colnames(existingData))) { - data <- data[, colnames(existingData)] - } - - if (!isTRUE(all.equal(colnames(data), 
colnames(existingData), check.attributes = FALSE))) { - stop("Table columns do no match previously seen columns. Columns in ", - file, - ":\n", - paste(colnames(data), collapse = ", "), - "\nPrevious columns:\n", - paste(colnames(existingData), collapse = ", ")) - } - } - data <- rbind(existingData, data) - } - assign(camelCaseName, data, envir = .GlobalEnv) - - invisible(NULL) - } - - for (i in 1:length(zipFiles)) { - writeLines(paste("Processing", zipFiles[i])) - tempFolder <- tempfile() - dir.create(tempFolder) - unzip(zipFiles[i], exdir = tempFolder) - - csvFiles <- list.files(tempFolder, pattern = ".csv") - lapply(csvFiles, loadFile, folder = tempFolder, overwrite = (i == 1)) - - unlink(tempFolder, recursive = TRUE) - } -} - -namesdetails$names <- as.character(namesdetails$names) -# Fixing the labels (more to add) -getNames <- function(cohortid){ - res <- namesdetails %>% filter(ids == cohortid) %>% select(names) - res$names[1] -} -getId <- function(cohortname){ - res <- namesdetails %>% filter(names == cohortname) %>% select(ids) - res$ids[1] -} - -# Sort selectors -databases <- sort(as.list(unique(settings %>% select(cdmdatabasename)))$cdmdatabasename) - -outcomeCohorts <- lapply(as.list(unique(settings %>% select(outcomeid)))$outcomeid, getNames) -targetCohorts <- lapply(as.list(unique(settings %>% select(cohortid)))$cohortid, getNames) - -settings$tar <- unlist( - lapply(1:nrow(settings), function(x) paste0(settings$startanchor[x], ' + ', settings$riskwindowstart[x], - ' days - ', settings$endanchor[x], ' + ', settings$riskwindowend[x], ' days')) - ) -tars <- as.list(unique(settings %>% select(tar)))$tar -#tars <- unique(settings %>% select(riskwindowstart, startanchor, riskwindowend, endanchor)) -#tars <- lapply(1:nrow(tars), function(x) paste0(tars$startanchor[x], ' + ', tars$riskwindowstart[x], -# ' days - ', tars$endanchor[x], ' + ', tars$riskwindowend[x], ' days')) - -# Variable Selector -distributionVars <- 
c('daysFromObsStart','daysToObsEnd','daysToOutcomeAfterMin','daysToOutcomeBeforeMin') - - -writeLines("Data Loaded") - diff --git a/inst/shiny/DiagnosticsExplorer/html/about.html b/inst/shiny/DiagnosticsExplorer/html/about.html deleted file mode 100644 index 3f52b2bb3..000000000 --- a/inst/shiny/DiagnosticsExplorer/html/about.html +++ /dev/null @@ -1,4 +0,0 @@ -

Description

-

General informatiom about the tool. Also includes links to the protocol and code to execute the study

- - diff --git a/inst/shiny/DiagnosticsExplorer/html/characterizationInfo.html b/inst/shiny/DiagnosticsExplorer/html/characterizationInfo.html deleted file mode 100644 index a1563840b..000000000 --- a/inst/shiny/DiagnosticsExplorer/html/characterizationInfo.html +++ /dev/null @@ -1,3 +0,0 @@ -

Characterisation

- -Text to be added diff --git a/inst/shiny/DiagnosticsExplorer/html/databases.html b/inst/shiny/DiagnosticsExplorer/html/databases.html deleted file mode 100644 index 841fafa85..000000000 --- a/inst/shiny/DiagnosticsExplorer/html/databases.html +++ /dev/null @@ -1,8 +0,0 @@ -

Description

-

A table showing the meta data of the included databases in the Drug Utilization Study.

- -

Options

-

To be added

- - - diff --git a/inst/shiny/DiagnosticsExplorer/html/distributionInfo.html b/inst/shiny/DiagnosticsExplorer/html/distributionInfo.html deleted file mode 100644 index 8eda392bb..000000000 --- a/inst/shiny/DiagnosticsExplorer/html/distributionInfo.html +++ /dev/null @@ -1,4 +0,0 @@ -

Distributions/h3> - -Text to be added - diff --git a/inst/shiny/DiagnosticsExplorer/html/proportionInfo.html b/inst/shiny/DiagnosticsExplorer/html/proportionInfo.html deleted file mode 100644 index 3e07d9245..000000000 --- a/inst/shiny/DiagnosticsExplorer/html/proportionInfo.html +++ /dev/null @@ -1,3 +0,0 @@ -

Proportion

- -Text to be added diff --git a/inst/shiny/DiagnosticsExplorer/html/results.html b/inst/shiny/DiagnosticsExplorer/html/results.html deleted file mode 100644 index 748f8cf39..000000000 --- a/inst/shiny/DiagnosticsExplorer/html/results.html +++ /dev/null @@ -1,10 +0,0 @@ -

Description

-

Multiple tables and figures showing the results of the study.

- -

Options

-

On the left you first have to select the analysis for which you want to view the results. Other options become available, such as selectors for the ingredient, formulations etc. if these are applicable.

- -

A "Figures" tab becomes available for certain analyses on the top in the right part of the screen.

- - - diff --git a/inst/shiny/DiagnosticsExplorer/server.R b/inst/shiny/DiagnosticsExplorer/server.R deleted file mode 100644 index fe303f9e0..000000000 --- a/inst/shiny/DiagnosticsExplorer/server.R +++ /dev/null @@ -1,549 +0,0 @@ -prettyHr <- function(x) { - result <- sprintf("%.2f", x) - result[is.na(x)] <- "NA" - result <- suppressWarnings(format(as.numeric(result), big.mark=",")) # add thousands separator - return(result) -} - -addThousandsSeparator<-function(table){ - if(is.data.frame(table)){ - is.num <- sapply(table, is.numeric) - table[is.num] <- lapply(table[is.num], function(x) format(as.numeric(x), big.mark=",")) - return(table) - } else { - is.not.na<- !sapply(suppressWarnings(as.numeric(table)), is.na) - table[is.not.na] <- format(as.numeric(table[is.not.na]), big.mark=",") - return(table) - } - -} - -getHoveroverStyle <- function(left_px, top_px) { - style <- paste0("position:absolute; z-index:100; background-color: rgba(245, 245, 245, 0.85); ", - "left:", - left_px - 200, - "px; top:", - top_px - 130, - "px; width:400px;") -} - - - -shinyServer(function(input, output, session) { - - - # Tables - - output$proportionTable <- renderDT({ - - analysisId <- settings %>% filter(cdmdatabasename == input$pdatabase & - tar == input$ptar & - outcomeid == getId(input$poutcomeName) & - cohortid == getId(input$ptargetName) - ) %>% select(analysisid) - analysisId <-analysisId$analysisid[1] - - if(input$pxyear){ - proportionAll <- proportion %>% dplyr::filter(agegroup == 'all') - }else{ - proportionAll <- proportion %>% dplyr::filter(year == 'all') - } - - table <- proportionAll %>% filter(analysisid == analysisId) %>% - select(year,agegroup,gender,n,o,opercent) - - if(input$pgender){ - table <- table %>% filter(gender != -1) - }else{ - table <- table %>% filter(opercent != -1) %>% - group_by(year,agegroup) %>% - summarise(n = sum(as.double(as.character(n)), na.rm = T), - o = sum(as.double(as.character(o)), na.rm = T), - opercent = sum(o, na.rm = T)*100/sum(n, na.rm 
= T)) - } - - # get the correct columns - if(input$pxyear & input$pgender){ - table <-table %>% ungroup() %>% - select(year,gender,n,o,opercent) - } else if(!input$pxyear & input$pgender){ - table <-table %>% ungroup() %>% - select(agegroup,gender,n,o,opercent) - } else if(!input$pxyear & !input$pgender){ - table <-table %>% ungroup() %>% - select(agegroup,n,o,opercent) - } else if(input$pxyear & !input$pgender){ - table <- table %>% ungroup() %>% - select(year,n,o,opercent) - } - - table$opercent = as.numeric(prettyHr(table$opercent)) - result<-datatable(table, - filter="top", - options = list(pageLenth=25, - scrollX = TRUE, - dom='Blfrtip', - buttons=c('colvis','csv','excel')), - extensions = 'Buttons', - rownames = FALSE, - escape = FALSE, - class = "stripe nowrap compact") - return(result)} - ) - - output$proportionPlot <- renderPlotly({ - mySubplot <- function(myanalysisId,myDatabasename, myTar){ - - if(input$pxyear){ - proportionAll <- proportion %>% dplyr::filter(agegroup == 'all') - }else{ - proportionAll <- proportion %>% dplyr::filter(year == 'all') - } - - plotdata <- proportionAll %>% filter(analysisid == myanalysisId) - plotdataM <- plotdata %>% filter(gender == 8507) - plotdataF <- plotdata %>% filter(gender == 8532) - plotdataA <- plotdata %>% filter(opercent != -1) %>% - group_by(year,agegroup) %>% - summarise(n = sum(as.double(as.character(n)), na.rm = T), - o = sum(as.double(as.character(o)), na.rm = T), - opercent = sum(o, na.rm = T)*100/sum(n, na.rm = T)) %>% - ungroup() %>% data.frame() - - fig1 <- plot_ly() - if(input$pxyear){ - levels <- c(2005:2020) - } else{ - levels <- c('0 - 4','5 - 9','10 - 14','15 - 19', '20 - 24', - '25 - 29','30 - 34','35 - 39','40 - 44','45 - 49', - '50 - 54','55 - 59','60 - 64','65 - 69','70 - 74', - '75 - 79','80 - 84','85 - 89','90 - 94','95 - 99') - } - plotdataF$var <- factor(as.character(plotdataF[,ifelse(input$pxyear,'year','agegroup')]), - levels = levels,ordered = TRUE) - plotdataM$var <- 
factor(as.character(plotdataM[,ifelse(input$pxyear,'year','agegroup')]), - levels = levels, ordered = TRUE) - plotdataA$var <- factor(as.character(plotdataA[,ifelse(input$pxyear,'year','agegroup')]), - levels = levels, ordered = TRUE) - plotdataF <- plotdataF[order(plotdataF$var),] - plotdataM <- plotdataM[order(plotdataM$var),] - plotdataA <- plotdataA[order(plotdataA$var),] - - if(input$pgender){ - fig1 <- fig1 %>% - add_trace( - x = plotdataF$var, - y = plotdataF$opercent, - type = 'scatter', - name = '', - mode = 'lines+markers', - showlegend = FALSE, - hovertemplate = paste('',ifelse(input$pxyear,'year','agegroup'),': ',plotdataF$var, - '
T size ',plotdataF$n,'
', - 'O percent: %{y:.2f}', - '
') - ) - - fig1 <- fig1 %>% - add_trace( - x = plotdataM$var, - y = plotdataM$opercent, - type = 'scatter', - name = '', - mode = 'lines+markers', - showlegend = FALSE, - hovertemplate = paste('',ifelse(input$pxyear,'year','agegroup'),': ',plotdataM$var, - '
T size ',plotdataM$n,'
', - 'O percent: %{y:.2f}', - '
') - #markers=list(color='red',dash='dashed') - ) - } else{ - fig1 <- fig1 %>% - add_trace( - x = plotdataA$var, - y = plotdataA$opercent, - type = 'scatter', - name = '', - mode = 'lines+markers', - showlegend = FALSE, - hovertemplate = paste('',ifelse(input$pxyear,'year','agegroup'),': ',plotdataA$var, - '
T size ',plotdataA$n,'
', - 'O percent: %{y:.2f}', - '
') - ) - } - - fig1 - } - - subplots_list <- list() - ind <- 1 - for(i in 1:length(input$databases)){ - - analysisId <- settings %>% filter(cdmdatabasename == input$pdatabases[i] & - tar == input$ptar & - outcomeid == getId(input$poutcomeName) & - cohortid == getId(input$ptargetName) - ) %>% select(analysisid) - analysisId <-analysisId$analysisid[1] - - # if there is a result add it - if(!is.na(analysisId)){ - tempPlot <- mySubplot(analysisId, input$databases[i], input$ptar) - subplots_list[[ind]] <- tempPlot - ind <- ind+1 - } - } - - - - fig <- subplot(subplots_list, nrows=length(input$databases),shareX = F, shareY =F) - - return(fig) - }) - - - output$survivalPlot <- renderPlot({ - - data <- survival %>% - dplyr::filter(outcomeid == getId(input$soutcomeName)) %>% - dplyr::filter(cohortid == getId(input$stargetName)) %>% - dplyr::filter(cdmdatabasename == input$sdatabase) - - data <- data %>% dplyr::mutate(decreaseP = events/(events+natrisk)) - - yaxis <- lapply(unique(data$censoredtime), function(i) 1-sum(data %>% dplyr::filter(censoredtime <= i) %>% dplyr::select(decreaseP))) - - extra <- data.frame(censoredtime = unique(data$censoredtime), - yaxis = unlist(yaxis)) - - data <- data %>% inner_join(extra, by = 'censoredtime') - - library(ggplot2) - ggplot() + - geom_step(data=data, mapping=aes(x=censoredtime, y=yaxis)) + - #geom_step(data=d, mapping=aes(x=x, y=y), direction="vh", linetype=3) + - geom_point(data=data, mapping=aes(x=censoredtime, y=yaxis), color="red") + - geom_vline(xintercept = 365, linetype="dotted", - color = "black", size=1) + - geom_vline(xintercept = 2*365, linetype="dotted", - color = "black", size=1) + - geom_vline(xintercept = 3*365, linetype="dotted", - color = "black", size=1) + - geom_vline(xintercept = 4*365, linetype="dotted", - color = "black", size=1) + - geom_vline(xintercept = 5*365, linetype="dotted", - color = "black", size=1) + - geom_vline(xintercept = 10*365, linetype="dotted", - color = "black", size=1) + - ylim(0, 1) + 
labs(x = "Time from index (days)", - y = "Outcome free") - - }) - - - output$characterizationTable <- renderDT({ - - analysisId <- settings %>% filter(cdmdatabasename == input$cdatabase & - tar == input$ctar & - outcomeid == getId(input$coutcomeName) & - cohortid == getId(input$ctargetName) - ) %>% select(analysisid) - analysisId <-analysisId$analysisid[1] - - table <- characterization %>% filter(analysisid == analysisId) - table$covariatemeanwithoutcome = as.numeric(prettyHr(table$covariatemeanwithoutcome)) - table$covariatemeanwithnooutcome = as.numeric(prettyHr(table$covariatemeanwithnooutcome)) - result<-datatable(table, - filter="top", - options = list(pageLenth=25, - scrollX = TRUE, - dom ='Blfrtip', - buttons=c('colvis','csv','excel')), - extensions = 'Buttons', - rownames = FALSE, - escape = FALSE, - class = "stripe nowrap compact") - return(result)} - ) - - - output$characterizationPlot <- renderPlotly({ - mySubplot <- function(myanalysisId,myDatabasename, myTar){ - - plotdata <- characterization %>% filter(analysisid == myanalysisId) - - plotdataMeas <- plotdata %>% filter(covariatemeanwithnooutcome > 1 | covariatemeanwithoutcome > 1) - plotdata <- plotdata %>% filter(covariatemeanwithnooutcome < 1 & covariatemeanwithoutcome < 1) - - - fig1 <- plot_ly() - fig2 <- plot_ly() - - fig1 <- fig1 %>% - add_trace( - type = 'scatter', - name = '', - mode = 'markers', - x = plotdata$covariatemeanwithnooutcome, - y = plotdata$covariatemeanwithoutcome, - showlegend = FALSE, - hovertemplate = paste(plotdata$covariatename,'
', - 'Mean with outcome: %{y:.2f}', - '
Mean with no outcome: %{x}
') - ) %>% - add_trace(x=c(0, 1), y=c(0, 1), - type="scatter", mode="lines", name='x = y') %>% - - layout(annotations = list(x = 0.5, y = 1.05, - text=sprintf("Database: %s - TAR: %s",myDatabasename,myTar ), - xref='paper', yref='paper', showarrow=F), - xaxis = list(title = "Non-outcomes"), - yaxis = list(title = "Outcomes"), - showlegend = FALSE) - - if(nrow(plotdataMeas)>0){ - fig2 <- fig2 %>% - add_trace( - type = 'scatter', - name = '', - mode = 'markers', - x = plotdataMeas$covariatemeanwithnooutcome, - y = plotdataMeas$covariatemeanwithoutcome, - showlegend = FALSE, - hovertemplate = paste(plotdataMeas$covariatename,'
', - 'Mean with outcome: %{y:.2f}', - '
Mean with no outcome: %{x}
') - ) %>% - add_trace(x=c(0, max(c(plotdataMeas$covariatemeanwithnooutcome,plotdataMeas$covariatemeanwithoutcome))), - y=c(0, max(c(plotdataMeas$covariatemeanwithnooutcome,plotdataMeas$covariatemeanwithoutcome))), - type="scatter", mode="lines", name='x = y') %>% - layout(#xaxis = list(title = "Varible mean of non-outcomes"), - #yaxis = list(title = "Varible mean of outcomes"), - showlegend = FALSE) - } - - list(fig1,fig2) - - } - - subplots_list <- list() - ind <- 1 - for(i in 1:length(input$databases)){ - - analysisId <- settings %>% filter(cdmdatabasename == input$cdatabases[i] & - tar == input$ctar & - outcomeid == getId(input$coutcomeName) & - cohortid == getId(input$ctargetName) - ) %>% select(analysisid) - analysisId <-analysisId$analysisid[1] - - # if there is a result add it - if(!is.na(analysisId)){ - tempPlot <- mySubplot(analysisId, input$databases[i], input$ctar) - subplots_list[[ind]] <- tempPlot[[1]] - ind <- ind+1 - subplots_list[[ind]] <- tempPlot[[2]] - ind <- ind+1 - } - } - - - - fig <- subplot(subplots_list, nrows=length(input$databases),shareX = F, shareY =F) - - return(fig) - }) - - - - - - - output$distributionTable <- renderDataTable({ - table <- getPercentileTable(distribution,getId(input$targetName),getId(input$outcomeName),input$database,tolower(input$variable)) - options(digits = 2) - - selection = list(mode = "single", target = "row") - table <- datatable( - table, - extensions = c('Buttons','FixedColumns'), - options = list( - aoColumnDefs = list(list(className= 'dt-left', targets = "_all")), - pageLength = 50, - ordering = FALSE, - dom ='Blfrtip', - scrollX = TRUE, - fixedColumns = TRUE, - buttons = - list( - 'copy', - 'print', - list( - extend = 'collection', - buttons = c('colvis','csv', 'excel'), - text = 'Download' - ) - ) - ), - #options = options, - selection = selection, - rownames = FALSE, - escape = FALSE, - class = "stripe nowrap compact" - ) - return(table) - - }) - - # Plot titles - output$distributionTimePlotTitle <- 
renderText( - paste0(input$variable, " over the years in ", paste(input$databases, collapse=', ')) - ) - - # Plots - output$distributionTimePlot <- renderPlotly({ - mySubplot <- function(myDatabasename){ - fig <- plot_ly() - plotdata <- - getPercentileTable(distribution, - getId(input$targetName), - getId(input$outcomeName), - NULL, - tolower(input$variable)) %>% - filter(databasename == myDatabasename) - date <- as.Date(paste(plotdata$year, "01", "01", sep = ""), "%Y%m%d") - fig <- fig %>% - add_trace( - type = 'scatter', - name = '', - mode = 'lines+markers', - x = date, - y = as.numeric(plotdata$`50%`), - showlegend = FALSE, - hovertemplate = paste(myDatabasename,'
', - 'Median: %{y:.2f}', - '
Year: %{x}
') - ) %>% - - add_trace( - type = 'scatter', - name = '', - mode = 'lines+markers', - x = date, - y = as.numeric(plotdata$`95%`), - showlegend = FALSE, - hovertemplate = paste(myDatabasename,'
', - 'P95: %{y:.2f}', - '
Year: %{x}
') - ) %>% - layout(annotations = list(x = 0.5, y = 1.05, text=sprintf("%s",myDatabasename), xref='paper', yref='paper', showarrow=F)) - } - - subplots_list <- list() - for(i in 1:length(input$databases)){ - subplots_list[[i]] <- mySubplot(input$databases[i]) - } - - fig <- subplot(subplots_list, nrows=length(input$databases),shareX = T, shareY =T) - - return(fig) - }) - - output$distributionBoxPlot <- renderPlot({ - - subplots_list <- list() - for(i in 1:length(input$databases)){ - subplots_list[[i]] <- myBoxplot(input$databases[i],getId(input$targetName),getId(input$outcomeName),input$variable) - } - - require(gridExtra) - fig <- do.call("grid.arrange", c(subplots_list, ncol=length(input$databases))) - - return(fig) - }) - - getdistributionTooltip <- function( - top_px, - point) { - text <- 'Test hoover' - #text <- gsub("-", "<", sprintf("%s proportion: %0.3f per 1000 persons", proportionType, point$proportion)) - # if (!is.na(point$ageGroup)) { - # text <- paste(text, sprintf("Age group: %s years", point$ageGroup), sep = "
") - # top_px <- top_px - 15 - # } - # if (!is.na(point$gender)) { - # text <- paste(text, sprintf("Gender: %s", point$gender), sep = "
") - # top_px <- top_px - 15 - # } - # if (!is.na(point$calendarYear)) { - # text <- paste(text, sprintf("Calendar year: %s", point$calendarYear), sep = "
") - # top_px <- top_px - 15 - # } - # if (!is.na(point$cohortCount)) { - # text <- paste(text, sprintf("%s patients: %s", proportionType, scales::comma(point$cohortCount)), sep = "
") - # top_px <- top_px - 15 - # } - # if (!is.na(point$numPersons)) { - # text <- paste(text, sprintf("Denominator: %s", scales::comma(point$numPersons)), sep = "
") - # top_px <- top_px - 15 - # } - # text <- paste(text, sprintf("Database: %s", point$databaseId), sep = "
") - return(list(top_px = top_px, text = text)) - } - - output$hoverInfoDistribution <- renderUI({ - data <- getPercentileTable(distribution,getId(input$targetName),getId(input$outcomeName),NULL,tolower(input$variable)) %>% - filter(databasename %in% input$databases) - if (is.null(data)) { - return(NULL) - }else { - hover <- input$plotHoverDistribution - point <- nearPoints(data, hover, threshold = 5, maxpoints = 1, addDist = TRUE) - if (nrow(point) == 0) { - return(NULL) - } - left_px <- hover$coords_css$x - top_px <- hover$coords_css$y - - tooltip <- getProportionTooltip(top_px, point) - style <- getHoveroverStyle(left_px = left_px, top_px = tooltip$top_px) - div( - style = "position: relative; width: 0; height: 0", - wellPanel( - style = style, - p(HTML(tooltip$text)) - ) - ) - } - }) - - # Functionality for help messages - showInfoBox <- function(title, htmlFileName) { - showModal(modalDialog( - title = title, - easyClose = TRUE, - footer = NULL, - size = "l", - HTML(readChar(htmlFileName, file.info(htmlFileName)$size) ) - )) - } - - observeEvent(input$aboutInfo, { - showInfoBox("About", "html/about.html") - }) - - observeEvent(input$incidenceInfo, { - showInfoBox("Study Results", "html/incidenceInfo.html") - }) - - observeEvent(input$characterizationInfo, { - showInfoBox("Study Results", "html/characterizationInfo.html") - }) - - observeEvent(input$distributionInfo, { - showInfoBox("Study Results", "html/distributionInfo.html") - }) - -}) diff --git a/inst/shiny/DiagnosticsExplorer/ui.R b/inst/shiny/DiagnosticsExplorer/ui.R deleted file mode 100644 index 7e6226464..000000000 --- a/inst/shiny/DiagnosticsExplorer/ui.R +++ /dev/null @@ -1,214 +0,0 @@ -library(shinydashboard) -library(shiny) -library(DT) -library(plotly) -library(shinyWidgets) - -addInfo <- function(item, infoId) { - infoTag <- tags$small( - class = "badge pull-right action-button", - style = "padding: 1px 6px 2px 6px; background-color: steelblue;", - type = "button", - id = infoId, - "i" - ) - 
item$children[[1]]$children <- - append(item$children[[1]]$children, list(infoTag)) - return(item) -} - -dashboardPage( - dashboardHeader(title = "Diagnostics Explorer"), - dashboardSidebar( - sidebarMenu( - id = "tabs", - ## Tabs - addInfo(menuItem("About", tabName = "about"), "aboutInfo"), - addInfo(menuItem("Proportion", tabName = "proportion"), "proportionInfo"), - addInfo(menuItem("Survival", tabName = "survival"), "survivalInfo"), - addInfo(menuItem("Characterization", tabName = "characterization"), "characterizationInfo"), - addInfo(menuItem("Distribution", tabName = "distribution"), "distributionInfo"), - - ## Option panel - - # propoortion options - # characterization options - conditionalPanel( - condition = "input.tabs=='proportion'", - selectInput("ptargetName", "Target", targetCohorts), - selectInput("poutcomeName", "Outcome", outcomeCohorts), - selectInput("ptar","Time-at-risk",tars), - shinyWidgets::switchInput("pgender", "Gender Split",value = TRUE, width = '80%'), - shinyWidgets::switchInput("pxyear",value = TRUE, "By year") - ), - conditionalPanel( - condition = "input.tabs=='proportion' && input.proportionTabsetPanel == 'Tables'", - selectInput("pdatabase", "Database", databases)), - - conditionalPanel(condition = "input.tabs == 'proportion' && (input.proportionTabsetPanel == 'Figure' )", - hr(), - checkboxGroupInput("pdatabases", "Database", databases, selected = databases[1]) - ), - - - # survival - conditionalPanel( - condition = "input.tabs=='survival'", - selectInput("stargetName", "Target", targetCohorts), - selectInput("soutcomeName", "Outcome", outcomeCohorts), - selectInput("sdatabase", "Database", databases) - ), - - - # distribution options - conditionalPanel( - condition = "input.tabs=='distribution' && input.distributionTabsetPanel == 'Tables'", - selectInput("database", "Database", databases)), - conditionalPanel( - condition = "input.tabs=='distribution'", - - selectInput("targetName", "Target", targetCohorts), - 
selectInput("outcomeName", "Outcome", outcomeCohorts), - selectInput("variable","distributionVar",distributionVars) - ), - conditionalPanel(condition = "input.tabs == 'distribution' && (input.distributionTabsetPanel == 'Time Trend' | input.distributionTabsetPanel == 'Box Plot' )", - hr(), - checkboxGroupInput("databases", "Database", databases, selected = databases[1]) - ), - # characterization options - conditionalPanel( - condition = "input.tabs=='characterization'", - selectInput("ctargetName", "Target", targetCohorts), - selectInput("coutcomeName", "Outcome", outcomeCohorts), - selectInput("ctar","Time-at-risk",tars) - ), - conditionalPanel( - condition = "input.tabs=='characterization' && input.characterizationTabsetPanel == 'Tables'", - selectInput("cdatabase", "Database", databases)), - - conditionalPanel(condition = "input.tabs == 'characterization' && (input.characterizationTabsetPanel == 'Figure' )", - hr(), - checkboxGroupInput("cdatabases", "Database", databases, selected = databases[1]) - ) - ) - - ), - dashboardBody( - - tags$body(tags$div(id="ppitest", style="width:1in;visible:hidden;padding:0px")), - tags$script('$(document).on("shiny:connected", function(e) { - var w = window.innerWidth; - var h = window.innerHeight; - var d = document.getElementById("ppitest").offsetWidth; - var obj = {width: w, height: h, dpi: d}; - Shiny.onInputChange("pltChange", obj); - }); - $(window).resize(function(e) { - var w = $(this).width(); - var h = $(this).height(); - var d = document.getElementById("ppitest").offsetWidth; - var obj = {width: w, height: h, dpi: d}; - Shiny.onInputChange("pltChange", obj); - }); - '), - - tabItems( - tabItem( - tabName = "about", - br(), - p( - "This interactive web-based application provides Diagnostics for a Patient-Level Prediction study." - ), - h3("Rationale and background"), - p( - " The idea is to first generate Diagnostics for the prediction problem before the execution. [MORE TO ADD]" - ), - h3("External links"), - HTML("

Below are links for study-related artifacts that have been made available as part of this study:

"), - HTML("
    "), - HTML("
  • The study is registered here (to add)"), - HTML("
  • The full source code for the study will be made available once the study is finalized"), - HTML("
"), - h3("Development Status"), - p( - " The results in this application are currently under review and should be treated as preliminary at this moment." - ) - ) - , - tabItem(tabName = "proportion", - tabsetPanel( - id = "proportionTabsetPanel", - tabPanel( - "Figure", - box( - width = 12, - br(), - shinycssloaders::withSpinner(plotlyOutput("proportionPlot")) - ) - ), - tabPanel( - "Tables", - shinycssloaders::withSpinner(dataTableOutput("proportionTable")) - ) - ) - ) - , - - tabItem(tabName = "survival", - box( - width = 12, - br(), - shinycssloaders::withSpinner(plotOutput("survivalPlot")) - ) - - - ), - - tabItem(tabName = "characterization", - tabsetPanel( - id = "characterizationTabsetPanel", - tabPanel( - "Tables", - shinycssloaders::withSpinner(dataTableOutput("characterizationTable")) - ), - tabPanel( - "Figure", - box( - width = 12, - br(), - shinycssloaders::withSpinner(plotlyOutput("characterizationPlot")) - ) - ) - ) - ), - tabItem(tabName = "distribution", - tabsetPanel( - id = "distributionTabsetPanel", - tabPanel( - "Tables", - dataTableOutput("distributionTable") - ), - tabPanel( - "Time Trend", - box( - title = textOutput("distributionTimePlotTitle"), - width = 12, - br(), - shinycssloaders::withSpinner(plotlyOutput("distributionTimePlot")) - ) - ), - tabPanel( - "Box Plot", - box( - width = 12, - br(), - shinycssloaders::withSpinner(plotOutput("distributionBoxPlot")) - ) - ) - - ) - ) - - ) - ) -) diff --git a/inst/shiny/DiagnosticsExplorer/www/favicon.ico b/inst/shiny/DiagnosticsExplorer/www/favicon.ico deleted file mode 100644 index 849a1fa44..000000000 Binary files a/inst/shiny/DiagnosticsExplorer/www/favicon.ico and /dev/null differ diff --git a/inst/shiny/DiagnosticsExplorer/www/logo.png b/inst/shiny/DiagnosticsExplorer/www/logo.png deleted file mode 100644 index c6307af6b..000000000 Binary files a/inst/shiny/DiagnosticsExplorer/www/logo.png and /dev/null differ diff --git a/inst/shiny/PLPViewer/PLPViewer.Rproj 
b/inst/shiny/PLPViewer/PLPViewer.Rproj deleted file mode 100644 index 8e3c2ebc9..000000000 --- a/inst/shiny/PLPViewer/PLPViewer.Rproj +++ /dev/null @@ -1,13 +0,0 @@ -Version: 1.0 - -RestoreWorkspace: Default -SaveWorkspace: Default -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX diff --git a/inst/shiny/PLPViewer/databaseExtras.R b/inst/shiny/PLPViewer/databaseExtras.R deleted file mode 100644 index 07aeab2ed..000000000 --- a/inst/shiny/PLPViewer/databaseExtras.R +++ /dev/null @@ -1,407 +0,0 @@ -# repo files - -editColnames <- function(cnames, edits){ - lwcnames <- tolower(cnames) - - for(edit in edits){ - if(tolower(edit)%in%lwcnames){ - cnames[tolower(edit)==lwcnames] <- edit - } - } - return(cnames) - -} - -editTar <- function(summaryTable){ - - summaryTable <- summaryTable %>% dplyr::mutate(TAR = paste0('(',trimws(.data$tarStartAnchor),' + ',.data$tarStartDay, ') - (',trimws(.data$tarEndAnchor),' + ',.data$tarEndDay, ')' )) %>% - dplyr::select(-c(.data$tarStartAnchor, .data$tarStartDay, .data$tarEndAnchor, .data$tarEndDay)) - - return(summaryTable) -} - - -getDbSummary <- function(con, mySchema, targetDialect, myTableAppend = '' ){ - ParallelLogger::logInfo("gettingDb summary") - - sql <- "SELECT distinct s.study_id, - results.result_id, - results.model_id as analysis_id, - results.researcher_id, - d.database_acronym AS Dev, - d.database_acronym AS Val, - targets.cohort_name AS T, outcomes.cohort_name AS O, - model_settings.model_type AS model, - model_designs.covariate_setting_id, - tars.tar_start_day, tars.tar_start_anchor, tars.tar_end_day, tars.tar_end_anchor, - ROUND(aucResult.auc, 3) as auc, - ROUND(auprcResult.auprc,4) as auprc, - nResult.population_size, - oResult.outcome_count, - ROUND(nTest.test_size*100.0/nResult.population_size, 1) as eval_percent, - ROUND(oResult.outcome_count*100.0/nResult.population_size,4) as outcome_percent - - FROM 
@my_schema.@my_table_appendresults AS results INNER JOIN @my_schema.@my_table_appendmodels AS models - ON results.model_id = models.model_id and - results.database_id = models.database_id - - inner join @my_schema.@my_table_appendmodel_designs as model_designs - on model_designs.model_design_id = models.model_design_id and - results.target_id = model_designs.target_id and - results.outcome_id = model_designs.outcome_id and - results.tar_id = model_designs.tar_id and - results.population_setting_id = model_designs.population_setting_id - - inner join @my_schema.@my_table_appendmodel_settings as model_settings - on model_settings.model_setting_id = model_designs.model_setting_id - - - INNER JOIN @my_schema.@my_table_appendstudy_models AS s on models.model_id = s.model_id - - - LEFT JOIN (SELECT cohort_id, cohort_name FROM @my_schema.@my_table_appendcohorts) AS targets ON results.target_id = targets.cohort_id - LEFT JOIN (SELECT cohort_id, cohort_name FROM @my_schema.@my_table_appendcohorts) AS outcomes ON results.outcome_id = outcomes.cohort_id - LEFT JOIN @my_schema.@my_table_appenddatabase_details AS d ON results.database_id = d.database_id - LEFT JOIN @my_schema.@my_table_appendtars AS tars ON results.tar_id = tars.tar_id - LEFT JOIN (SELECT result_id, value AS auc FROM @my_schema.@my_table_appendevaluation_statistics where metric = 'AUROC' and evaluation in ('Test','Validation') ) AS aucResult ON results.result_id = aucResult.result_id - LEFT JOIN (SELECT result_id, value AS auprc FROM @my_schema.@my_table_appendevaluation_statistics where metric = 'AUPRC' and evaluation in ('Test','Validation') ) AS auprcResult ON results.result_id = auprcResult.result_id - LEFT JOIN (SELECT result_id, sum(value) AS population_size FROM @my_schema.@my_table_appendevaluation_statistics where metric = 'populationSize' and evaluation in ('Test','Train') group by result_id) AS nResult ON results.result_id = nResult.result_id - LEFT JOIN (SELECT result_id, sum(value) AS outcome_count 
FROM @my_schema.@my_table_appendevaluation_statistics where metric = 'outcomeCount' and evaluation in ('Test','Train') group by result_id) AS oResult ON results.result_id = oResult.result_id - LEFT JOIN (SELECT result_id, value AS test_size FROM @my_schema.@my_table_appendevaluation_statistics where metric = 'populationSize' and evaluation = 'Test') AS nTest ON results.result_id = nTest.result_id;" - - sql <- SqlRender::render(sql = sql, - my_schema = mySchema, - my_table_append = myTableAppend) - - sql <- SqlRender::translate(sql = sql, targetDialect = targetDialect) - - summaryTable <- DatabaseConnector::dbGetQuery(conn = con, statement = sql) - colnames(summaryTable) <- SqlRender::snakeCaseToCamelCase(colnames(summaryTable)) - - summaryTable$t <- trimws(summaryTable$t) - summaryTable$o <- trimws(summaryTable$o) - - summaryTable <- summaryTable %>% - dplyr::rename(`Covariate setting` = covariateSettingId) %>% - dplyr::rename(`T Size` = populationSize) %>% - dplyr::rename(`O Count` = outcomeCount) %>% - dplyr::rename(`Val (%)` = evalPercent) %>% - dplyr::rename(`O Incidence (%)` = outcomePercent) - - summaryTable <- editTar(summaryTable) - - colnames(summaryTable) <- editColnames(cnames = colnames(summaryTable), - edits = c('AUC','AUPRC', 'T', 'O', 'Dev','Val', 'TAR', 'Model')) - - summaryTable$timeStamp <- 0 - summaryTable$Analysis <- summaryTable$analysisId - ParallelLogger::logInfo("Got db summary") - return(summaryTable[,c('Dev', 'Val', 'T','O', 'Model','Covariate setting', - 'TAR', 'AUC', 'AUPRC', - 'T Size', 'O Count','Val (%)', 'O Incidence (%)', 'timeStamp', 'analysisId', 'researcherId', 'resultId', 'Analysis', 'studyId')]) - -} - - -getValSummary <- function(con, mySchema, modelId, targetDialect, myTableAppend = '' ){ - ParallelLogger::logInfo("getting Val summary") - - sql <- "SELECT results.result_id, results.model_id as analysis_id, - results.researcher_id, - --databases.database_acronym AS Dev, - d.database_acronym AS Val, - targets.cohort_name AS T, 
outcomes.cohort_name AS O, - model_settings.model_type AS model, - model_designs.covariate_setting_id, - tars.tar_start_day, tars.tar_start_anchor, tars.tar_end_day, tars.tar_end_anchor, - ROUND(aucResult.auc, 3) as auc, - ROUND(auprcResult.auprc,4) as auprc, - nResult.population_size, - oResult.outcome_count, - ROUND(nTest.test_size*100.0/nResult.population_size, 1) as eval_percent, - ROUND(oResult.outcome_count*100.0/nResult.population_size,4) as outcome_percent, - ROUND(calibration_in_large, 3) as calibration_in_large - - FROM @my_schema.@my_table_appendresults AS results INNER JOIN @my_schema.@my_table_appendmodels AS models - ON - results.model_id = models.model_id AND - results.model_id = @model_id - - inner join @my_schema.@my_table_appendmodel_designs as model_designs - on model_designs.model_design_id = models.model_design_id and - results.tar_id = model_designs.tar_id and - results.population_setting_id = model_designs.population_setting_id - - inner join @my_schema.@my_table_appendmodel_settings as model_settings - on model_settings.model_setting_id = model_designs.model_setting_id - - LEFT JOIN (SELECT cohort_id, cohort_name FROM @my_schema.@my_table_appendcohorts) AS targets ON results.target_id = targets.cohort_id - LEFT JOIN (SELECT cohort_id, cohort_name FROM @my_schema.@my_table_appendcohorts) AS outcomes ON results.outcome_id = outcomes.cohort_id - LEFT JOIN @my_schema.@my_table_appenddatabase_details AS d ON results.database_id = d.database_id - LEFT JOIN @my_schema.@my_table_appendtars AS tars ON results.tar_id = tars.tar_id - LEFT JOIN (SELECT result_id, value AS auc FROM @my_schema.@my_table_appendevaluation_statistics where metric = 'AUROC' and evaluation in ('Test','Validation') ) AS aucResult ON results.result_id = aucResult.result_id - LEFT JOIN (SELECT result_id, value AS auprc FROM @my_schema.@my_table_appendevaluation_statistics where metric = 'AUPRC' and evaluation in ('Test','Validation') ) AS auprcResult ON results.result_id = 
auprcResult.result_id - - LEFT JOIN (SELECT result_id, value AS calibration_in_large FROM @my_schema.@my_table_appendevaluation_statistics where metric = 'calibrationInLarge intercept' and evaluation in ('Test','Validation') ) AS CalibrationInLargeResult ON results.result_id = CalibrationInLargeResult.result_id - - LEFT JOIN (SELECT result_id, sum(value) AS population_size FROM @my_schema.@my_table_appendevaluation_statistics where metric = 'populationSize' and evaluation in ('Test','Train','Validation') group by result_id) AS nResult ON results.result_id = nResult.result_id - LEFT JOIN (SELECT result_id, sum(value) AS outcome_count FROM @my_schema.@my_table_appendevaluation_statistics where metric = 'outcomeCount' and evaluation in ('Test','Train','Validation') group by result_id) AS oResult ON results.result_id = oResult.result_id - LEFT JOIN (SELECT result_id, value AS test_size FROM @my_schema.@my_table_appendevaluation_statistics where metric = 'populationSize' and evaluation in ('Test','Validation')) AS nTest ON results.result_id = nTest.result_id;" - - sql <- SqlRender::render(sql = sql, - my_schema = mySchema, - model_id = modelId, - my_table_append = myTableAppend) - - sql <- SqlRender::translate(sql = sql, targetDialect = targetDialect) - - valTable <- DatabaseConnector::dbGetQuery(conn = con, statement = sql) - colnames(valTable) <- SqlRender::snakeCaseToCamelCase(colnames(valTable)) - - valTable$t <- trimws(valTable$t) - valTable$o <- trimws(valTable$o) - - valTable <- valTable %>% - dplyr::rename(`Covariate setting` = covariateSettingId) %>% - dplyr::rename(`T Size` = populationSize) %>% - dplyr::rename(`O Count` = outcomeCount) %>% - dplyr::rename(`Val (%)` = evalPercent) %>% - dplyr::rename(`O Incidence (%)` = outcomePercent) - - valTable <- editTar(valTable) - - #colnames(valTable) <- editColnames(cnames = colnames(valTable), - # edits = c('AUC','AUPRC', 'T', 'O', 'Dev','Val', 'TAR', 'Model')) - colnames(valTable) <- editColnames(cnames = 
colnames(valTable), - edits = c('AUC','AUPRC', 'T', 'O','Val', 'TAR', 'Model')) - - valTable$timeStamp <- 0 - valTable$Analysis <- valTable$analysisId - ParallelLogger::logInfo("got db summary") - #return(valTable[,c('Dev', 'Val', 'T','O', 'Model','Covariate setting', - return(valTable[,c('Val', 'T','O', 'Model','Covariate setting', - 'TAR', 'AUC', 'AUPRC', 'calibrationInLarge', - 'T Size', 'O Count','Val (%)', 'O Incidence (%)', 'timeStamp', 'analysisId', 'researcherId', 'resultId', 'Analysis')]) - -} - -getResult <- function(con, tableName, resultId, mySchema, targetDialect){ - sql <- "SELECT * FROM @my_schema.@table_name WHERE result_id = @result_id" - sql <- SqlRender::render(sql = sql, - my_schema = mySchema, - table_name = tableName, - result_id = resultId) - sql <- SqlRender::translate(sql = sql, targetDialect = targetDialect) - - result <- DatabaseConnector::dbGetQuery(conn = con, statement = sql) - colnames(result) <- SqlRender::snakeCaseToCamelCase(colnames(result)) - return(result) -} - - -loadPlpFromDb <- function(chosenRow, mySchema, con, val = F, targetDialect, myTableAppend = ''){ - resultId <- chosenRow$resultId - modelId <- chosenRow$analysisId - researcherId <- chosenRow$researcherId - result <- list() - result$performanceEvaluation <- list() - - if (!val){ - print(paste0('model: ', modelId)) - ## get hyper_param_search plpResult$model$hyperParamSearch <- ... 
- #old sql <- "SELECT population_setting_id, model_setting_id, covariate_setting_id, hyper_param_search FROM @my_schema.@my_table_appendmodels AS models WHERE model_id = @model_id;" - sql <- "SELECT population_setting_id, model_setting_id, covariate_setting_id, hyper_param_search FROM - @my_schema.@my_table_appendmodel_designs md inner join - @my_schema.@my_table_appendmodels AS models - on models.model_design_id = md.model_design_id - WHERE models.model_id = @model_id;" - - sql <- SqlRender::render(sql = sql, - my_schema = mySchema, - model_id = modelId, - my_table_append = myTableAppend) - sql <- SqlRender::translate(sql = sql, targetDialect = targetDialect) - ParallelLogger::logInfo("starting population, model setting and covariate setting") - - ids <- DatabaseConnector::dbGetQuery(conn = con, statement = sql) - colnames(ids) <- SqlRender::snakeCaseToCamelCase(colnames(ids)) - - ParallelLogger::logInfo("finishing population, model setting and covariate setting") - - popSetId <- ids$populationSettingId - modSetId <- ids$modelSettingId - covSetId <- ids$covariateSettingId - - hyperParamSearch <- jsonlite::unserializeJSON(ids$hyperParamSearch) - - #covariateSummary - #made this null to speed up programme - result$covariateSummary <- NULL - - #inputSetting - result$model <- list(settings = list()) - - sql <- "SELECT * FROM @my_schema.@my_table_appendmodel_settings AS model_setting WHERE model_setting_id = @model_setting_id" - sql <- SqlRender::render(sql = sql, - my_schema = mySchema, - model_setting_id = modSetId, - my_table_append = myTableAppend) - sql <- SqlRender::translate(sql = sql, targetDialect = targetDialect) - ParallelLogger::logInfo("start modeSet") - - tempModSettings <- DatabaseConnector::dbGetQuery(conn = con, statement = sql) - colnames(tempModSettings) <- SqlRender::snakeCaseToCamelCase(colnames(tempModSettings)) - ParallelLogger::logInfo("end modeSet") - - if(length(tempModSettings$modelSettingsJson)>0){ - result$model$settings$modelSettings <- 
jsonlite::unserializeJSON(tempModSettings$modelSettingsJson) - } else{ - result$model$settings$modelSettings <- list('missing', list(param = 'na')) - } - - sql <- "SELECT * FROM @my_schema.@my_table_appendcovariate_settings AS covariate_setting WHERE covariate_setting_id = @covariate_setting_id" - sql <- SqlRender::render(sql = sql, - my_schema = mySchema, - covariate_setting_id = covSetId, - my_table_append = myTableAppend - ) - sql <- SqlRender::translate(sql = sql, targetDialect = targetDialect) - ParallelLogger::logInfo("start covSet") - tempCovSettings <- DatabaseConnector::dbGetQuery(conn = con, statement = sql) - colnames(tempCovSettings) <- SqlRender::snakeCaseToCamelCase(colnames(tempCovSettings)) - ParallelLogger::logInfo("end covSet") - - if(length(tempCovSettings$covariateSettingsJson)>0){ - # old result$inputSetting$dataExtrractionSettings$covariateSettings <- jsonlite::unserializeJSON(tempCovSettings$covariateSettingsJson) - result$model$settings$covariateSettings <- jsonlite::fromJSON(tempCovSettings$covariateSettingsJson, simplifyVector = T, simplifyDataFrame = F, simplifyMatrix = T) - - extractAttributes <- function(x){ - - ind <- grep('attr_', names(x)) - - if(length(ind)>0){ - attributeValues <- x[ind] - x <- x[-ind] - names(attributeValues) <- gsub(pattern = 'attr_',replacement = '',x = names(attributeValues)) - attributeValues$names <- names(x) - attributes(x) <- attributeValues - } - - return(x) - } - - result$model$settings$covariateSettings <- lapply(result$model$settings$covariateSettings, function(x) extractAttributes(x)) - } else{ - result$model$settings$covariateSettings <- list() - } - - - sql <- "SELECT * FROM @my_schema.@my_table_appendpopulation_settings AS population_settings WHERE population_setting_id = @population_setting_id" - ParallelLogger::logInfo("start popSet") - sql <- SqlRender::render(sql = sql, - my_schema = mySchema, - population_setting_id = popSetId, - my_table_append = myTableAppend) - sql <- 
SqlRender::translate(sql = sql, targetDialect = targetDialect) - - tempPopSettings <- DatabaseConnector::dbGetQuery(conn = con, statement = sql) - colnames(tempPopSettings) <- SqlRender::snakeCaseToCamelCase(colnames(tempPopSettings)) - ParallelLogger::logInfo("end popSet") - - if(length(tempPopSettings$populationSettingsJson)>0){ - result$model$settings$populationSettings <- jsonlite::unserializeJSON(tempPopSettings$populationSettingsJson) - } else{ - result$model$settings$populationSettings <- NULL - } - - # attrition - sql <- "SELECT * FROM @my_schema.@my_table_appendattrition WHERE result_id = @result_id" - ParallelLogger::logInfo("start attrition") - sql <- SqlRender::render(sql = sql, - my_schema = mySchema, - result_id = resultId, - my_table_append = myTableAppend) - sql <- SqlRender::translate(sql = sql, targetDialect = targetDialect) - - tempAttrition <- DatabaseConnector::dbGetQuery(conn = con, statement = sql) - colnames(tempAttrition) <- SqlRender::snakeCaseToCamelCase(colnames(tempAttrition)) - ParallelLogger::logInfo("end popSet") - - result$model$trainDetails$attrition <- as.data.frame(tempAttrition) - - result$performanceEvaluation$demographicSummary <- getResult(con, paste0(myTableAppend,'demographic_summary'), resultId, mySchema, targetDialect = targetDialect) - result$performanceEvaluation$demographicSummary$evaluation <- trimws(result$performanceEvaluation$demographicSummary$evaluation) - result$performanceEvaluation$demographicSummary$ageGroup <- trimws(result$performanceEvaluation$demographicSummary$ageGroup) - result$performanceEvaluation$demographicSummary$genGroup <- trimws(result$performanceEvaluation$demographicSummary$genGroup) - colnames(result$performanceEvaluation$demographicSummary) <- editColnames(colnames(result$performanceEvaluation$demographicSummary), c('evaluation',"PersonCountAtRisk","PersonCountWithOutcome", "StDevPredictedProbability", - "MinPredictedProbability", "P25PredictedProbability", "P50PredictedProbability", - 
"P75PredictedProbability", "MaxPredictedProbability")) - result$performanceEvaluation$predictionDistribution <- getResult(con, paste0(myTableAppend,'prediction_distribution'), resultId,mySchema, targetDialect = targetDialect ) - result$performanceEvaluation$predictionDistribution$evaluation <- trimws(result$performanceEvaluation$predictionDistribution$evaluation) - result$performanceEvaluation$predictionDistribution$class <- result$performanceEvaluation$predictionDistribution$classLabel - colnames(result$performanceEvaluation$predictionDistribution) <- editColnames(colnames(result$performanceEvaluation$predictionDistribution), c('evaluation', "PersonCount", "StDevPredictedProbability", - "MinPredictedProbability", "P05PredictedProbability" , - "P25PredictedProbability", "MedianPredictedProbability", - "P75PredictedProbability" , "P95PredictedProbability","MaxPredictedProbability")) - - - result$model$trainDetails$hyperParamSearch <- hyperParamSearch - - } - - #performanceEvaluation - result$performanceEvaluation$evaluationStatistics <- getResult(con, paste0(myTableAppend,'evaluation_statistics'), resultId, mySchema, targetDialect = targetDialect ) - - result$performanceEvaluation$thresholdSummary <- getResult(con, paste0(myTableAppend,'threshold_summary'), resultId,mySchema, targetDialect = targetDialect) - result$performanceEvaluation$thresholdSummary$evaluation <- trimws(result$performanceEvaluation$thresholdSummary$evaluation) - - result$performanceEvaluation$calibrationSummary <- getResult(con, paste0(myTableAppend,'calibration_summary'), resultId,mySchema, targetDialect = targetDialect) - result$performanceEvaluation$calibrationSummary$evaluation <- trimws(result$performanceEvaluation$calibrationSummary$evaluation) - colnames(result$performanceEvaluation$calibrationSummary) <- editColnames(colnames(result$performanceEvaluation$calibrationSummary), c('evaluation',"PersonCountAtRisk","PersonCountWithOutcome","StDevPredictedProbability", - 
"MinPredictedProbability", "P25PredictedProbability", "MedianPredictedProbability", - "P75PredictedProbability", "MaxPredictedProbability")) - - - sql <- "SELECT researcher_name, researcher_email FROM @my_schema.@my_table_appendresearchers AS researchers WHERE researcher_id = @researcher_id" - ParallelLogger::logInfo("start researchers") - sql <- SqlRender::render(sql = sql, - my_schema = mySchema, - researcher_id = researcherId, - my_table_append = myTableAppend) - sql <- SqlRender::translate(sql = sql, targetDialect = targetDialect) - - result$researcherInfo <- DatabaseConnector::dbGetQuery(conn = con, statement = sql) - colnames(result$researcherInfo) <- SqlRender::snakeCaseToCamelCase(colnames(result$researcherInfo)) - - ParallelLogger::logInfo("end researchers") - result$model_id <- modelId - - # add intercept - ParallelLogger::logInfo("start intercept") - sql <- "SELECT intercept FROM @my_schema.@my_table_appendmodels AS models WHERE model_id = @model_id" - sql <- SqlRender::render(sql = sql, - my_schema = mySchema, - model_id = modelId, - my_table_append = myTableAppend) - sql <- SqlRender::translate(sql = sql, targetDialect = targetDialect) - - coefficient <- DatabaseConnector::dbGetQuery(conn = con, statement = sql) - colnames(coefficient) <- SqlRender::snakeCaseToCamelCase(colnames(coefficient)) - - result$model$model <- list(coefficient = coefficient$intercept[1]) - - #hack so the check in plot,utlipl.. 
doesnt break it - result$analysisRef <- "" - result$executionSummary <- "" - class(result) <- "runPlp" - ParallelLogger::logInfo("end") - return(result) -} - - - - - - - - diff --git a/inst/shiny/PLPViewer/emptyPlot.R b/inst/shiny/PLPViewer/emptyPlot.R deleted file mode 100644 index cac00fec5..000000000 --- a/inst/shiny/PLPViewer/emptyPlot.R +++ /dev/null @@ -1,14 +0,0 @@ -emptyPlot <- function(title = NULL){ - p <- plotly::plotly_empty(type = "scatter", mode = "markers") %>% - plotly::config( - displayModeBar = FALSE - ) %>% - plotly::layout( - title = list( - text = title, - yref = "paper", - y = 0.5 - ) - ) - return(p) -} diff --git a/inst/shiny/PLPViewer/global.R b/inst/shiny/PLPViewer/global.R deleted file mode 100644 index 7fd64720e..000000000 --- a/inst/shiny/PLPViewer/global.R +++ /dev/null @@ -1,118 +0,0 @@ -# EDIT FOR REPO OR DATABASE -useDatabase <- .GlobalEnv$shinySettings$useDatabase -usePlpObject <- .GlobalEnv$shinySettings$usePlpObject -useFileSystem <- .GlobalEnv$shinySettings$useFileSystem - -pathToMd <- "./www/shinyDescription.md" - - -# set default database values -connectionDetails <- NULL -mySchema <- NULL -targetDialect <- NULL -myTableAppend <- '' - -# extract data if database -if(useDatabase){ - - ParallelLogger::logInfo('Extracting results from database') - - result <- 'database' - inputType <- 'database' - validation <- NULL - - source("databaseExtras.R") - mySchema <- Sys.getenv("shinydbSchema") - myServer <- Sys.getenv("shinydbServer") - myUser <- Sys.getenv("shinydbUser") - myPassword <- Sys.getenv("shinydbPw") - targetDialect <- Sys.getenv("shinydbDbms") - myPort <- Sys.getenv("shinydbPort") - - myTableAppend <- Sys.getenv("shinydbTableAppend") - - if(myPort != ""){ - ParallelLogger::logInfo('Port') - ParallelLogger::logInfo(paste(myPort)) - con <- pool::dbPool(drv = DatabaseConnector::DatabaseConnectorDriver(), - dbms = targetDialect, - server = myServer, - user = myUser, - password = myPassword, - port = myPort) - - } else{ - 
ParallelLogger::logInfo('No Port') - con <- pool::dbPool(drv = DatabaseConnector::DatabaseConnectorDriver(), - dbms = targetDialect, - server = myServer, - user = myUser, - password = myPassword) - - } - - onStop(function() { - if (DBI::dbIsValid(con)) { - ParallelLogger::logInfo("Closing connection pool") - pool::poolClose(con) - } - }) - - summaryTable <- getDbSummary(con = con, - mySchema = mySchema, - targetDialect = targetDialect, - myTableAppend = myTableAppend) - -} - - -# if plpObect -if(usePlpObject){ - source("processing.R") - ParallelLogger::logInfo('Loading results from plpObject') - - if(!is.null(.GlobalEnv$shinySettings$result)){ - - result <- .GlobalEnv$shinySettings$result - - if(class(result)=='runPlp'){ - inputType <- 'plpResult' - } else if(sum(names(result)%in%c("prediction","performanceEvaluation","inputSetting","executionSummary","model","analysisRef","covariateSummary"))==7){ - inputType <- 'plpNoClass' - } else { - stop('Incorrect class for input result') - } - - validation <- .GlobalEnv$shinySettings$validation - - summaryTable <- getSummary(inputType = inputType, - result = result, - validation = validation) - } - -} - -# if fileSystem -if(useFileSystem){ - source("processing.R") - ParallelLogger::logInfo('Loading results from file system') - - if(!is.null(.GlobalEnv$shinySettings$result)){ - - valid <- ifelse(class(.GlobalEnv$shinySettings$result)=='character', dir.exists(.GlobalEnv$shinySettings$result),F) - - if(valid){ - - result <- 'file' - validation <- NULL - inputType <- 'file' - summaryTable <- getSummary(inputType = 'file', - result = .GlobalEnv$shinySettings$result) - - }else{ - print(paste0('invalid directory: ', .GlobalEnv$shinySettings$result)) - } - } -} - - diff --git a/inst/shiny/PLPViewer/helpers.R b/inst/shiny/PLPViewer/helpers.R deleted file mode 100644 index 250ef7f7c..000000000 --- a/inst/shiny/PLPViewer/helpers.R +++ /dev/null @@ -1,75 +0,0 @@ -# need to add mySchema and connectionDetails to input -getPlpResult <- 
function(result, - validation, - summaryTable, - inputType, - resultRow, - val = F, - mySchema = NULL, - connectionDetails = NULL, - targetDialect = NULL, - myTableAppend = NULL){ - -##ind <- resultRow() - - if(!is.null(resultRow())){ - print('Loading data') - print(paste0('input: ', inputType)) - - if(inputType == 'database'){ - tempResult <- loadPlpFromDb(summaryTable[resultRow(),], mySchema, con, val = val, targetDialect, myTableAppend) - return(tempResult) - } else if(inputType == 'plpResult'){ - i <- resultRow() - if(i == 1){ - tempResult <- result - tempResult$type <- 'test' - }else{ - tempResult <- validation$validation[[i-1]] - tempResult$type <- 'validation' - } - }else if(inputType == 'plpNoClass'){ - tempResult <- result - tempResult$type <- 'validation' - }else if( inputType == 'file') { - - # support rds, csv and runPlp objects - tempResult <- NULL - loc <- summaryTable[resultRow(),]$plpResultLocation - locLoaderFunc <- summaryTable[resultRow(),]$plpResultLoad - - if(dir.exists(as.character(loc))){ - tempResult <- do.call(as.character(locLoaderFunc), list(as.character(loc))) - tempResult$type <- ifelse(length(grep('/Validation',loc))>0,'validation','test') - } - }else { - stop('Incorrect class') - } - return(tempResult) - } else{ - return(NULL) - } -} - - - - -addInfo <- function(item, infoId) { - infoTag <- tags$small(class = "badge pull-right action-button", - style = "padding: 1px 6px 2px 6px; background-color: steelblue;", - type = "button", - id = infoId, - "i") - item$children[[1]]$children <- append(item$children[[1]]$children, list(infoTag)) - return(item) -} - -showInfoBox <- function(title, htmlFileName) { - shiny::showModal(shiny::modalDialog( - title = title, - easyClose = TRUE, - footer = NULL, - size = "l", - shiny::HTML(readChar(htmlFileName, file.info(htmlFileName)$size) ) - )) -} diff --git a/inst/shiny/PLPViewer/html/DataInfo.html b/inst/shiny/PLPViewer/html/DataInfo.html deleted file mode 100644 index 2b8e163f8..000000000 --- 
a/inst/shiny/PLPViewer/html/DataInfo.html +++ /dev/null @@ -1,4 +0,0 @@ -

Description

-

This button provides information about the data used in the app

- - \ No newline at end of file diff --git a/inst/shiny/PLPViewer/html/Description.html b/inst/shiny/PLPViewer/html/Description.html deleted file mode 100644 index 72e455ca2..000000000 --- a/inst/shiny/PLPViewer/html/Description.html +++ /dev/null @@ -1,3 +0,0 @@ -

Description

-

Information about the study and links to the code used to run the study

- \ No newline at end of file diff --git a/inst/shiny/PLPViewer/html/Help.html b/inst/shiny/PLPViewer/html/Help.html deleted file mode 100644 index 68c76d851..000000000 --- a/inst/shiny/PLPViewer/html/Help.html +++ /dev/null @@ -1,5 +0,0 @@ -

Description

-

This button provides a link to a YouTube video with a demonstration of the shiny app

- - - \ No newline at end of file diff --git a/inst/shiny/PLPViewer/html/Log.html b/inst/shiny/PLPViewer/html/Log.html deleted file mode 100644 index e47baf294..000000000 --- a/inst/shiny/PLPViewer/html/Log.html +++ /dev/null @@ -1,20 +0,0 @@ -

Description

- - -

This button shows the log when the model was developed or validated

- - - -

Selecting Model

- - -

To select a model to inspect either:

- -
    -
  • i) Select it from the dropdowm menu on the left hand side
  • - -
  • or ii) Click on a row in the summary view to select it - this will show as the row will be highlighted.
  • - -
- - \ No newline at end of file diff --git a/inst/shiny/PLPViewer/html/Model.html b/inst/shiny/PLPViewer/html/Model.html deleted file mode 100644 index 41fbfc0b6..000000000 --- a/inst/shiny/PLPViewer/html/Model.html +++ /dev/null @@ -1,27 +0,0 @@ -

Description

- -

The model button shows a plot and table with the characteristics of the patients with and without the outcome during the time-at-risk.

- - - -

Interpretation

-
    -
  • The plots show each covariate as a dot (binary covariates on the left side plot and measurements on the right side plot). The x-axis is the fraction of patients (or mean value) with the covariate in the patients without the outcome and the y-axis is the fraction of patients (or mean value) with the covariate in the patients with the outcome. Dots above the x=y line are more common in patients with the outcome and dots below the line are more common in patients without the outcome.
  • -
  • The table shows the covariate name, the variable importance or coefficient value, the mean value in those with and without the outcome and the standardized mean difference.
  • -
- - - -

Selecting Model

- - -

To select a model to inspect either:

- -
    -
  • i) Select it from the dropdowm menu on the left hand side
  • - -
  • or ii) Click on a row in the summary view to select it - this will show as the row will be highlighted.
  • - -
- - \ No newline at end of file diff --git a/inst/shiny/PLPViewer/html/Performance.html b/inst/shiny/PLPViewer/html/Performance.html deleted file mode 100644 index e01c313fc..000000000 --- a/inst/shiny/PLPViewer/html/Performance.html +++ /dev/null @@ -1,31 +0,0 @@ -

Description

- - -

The performance of the model including the operating characteristics at different risk cutoffs, the overall discrimination and the overall calibration.

- - - -

Tabs

- - -

The three tabs are:

-
    -
  • The 'Threshold dependent' tab shows the prediction question being explored and various operating characteristics for a range of risk cutoffs (the threshold bar is interactive and enables you to explore different values by moving the bar left or right)
  • -
  • The 'Discrimination' tab shows the AUROC, AUPRC, predicted risk distributions and F1 score
  • -
  • The 'Calibration' tab shows the generic calibration plot and the calibration per age group and gender.
  • -
- - -

Selecting Model

- - -

To select a model to inspect either:

- -
    -
  • i) Select it from the dropdown menu on the left hand side
  • - -
  • or ii) Click on a row in the summary view to select it - this will show as the row will be highlighted.
  • - -
- - diff --git a/inst/shiny/PLPViewer/html/Settings.html b/inst/shiny/PLPViewer/html/Settings.html deleted file mode 100644 index 9b7b625e5..000000000 --- a/inst/shiny/PLPViewer/html/Settings.html +++ /dev/null @@ -1,20 +0,0 @@ -

Description

- - -

Tabs containing the settings (model/population/covariate) selected when developing the model

- - - -

Selecting Model

- - -

To select a model to inspect either:

- -
    -
  • i) Select it from the dropdown menu on the left hand side
  • - -
  • or ii) Click on a row in the summary view to select it - this will show as the row will be highlighted.
  • - -
- - diff --git a/inst/shiny/PLPViewer/html/Summary.html b/inst/shiny/PLPViewer/html/Summary.html deleted file mode 100644 index 8df81cacb..000000000 --- a/inst/shiny/PLPViewer/html/Summary.html +++ /dev/null @@ -1,51 +0,0 @@ -

Description

- -

A table showing summary information for each validation result. Each row corresponds to a model applied to a specific target population, outcome and time-at-risk triple for a specific database. Summary details include the validation data size (target population and outcome counts) and discriminative performance.

- - - -

Options

- -

Click on a row to select it - this will show as the row will be highlighted. This will populate the following parts of the app for further exploration:

- -
    - -
  • The complete performance of the result for the selected row can be viewed by clicking on the 'Performance' button in the left menu
  • - -
  • The model corresponding to the result for the selected row can be viewed by clicking on the 'Model' button in the left menu
  • - -
  • The log file corresponding to the result for the selected row can be viewed by clicking on the 'Log' button in the left menu (this is not always available)
  • - -
  • The model development settings for the selected row can be viewed by clicking on the 'Settings' button and then 'Model Settings' tab (this is not always available)
  • - -
  • The population settings (information about time-at-risk and exclusions) for the selected row can be viewed by clicking on the 'Settings' button and then 'Population Settings' tab (this is not always available)
  • - -
  • The covariate settings (information about the model features) for the selected row can be viewed by clicking on the 'Settings' button and then 'Covariate Settings' tab (this is not always available)
  • - -
- - - -

Using the Filter

- - -

Select a specific:

- - -
    - -
  • development database - database used to develop the model being validated
  • - -
  • validation database - database used to evaluate the model being validated
  • - -
  • time-at-risk - time period relative to index where the outcome is being predicted
  • - -
  • target population - the patient population we are interested in predicting the outcome risk for
  • - -
  • outcome - the event being predicted
  • -
  • model - the type of model (e.g., logistic regression, decision tree)
  • - -
- - -

to filter the table rows of interest.

diff --git a/inst/shiny/PLPViewer/html/boxHelp.html b/inst/shiny/PLPViewer/html/boxHelp.html deleted file mode 100644 index d9ff1b38b..000000000 --- a/inst/shiny/PLPViewer/html/boxHelp.html +++ /dev/null @@ -1,6 +0,0 @@ -

Description

-

These plots show the box plots displaying the risk distributions for those with the outcome during the time-at-risk (class 1) and those without the outcome during the time-at-risk (class 0)

- -

Interpretation

-

If a model is able to discriminate between those with and without the outcome then it should be assigning a higher risk to those with the outcome, so the box plot for class 1 should be shifted to the right relative to the box plot for class 0. If the model is not able to discriminate then the box plots will look similar.

- diff --git a/inst/shiny/PLPViewer/html/calHelp.html b/inst/shiny/PLPViewer/html/calHelp.html deleted file mode 100644 index 9013fc297..000000000 --- a/inst/shiny/PLPViewer/html/calHelp.html +++ /dev/null @@ -1,6 +0,0 @@ -

Description

-

The calibration plots show how closely the predicted risk matched the true observed risk. The calibration plot is calculated (using labelled data) by partitioning the patients into deciles based on predicted risk and then within each decile the mean predicted risk is calculated and the fraction of patients with the outcome (the observed risk) is calculated. The calibration plot is then generated by plotting the observed risk against the mean predicted risk for each decile.

- -

Interpretation

-

If a model is well calibrated the mean predicted risk should be approximately the same as the observed risk. Therefore, all 10 dots should fall on the x=y line. If the dots fall above the x=y line then there is a higher observed risk than predicted, so our model is assigning lower than the true risk to patients (underestimated risk). If the dots fall below the x=y line then there is a lower observed risk than predicted, so our model is assigning higher than the true risk to patients (overestimated risk).

- diff --git a/inst/shiny/PLPViewer/html/demoHelp.html b/inst/shiny/PLPViewer/html/demoHelp.html deleted file mode 100644 index 2b6eaac56..000000000 --- a/inst/shiny/PLPViewer/html/demoHelp.html +++ /dev/null @@ -1,8 +0,0 @@ -

Description

-

The demographic calibration plots show how closely the predicted risk matched the true observed risk for each age/gender strata. We partition the patients into age and gender groups, then calculate the mean predicted risk within each age/gender group and the fraction of patients within the group that actually had the outcome during the time-at-risk (observed risk). We then plot the observed and predicted risk for each age group split by gender.

- -

Interpretation

-

If a model is well calibrated the mean predicted risk should be approximately the same as the observed risk for each age/gender. Therefore, the observed risk and predicted risk plots should overlap. If there is deviation between the predicted risk and observed risk for a certain age group, then this tells us the model is not well calibrated for that age group. This may indicate the need to fit a model specifically for that age group if there is sufficient data.

- -

In addition, this plot shows us the age trend of risk (e.g., you can see whether the risk increases as patients age) and it shows us how males and females differ in terms of risk of the outcome during the time-at-risk.

- diff --git a/inst/shiny/PLPViewer/html/f1Help.html b/inst/shiny/PLPViewer/html/f1Help.html deleted file mode 100644 index e4c59cc6c..000000000 --- a/inst/shiny/PLPViewer/html/f1Help.html +++ /dev/null @@ -1,13 +0,0 @@ -

Description

-

The F1 score plot shows the F1 score for each risk threshold. Click here for more information about the F1 score.

- -

Interpretation

-

The F1-score combines the sensitivity and precision of the model into a single measure of accuracy.

- -

Definitions

-
    -
  • Sensitivity - probability that somebody with the outcome will be identified as having the outcome by the model at a specified cutoff (e.g., their predicted risk >= specified cutoff) -
  • -
  • Precision (positive predictive value) - probability that somebody identified by the model as having the outcome at a specified cutoff truly has the outcome -
  • -
diff --git a/inst/shiny/PLPViewer/html/prcHelp.html b/inst/shiny/PLPViewer/html/prcHelp.html deleted file mode 100644 index 3416a13ab..000000000 --- a/inst/shiny/PLPViewer/html/prcHelp.html +++ /dev/null @@ -1,19 +0,0 @@ -

Description

-

The precision recall (PR) curve shows the trade-off between precision (positive predictive value) and recall (sensitivity) for all possible risk cutoffs. The area below the curve is a measure of overall discriminative performance. Click here for more information.

- -

Interpretation

-

The red dashed line shows the fraction of the target population who have the outcome (the average risk). The main line shows the relationship between the precision and recall. If the main line is above the red dashed line, then this means the model is able to identify a group of patients who have a higher risk than the average risk, the higher the line is above the red dashed line, the higher the relative risk we can identify for some subset of patients. -

- -

Notes

-

If the outcome is rare (so the data are imbalanced) a precision recall curve (PRC) gives an insight into the clinical utility of the model as it tells you about the precision of the model

- -

Definitions

-
    -
  • Sensitivity (recall) - probability that somebody with the outcome will be identified as having the outcome by the model at a specified cutoff (e.g., their predicted risk >= specified cutoff) -
  • -
  • - Specificity - probability that somebody without the outcome will be identified as a non-outcome by the model at a specified cutoff (e.g., their predicted risk < specified cutoff)
  • -
  • Precision (positive predictive value) - probability that somebody identified by the model as having the outcome at a specified cutoff truly has the outcome -
  • -
diff --git a/inst/shiny/PLPViewer/html/predDistHelp.html b/inst/shiny/PLPViewer/html/predDistHelp.html deleted file mode 100644 index bbccd8585..000000000 --- a/inst/shiny/PLPViewer/html/predDistHelp.html +++ /dev/null @@ -1,6 +0,0 @@ -

Description

-

These plots show the probability density function for those with the outcome (red) and those without the outcome (green)

- -

Interpretation

-

If a prediction model is able to discriminate between those with and without the outcome during the time-at-risk then these distributions should be disjoint. The more overlap between the distributions, the worse the discrimination.

- diff --git a/inst/shiny/PLPViewer/html/prefDistHelp.html b/inst/shiny/PLPViewer/html/prefDistHelp.html deleted file mode 100644 index ba0339fab..000000000 --- a/inst/shiny/PLPViewer/html/prefDistHelp.html +++ /dev/null @@ -1,6 +0,0 @@ -

Description

-

These plots show the preference score density function for those with the outcome (red) and those without the outcome (green)

- -

Interpretation

-

If a prediction model is able to discriminate between those with and without the outcome during the time-at-risk then these distributions should be disjoint. The more overlap between the distributions, the worse the discrimination.

- diff --git a/inst/shiny/PLPViewer/html/rocHelp.html b/inst/shiny/PLPViewer/html/rocHelp.html deleted file mode 100644 index f11c74f8d..000000000 --- a/inst/shiny/PLPViewer/html/rocHelp.html +++ /dev/null @@ -1,18 +0,0 @@ -

Description

-

The receiver operating characteristic (ROC) curve shows the trade-off between sensitivity and specificity for all possible risk cutoffs. The area below the curve is a measure of overall discriminative performance. Click here for more information.

- -

Interpretation

-

If a model is not able to discriminate then the curve will be approximately the x=y line. A perfectly discriminative model will go up vertically and then across.

- -

Notes

-

If the outcome is rare then the ROC curve doesn't provide insight into the precision of the model and a precision recall curve (PRC) should also be inspected.

- -

Definitions

-
    -
  • Sensitivity - probability that somebody with the outcome will be identified as having the outcome by the model at a specified cutoff (e.g., their predicted risk >= specified cutoff) -
  • -
  • - Specificity - probability that somebody without the outcome will be identified as a non-outcome by the model at a specified cutoff (e.g., their predicted risk < specified cutoff)
  • -
  • Precision (positive predictive value) - probability that somebody identified by the model as having the outcome at a specified cutoff truly has the outcome -
  • -
diff --git a/inst/shiny/PLPViewer/modules/calibration.R b/inst/shiny/PLPViewer/modules/calibration.R deleted file mode 100644 index fa5d52186..000000000 --- a/inst/shiny/PLPViewer/modules/calibration.R +++ /dev/null @@ -1,318 +0,0 @@ -calibrationViewer <- function(id) { - ns <- shiny::NS(id) - shiny::div( - - shiny::fluidRow( - shinydashboard::box( - status = 'info', width = 12, - title = 'Summary', - solidHeader = TRUE, - shiny::p('Click on one of these rows to view corresponding plots:'), - DT::dataTableOutput(ns('calTable') - ) - ) - ), - - shiny::fluidRow( - shinydashboard::box( - status = 'info', - title = shiny::actionLink( - ns("calHelp"), - "Calibration Plot", - icon = shiny::icon("info") - ), - solidHeader = TRUE, - shinycssloaders::withSpinner(shiny::plotOutput(ns('cal'))) - ), - shinydashboard::box( - status = 'info', - title = shiny::actionLink( - ns("demoHelp"), - "Demographic Plot", - icon = shiny::icon("info") - ), - solidHeader = TRUE, - side = "right", - shinycssloaders::withSpinner(shiny::plotOutput(ns('demo'))) - ) - ) - ) -} - -calibrationServer <- function(id, plpResult) { - shiny::moduleServer( - id, - function(input, output, session) { - - sumTable <- shiny::reactive({ - data <- plpResult()$performanceEvaluation$evaluationStatistics - - for(i in 1:ncol(data)){ - data[,i] <- unlist(data[,i]) - } - - data$value <- as.double(as.character(data$value)) - data$value <- format(data$value, digits = 4, scientific = F) - ind <- data$metric %in% c( - 'calibrationInLarge intercept', - 'weak calibration intercept', - 'weak calibration gradient', - 'calibrationInLarge mean prediction', - 'calibrationInLarge observed risk', - 'ici', - 'Emean', - 'E90', - 'Emax', - 'correctionFactor', - 'adjustGradient', - 'adjustIntercept' - ) - - tidyr::pivot_wider( - data[ind,], - names_from = 'metric', - values_from = 'value' - ) - #reshape2::dcast(data[ind,], evaluation ~ metric, value.var = 'value') - - }) - - output$calTable <- DT::renderDataTable({ - 
if(is.null(plpResult()$performanceEvaluation)){ - DT::datatable(NULL) - } else{ - result <- sumTable() - row.names(result) <- NULL - DT::datatable(result, selection = 'single') - } - }) - - output$cal <- shiny::renderPlot({ - type <- trimws(sumTable()$evaluation[input$calTable_rows_selected]) - print(type) - tryCatch( - {plotSparseCalibration2( - evaluation = plpResult()$performanceEvaluation, - type = type) - }, - error = function(err){emptyPlot(title = err)} - ) - }) - - output$demo <- shiny::renderPlot({ - type <- trimws(sumTable()$evaluation[input$calTable_rows_selected]) - tryCatch( - plotDemographicSummary( - evaluation = plpResult()$performanceEvaluation, - type = type - ), - error= function(cond){return(NULL)} - ) - }) - - - shiny::observeEvent(input$calHelp, { - shiny::showInfoBox("Calibration Help", "html/calHelp.html") - }) - shiny::observeEvent(input$demoHelp, { - shiny::showInfoBox("Demographic Help", "html/demoHelp.html") - }) - - - } - ) -} - - - -plotDemographicSummary <- function(evaluation, type = NULL, fileName=NULL){ - if (!all(is.na(evaluation$demographicSummary$averagePredictedProbability))){ - - ind <- 1:nrow(evaluation$demographicSummary) - if(is.null(type)){ - if(!is.null(evaluation$demographicSummary$evaluation)){ - ind <- evaluation$demographicSummary$evaluation%in%c('Test','validation') - } - } else{ - ind <- evaluation$demographicSummary$evaluation==type - } - - x<- evaluation$demographicSummary[ind,colnames(evaluation$demographicSummary)%in%c('ageGroup','genGroup','averagePredictedProbability', - 'PersonCountAtRisk', 'PersonCountWithOutcome')] - - - # remove -1 values: - x$averagePredictedProbability[is.na(x$averagePredictedProbability)] <- 0 - x <- x[x$PersonCountWithOutcome != -1,] - if(nrow(x)==0){ - return(NULL) - } - - x$observed <- x$PersonCountWithOutcome/x$PersonCountAtRisk - - - x <- x[,colnames(x)%in%c('ageGroup','genGroup','averagePredictedProbability','observed')] - - # if age or gender missing add - 
if(sum(colnames(x)=='ageGroup')==1 && sum(colnames(x)=='genGroup')==0 ){ - x$genGroup = rep('Non', nrow(x)) - evaluation$demographicSummary$genGroup = rep('Non', nrow(evaluation$demographicSummary)) - } - if(sum(colnames(x)=='ageGroup')==0 && sum(colnames(x)=='genGroup')==1 ){ - x$ageGroup = rep('-1', nrow(x)) - evaluation$demographicSummary$ageGroup = rep('-1', nrow(evaluation$demographicSummary)) - - } - - x <- tidyr::pivot_longer( - data = x, - cols = !colnames(x)[colnames(x) %in% c('ageGroup','genGroup')], - names_to = 'variable', - values_to = 'value' - ) - #x <- reshape2::melt(x, id.vars=c('ageGroup','genGroup')) - - # 1.96*StDevPredictedProbability - ci <- evaluation$demographicSummary[ind,colnames(evaluation$demographicSummary)%in%c('ageGroup','genGroup','averagePredictedProbability','StDevPredictedProbability')] - ci$StDevPredictedProbability[is.na(ci$StDevPredictedProbability)] <- 1 - ci$lower <- ci$averagePredictedProbability-1.96*ci$StDevPredictedProbability - ci$lower[ci$lower <0] <- 0 - ci$upper <- ci$averagePredictedProbability+1.96*ci$StDevPredictedProbability - ci$upper[ci$upper >1] <- max(ci$upper[ci$upper <1]) - - x$age <- gsub('Age group:','', x$ageGroup) - x$age <- factor(x$age,levels=c(" 0-4"," 5-9"," 10-14", - " 15-19"," 20-24"," 25-29"," 30-34"," 35-39"," 40-44", - " 45-49"," 50-54"," 55-59"," 60-64"," 65-69"," 70-74", - " 75-79"," 80-84"," 85-89"," 90-94"," 95-99","-1"),ordered=TRUE) - - - - x <- merge(x, ci[,c('ageGroup','genGroup','lower','upper')], by=c('ageGroup','genGroup')) - x <- x[!is.na(x$value),] - - plot <- ggplot2::ggplot(data=x, - ggplot2::aes(x=age, - group=interaction(variable,genGroup))) + - - ggplot2::geom_line(ggplot2::aes(y=value, group=variable, - color=variable, - linetype = variable))+ - ggplot2::geom_ribbon(data=x[x$variable!='observed',], - ggplot2::aes(ymin=lower, ymax=upper - , group=genGroup), - fill="blue", alpha=0.2) + - ggplot2::facet_grid(.~ genGroup, scales = "free") + - ggplot2::theme(axis.text.x = 
ggplot2::element_text(angle = 90, hjust = 1)) + - #ggplot2::coord_flip() + - ggplot2::scale_y_continuous("Fraction") + - ggplot2::scale_x_discrete("Age") + - ggplot2::scale_color_manual(values = c("royalblue4","red"), - guide = ggplot2::guide_legend(title = NULL), - labels = c("Expected", "Observed")) + - - ggplot2::guides(linetype=FALSE) - - if (!is.null(fileName)) - ggplot2::ggsave(fileName, plot, width = 7, height = 4.5, dpi = 400) - return(plot) - } -} - -plotSparseCalibration2 <- function(evaluation, - smooth = "loess", - span = 1, - nKnots = 5, - scatter = T, - bins = 20, - zoom = "data", - sample = T, - fileName = NULL, - type = NULL) { - - ind <- 1:nrow(evaluation$calibrationSummary) - - if(is.null(type)){ - if(!is.null(evaluation$calibrationSummary$evaluation)){ - ind <- evaluation$calibrationSummary$evaluation%in%c('Test','validation') - } - } else{ - ind <- evaluation$calibrationSummary$evaluation == type - } - # use calibrationSummary - sparsePred <- evaluation$calibrationSummary[ind,] - - limVal <- max(max(sparsePred$averagePredictedProbability),max(sparsePred$observedIncidence)) - - smooth_plot <- ggplot2::ggplot(data = sparsePred, ggplot2::aes(x = averagePredictedProbability, - y = observedIncidence)) + - ggplot2::stat_smooth(ggplot2::aes(color = "Loess", linetype = "Loess"), - method = "loess", - se = TRUE, - #span = span, - size = 1, - show.legend = F) + - ggplot2::geom_segment(ggplot2::aes(x = 0, - xend = 1, - y = 0, - yend = 1, - color = "Ideal", - linetype = "Ideal")) + - ggplot2::coord_cartesian(xlim = c(0,limVal), - ylim = c(0,limVal)) + - ggplot2::scale_linetype_manual(name = "Models", - values = c(Loess = "solid", - Ideal = "dashed")) + - ggplot2::scale_color_manual(name = "Models", values = c(Loess = "blue", Ideal = "red")) + - ggplot2::labs(x = "Predicted Probability", y = "Observed Probability") - - # construct the plot grid - if (scatter) { - smooth_plot <- smooth_plot + ggplot2::geom_point(data = sparsePred, - ggplot2::aes(x = 
averagePredictedProbability, - y = observedIncidence), - color = "black", - size = 2) - } - - # Histogram object detailing the distibution of event/noevent for each probability interval - - popData1 <- sparsePred[,c('averagePredictedProbability', 'PersonCountWithOutcome')] - popData1$Label <- "Outcome" - colnames(popData1) <- c('averagePredictedProbability','PersonCount',"Label") - popData2 <- sparsePred[,c('averagePredictedProbability', 'PersonCountAtRisk')] - popData2$Label <- "No Outcome" - popData2$PersonCountAtRisk <- -1*(popData2$PersonCountAtRisk -popData1$PersonCount) - colnames(popData2) <- c('averagePredictedProbability','PersonCount',"Label") - popData <- rbind(popData1, popData2) - popData$averagePredictedProbability <- factor(popData$averagePredictedProbability) - hist_plot <- ggplot2::ggplot(popData, ggplot2::aes(y = averagePredictedProbability, x = PersonCount, - fill = Label)) + - ggplot2::geom_bar(data = subset(popData,Label == "Outcome"), stat = "identity") + - ggplot2::geom_bar(data = subset(popData,Label == "No Outcome"), stat = "identity") + - ggplot2::geom_bar(stat = "identity") + - ggplot2::scale_x_continuous(labels = abs) + - #ggplot2::scale_fill_brewer(palette = "Set1") + - ggplot2::coord_flip( ) + - ggplot2::theme_bw() + - ggplot2::theme(axis.title.x=ggplot2::element_blank(), - axis.text.x=ggplot2::element_blank(), - axis.ticks.x=ggplot2::element_blank()) - - # testting whether this is installed in shinydeploy - plot <- gridExtra::grid.arrange(smooth_plot, - hist_plot, - ncol = 1, - heights=c(2,1)) - - #plot <- cowplot::plot_grid(smooth_plot, - # hist_plot, - # ncol = 1, - # axis = "lr", - # align = "v", - # rel_heights = c(1, 0.6)) - - if (!is.null(fileName)) - ggplot2::ggsave(fileName, plot, width = 5, height = 4.5, dpi = 400) - return(plot) -} diff --git a/inst/shiny/PLPViewer/modules/covariateSummary.R b/inst/shiny/PLPViewer/modules/covariateSummary.R deleted file mode 100644 index 34fe1dac4..000000000 --- 
a/inst/shiny/PLPViewer/modules/covariateSummary.R +++ /dev/null @@ -1,283 +0,0 @@ -covariateSummaryViewer <- function(id) { - ns <- shiny::NS(id) - - shiny::div( - shiny::fluidRow( - shinydashboard::box( - status = 'info', - title = "Binary", - solidHeader = TRUE, - shinycssloaders::withSpinner( - plotly::plotlyOutput( - ns('covariateSummaryBinary') - ) - ) - ), - shinydashboard::box( - status = 'info', - title = "Measurements", - solidHeader = TRUE, - side = "right", - shinycssloaders::withSpinner( - plotly::plotlyOutput( - ns('covariateSummaryMeasure') - ) - ) - ) - ), - - shiny::fluidRow( - width=12, - shinydashboard::box( - status = 'info', width = 12, - title = "Covariates", solidHeader = TRUE, - DT::dataTableOutput(ns('modelCovariateInfo')) - ) - ), - shiny::fluidRow( - width=12, - shinydashboard::box(status = 'info', - width = 12, - title = "Model Table", - solidHeader = TRUE, - shiny::downloadButton("downloadData", "Download Model"), - DT::dataTableOutput(ns('modelView')) - ) - ) - ) - -} - -covariateSummaryServer <- function(id, plpResult, summaryTable, resultRow, mySchema, con, - inputSingleView, - myTableAppend = '', - targetDialect = NULL) { - shiny::moduleServer( - id, - function(input, output, session) { - - covariateSummary <- shiny::reactive({ - if(inputSingleView == "Model"){ - if(is.null(plpResult()$covariateSummary)){ - covariateSummary <- tryCatch( - {loadCovSumFromDb(summaryTable[resultRow(),], mySchema, con, myTableAppend, targetDialect)}, - error = function(e){return(NULL)} - ) - print(colnames(covariateSummary)) # temp for debugging - return(covariateSummary) - } else{ - return(plpResult()$covariateSummary) - } - } - }) - - # covariate table - output$modelView <- DT::renderDataTable( - editCovariates(covariateSummary())$table, - colnames = editCovariates(covariateSummary())$colnames - ) - - - output$modelCovariateInfo <- DT::renderDataTable( - data.frame( - covariates = nrow(covariateSummary()), - nonZeroCount = 
sum(covariateSummary()$covariateValue!=0, na.rm = T), - intercept = getIntercept(plpResult()) - ) - ) - - # covariate model plots - covs <- shiny::reactive({ - if(is.null(covariateSummary())) - return(NULL) - plotCovariateSummary(formatCovariateTable(covariateSummary())) - }) - - output$covariateSummaryBinary <- plotly::renderPlotly({ covs()$binary }) - output$covariateSummaryMeasure <- plotly::renderPlotly({ covs()$meas }) - - # Downloadable csv of model ---- - output$downloadData <- shiny::downloadHandler( - filename = function(){'model.csv'}, - content = function(file) { - utils::write.csv( - covariateSummary()[,colnames(covariateSummary()) %in% c('covariateName','covariateValue','CovariateCount','WithOutcomeCovariateMean','WithNoOutcomeCovariateMean','WithOutcome_CovariateMean','WithNoOutcome_CovariateMean' )], - file, - row.names = FALSE - ) - } - ) - - } - ) -} - - -# helpers - -# format covariate summary table -formatCovariateTable <- function(covariateSummary){ - covariateSummary <- as.data.frame(covariateSummary) - colnames(covariateSummary) <- gsub('_','', colnames(covariateSummary) ) - for(coln in c('covariateValue','WithOutcomeCovariateMean','WithNoOutcomeCovariateMean','StandardizedMeanDiff')){ - if(sum(colnames(covariateSummary)==coln)>0){ - covariateSummary[,coln] <- format(round(covariateSummary[,coln], 4), nsmall = 4) - class(covariateSummary[,coln]) <- "numeric" - } - } - return(covariateSummary) -} - - - -editCovariates <- function(covs){ - colnames(covs) <- gsub('_','', colnames(covs) ) - if(!is.null(covs$StandardizedMeanDiff)){ - return(list(table = formatCovariateTable(covs[,c('covariateName','covariateValue','CovariateCount','WithOutcomeCovariateMean','WithNoOutcomeCovariateMean','StandardizedMeanDiff')]), - colnames = c('Covariate Name', 'Value','Count', 'Outcome Mean', 'Non-outcome Mean','Std Mean Diff') - )) - } else{ - return(list(table = 
formatCovariateTable(covs[,c('covariateName','covariateValue','CovariateCount','WithOutcomeCovariateMean','WithNoOutcomeCovariateMean')]), - colnames = c('Covariate Name', 'Value','Count', 'Outcome Mean', 'Non-outcome Mean') - )) - } -} - - - -plotCovariateSummary <- function(covariateSummary){ - - colnames(covariateSummary) <- gsub('_','', colnames(covariateSummary) ) - - #writeLines(paste(colnames(covariateSummary))) - #writeLines(paste(covariateSummary[1,])) - # remove na values - covariateSummary$WithNoOutcomeCovariateMean[is.na(covariateSummary$WithNoOutcomeCovariateMean)] <- 0 - covariateSummary$WithOutcomeCovariateMean[is.na(covariateSummary$WithOutcomeCovariateMean)] <- 0 - if(!'covariateValue'%in%colnames(covariateSummary)){ - covariateSummary$covariateValue <- 1 - } - if(sum(is.na(covariateSummary$covariateValue))>0){ - covariateSummary$covariateValue[is.na(covariateSummary$covariateValue)] <- 0 - } - - # SPEED EDIT remove the none model variables - covariateSummary <- covariateSummary[covariateSummary$covariateValue!=0,] - - # save dots based on coef value - covariateSummary$size <- abs(covariateSummary$covariateValue) - covariateSummary$size[is.na(covariateSummary$size)] <- 4 - covariateSummary$size <- 4+4*covariateSummary$size/max(covariateSummary$size) - - # color based on analysis id - covariateSummary$color <- sapply(covariateSummary$covariateName, function(x) ifelse(is.na(x), '', strsplit(as.character(x), ' ')[[1]][1])) - - covariateSummary$times <- sapply(sapply(covariateSummary$covariateName, function(x) ifelse(is.na(x), '', strsplit(as.character(x), 'during day ')[[1]][2])),function(x) ifelse(is.na(x), '', strsplit(as.character(x), ': ')[[1]][1])) - covariateSummary$desc <- sapply(covariateSummary$covariateName, function(x) ifelse(is.na(x), '', strsplit(as.character(x), ': ')[[1]][2])) - - - l <- list(x = 0.01, y = 1, - font = list( - family = "sans-serif", - size = 10, - color = "#000"), - bgcolor = "#E2E2E2", - bordercolor = "#FFFFFF", - 
borderwidth = 1) - - ind <- covariateSummary$WithNoOutcomeCovariateMean <=1 & covariateSummary$WithOutcomeCovariateMean <= 1 - # create two plots -1 or less or g1 - binary <- plotly::plot_ly(x = covariateSummary$WithNoOutcomeCovariateMean[ind], - #size = covariateSummary$size[ind], - showlegend = F) %>% - plotly::add_markers(y = covariateSummary$WithOutcomeCovariateMean[ind], - color=factor(covariateSummary$color[ind]), - hoverinfo = 'text', - text = ~paste('
Type: ', covariateSummary$color[ind], - '
Time: ', covariateSummary$times[ind], - '
Name: ', covariateSummary$desc[ind]), - showlegend = T - ) %>% - plotly::add_trace(x= c(0,1), y = c(0,1),mode = 'lines', - line = list(dash = "dash"), color = I('black'), - type='scatter', showlegend = FALSE) %>% - plotly::layout(#title = 'Prevalance of baseline predictors in persons with and without outcome', - xaxis = list(title = "Prevalance in persons without outcome", - range = c(0, 1)), - yaxis = list(title = "Prevalance in persons with outcome", - range = c(0, 1)), - #legend = l, showlegend = T, - legend = list(orientation = 'h', y = -0.3), showlegend = T) - - if(sum(!ind)>0){ - maxValue <- max(c(covariateSummary$WithNoOutcomeCovariateMean[!ind], - covariateSummary$WithOutcomeCovariateMean[!ind]), na.rm = T) - meas <- plotly::plot_ly(x = covariateSummary$WithNoOutcomeCovariateMean[!ind] ) %>% - plotly::add_markers(y = covariateSummary$WithOutcomeCovariateMean[!ind], - hoverinfo = 'text', - text = ~paste('
Type: ', covariateSummary$color[!ind], - '
Time: ', covariateSummary$times[!ind], - '
Name: ', covariateSummary$desc[!ind])) %>% - plotly::add_trace(x= c(0,maxValue), y = c(0,maxValue),mode = 'lines', - line = list(dash = "dash"), color = I('black'), - type='scatter', showlegend = FALSE) %>% - plotly::layout(#title = 'Prevalance of baseline predictors in persons with and without outcome', - xaxis = list(title = "Mean in persons without outcome"), - yaxis = list(title = "Mean in persons with outcome"), - showlegend = FALSE) - } else { - meas <- NULL - } - - return(list(binary=binary, - meas = meas)) -} - - - -getIntercept <- function(plpResult){ - - if('model'%in%names(plpResult)){ - - if('model'%in%names(plpResult$model)){ - - if('coefficients'%in%names(plpResult$model$model)){ - - return(plpResult$model$model$coefficients[1]) - - } - - } - } - return(0) -} - - -toFirstUpper <- function(vec){ - res <- lapply(vec, function(x) paste0(toupper(substr(x,1,1)), substr(x,2,nchar(x)))) - return(unlist(res)) -} - -# code for database covariate extract -loadCovSumFromDb <- function(chosenRow, mySchema, con, myTableAppend = '', targetDialect = 'redshift'){ - ParallelLogger::logInfo("starting covsum") - resultId <- chosenRow$resultId - sql <- "SELECT * FROM @my_schema.@my_table_appendcovariate_summary AS covariate_summary WHERE result_id = @result_id;" - - sql <- SqlRender::render(sql = sql, - my_schema = mySchema, - result_id = resultId, - my_table_append = myTableAppend) - sql <- SqlRender::translate(sql = sql, targetDialect = targetDialect) - - covariateSummary <- DatabaseConnector::dbGetQuery(conn = con, statement = sql) - colnames(covariateSummary) <- SqlRender::snakeCaseToCamelCase(colnames(covariateSummary)) - - # capitalize the first letter of column names - # except covariateId covariateName analysisId conceptId - ind <- !colnames(covariateSummary) %in% c('covariateValue','covariateId', 'covariateName', 'analysisId', 'conceptId') - colnames(covariateSummary)[ind] <- toFirstUpper(colnames(covariateSummary)[ind]) - - ParallelLogger::logInfo("finishing 
covsum") - return(covariateSummary) -} diff --git a/inst/shiny/PLPViewer/modules/cutoff.R b/inst/shiny/PLPViewer/modules/cutoff.R deleted file mode 100644 index 2f1587829..000000000 --- a/inst/shiny/PLPViewer/modules/cutoff.R +++ /dev/null @@ -1,280 +0,0 @@ -cutoffViewer <- function(id) { - - ns <- shiny::NS(id) - - shiny::fluidRow( - - shiny::column(width = 12, - - shinydashboard::box( - width = 12, - title = "Probability threshold plot: ", - status = "info", - solidHeader = TRUE, - plotly::plotlyOutput(ns("ptp")) - ), - - shinydashboard::box( - width = 12, - title = "Cutoff Slider: ", - status = "info", - solidHeader = TRUE, - shiny::sliderInput( - ns("slider1"), - shiny::span( - "Pick Threshold ", - shiny::textOutput('threshold'), - style="font-family: Arial;font-size:14px;" - ), - min = 1, - max = 100, - value = 50, - ticks = F - ) - ), - - shinydashboard::box( - width = 12, - title = "Dashboard", - status = "warning", solidHeader = TRUE, - shinydashboard::infoBoxOutput(ns("performanceBoxThreshold")), - shinydashboard::infoBoxOutput(ns("performanceBoxIncidence")), - shinydashboard::infoBoxOutput(ns("performanceBoxPPV")), - shinydashboard::infoBoxOutput(ns("performanceBoxSpecificity")), - shinydashboard::infoBoxOutput(ns("performanceBoxSensitivity")), - shinydashboard::infoBoxOutput(ns("performanceBoxNPV") - ) - ), - - shinydashboard::box( - width = 12, - title = "Cutoff Performance", - status = "warning", solidHeader = TRUE, - shiny::tableOutput(ns('twobytwo')) - ) - ) - ) -} - -cutoffServer <- function(id, plpResult) { - shiny::moduleServer( - id, - function(input, output, session) { - - performance <- shiny::reactive({ - eval <- plpResult()$performanceEvaluation - if(is.null(eval)){ - return(NULL) - } else { - intPlot <- getORC(eval, input$slider1) - threshold <- intPlot$threshold - prefthreshold <- intPlot$prefthreshold - TP <- intPlot$TP - FP <- intPlot$FP - TN <- intPlot$TN - FN <- intPlot$FN - } - - twobytwo <- as.data.frame(matrix(c(FP,TP,TN,FN), 
byrow=T, ncol=2)) - colnames(twobytwo) <- c('Ground Truth Negative','Ground Truth Positive') - rownames(twobytwo) <- c('Predicted Positive','Predicted Negative') - - list(threshold = threshold, - prefthreshold = prefthreshold, - twobytwo = twobytwo, - Incidence = (TP+FN)/(TP+TN+FP+FN), - Threshold = threshold, - Sensitivity = TP/(TP+FN), - Specificity = TN/(TN+FP), - PPV = TP/(TP+FP), - NPV = TN/(TN+FN) ) - }) - - # add probability threshold plot - ptp <- shiny::reactive({ - eval <- plpResult()$performanceEvaluation - if(is.null(eval)){ - return(NULL) - } else { - probThresPlot(eval = eval, pointOfInterest = input$slider1) - } - - }) - - output$ptp <- plotly::renderPlotly(ptp()) - - - - - # update threshold slider based on results size - shiny::observe({ - if(!is.null(plpResult()$performanceEvaluation)){ - n <- nrow(plpResult()$performanceEvaluation$thresholdSummary[plpResult()$performanceEvaluation$thresholdSummary$evaluation%in%c('Test','Validation'),]) - }else{ - n <- 100 - } - - shiny::updateSliderInput(session, inputId = "slider1", - min = 1, max = n, value = round(n/2)) - }) - - # Do the tables and plots: - - output$performance <- shiny::renderTable(performance()$performance, - rownames = F, digits = 3) - output$twobytwo <- shiny::renderTable(performance()$twobytwo, - rownames = T, digits = 0) - - - output$threshold <- shiny::renderText(format(performance()$threshold,digits=5)) - - - # dashboard - - output$performanceBoxIncidence <- shinydashboard::renderInfoBox({ - shinydashboard::infoBox( - "Incidence", paste0(round(performance()$Incidence*100, digits=3),'%'), icon = shiny::icon("ambulance"), - color = "green" - ) - }) - - output$performanceBoxThreshold <- shinydashboard::renderInfoBox({ - shinydashboard::infoBox( - "Threshold", format((performance()$Threshold), scientific = F, digits=3), icon = shiny::icon("edit"), - color = "yellow" - ) - }) - - output$performanceBoxPPV <- shinydashboard::renderInfoBox({ - shinydashboard::infoBox( - "PPV", 
paste0(round(performance()$PPV*1000)/10, "%"), icon = shiny::icon("thumbs-up"), - color = "orange" - ) - }) - - output$performanceBoxSpecificity <- shinydashboard::renderInfoBox({ - shinydashboard::infoBox( - "Specificity", paste0(round(performance()$Specificity*1000)/10, "%"), icon = shiny::icon("bullseye"), - color = "purple" - ) - }) - - output$performanceBoxSensitivity <- shinydashboard::renderInfoBox({ - shinydashboard::infoBox( - "Sensitivity", paste0(round(performance()$Sensitivity*1000)/10, "%"), icon = shiny::icon("low-vision"), - color = "blue" - ) - }) - - output$performanceBoxNPV <- shinydashboard::renderInfoBox({ - shinydashboard::infoBox( - "NPV", paste0(round(performance()$NPV*1000)/10, "%"), icon = shiny::icon("minus-square"), - color = "black" - ) - }) - - - } - ) -} - - - -getORC <- function(eval, pointOfInterest){ - - data <- eval$thresholdSummary[eval$thresholdSummary$evaluation%in%c('Test','Validation'),] - data <- data[order(data$predictionThreshold),] - pointOfInterest <- data[pointOfInterest,] - - threshold <- pointOfInterest$predictionThreshold - TP <- pointOfInterest$truePositiveCount - TN <- pointOfInterest$trueNegativeCount - FP <- pointOfInterest$falsePositiveCount - FN <- pointOfInterest$falseNegativeCount - preferenceThreshold <- pointOfInterest$preferenceThreshold - return(list(threshold = threshold, prefthreshold=preferenceThreshold, - TP = TP, TN=TN, - FP= FP, FN=FN)) -} - -probThresPlot <- function(eval, pointOfInterest){ - - eval <- eval$thresholdSummary[eval$thresholdSummary$evaluation%in%c('Test','Validation'),] - eval <- eval[order(eval$predictionThreshold),] - - ay <- list( - tickfont = list(color = "red"), - overlaying = "y", - side = "right", - title = "positivePredictiveValue y-axis" - ) - vline <- function(x = 0, color = "green") { - list( - type = "line", - y0 = 0, - y1 = 1, - yref = "paper", - x0 = x, - x1 = x, - line = list(color = color, dash="dot") - ) - } - - eval$popfrac <- 
eval$positiveCount/(eval$positiveCount+eval$negativeCount) - - fig <- plotly::plot_ly( - data = eval, - x = ~ predictionThreshold, - y = ~ sensitivity, - name = 'sensitivity', - color = 'blue', - type = 'scatter', - mode = 'lines' - ) %>% - plotly::add_trace( - yaxis = "y2", - y = ~ positivePredictiveValue, - name = 'positivePredictiveValue', - color = 'red', - mode = 'lines' - ) %>% - plotly::add_trace( - y = ~ negativePredictiveValue, - name = 'negativePredictiveValue', - color = 'green', - mode = 'lines' - ) %>% - plotly::add_trace( - y = ~ popfrac, - name = 'Fraction flagged', - color = 'black', - mode = 'lines' - ) %>% - plotly::layout( - title = "Probability Threshold Plot", - yaxis2 = ay, - #xaxis = list(title="Prediction Threshold"), - #yaxis = list(title="Metric yaxis") - #) %>% - #plotly::layout( - plot_bgcolor='#e5ecf6', - xaxis = list( - title = "Prediction Threshold", - zerolinecolor = '#ffff', - zerolinewidth = 2, - gridcolor = 'ffff' - ), - yaxis = list( - title = "Metric yaxis", - zerolinecolor = '#ffff', - zerolinewidth = 2, - gridcolor = 'ffff' - ), - shapes = list(vline(eval$predictionThreshold[pointOfInterest])) - ) - - return(fig) - -} - - diff --git a/inst/shiny/PLPViewer/modules/discrimination.R b/inst/shiny/PLPViewer/modules/discrimination.R deleted file mode 100644 index ce854ff0e..000000000 --- a/inst/shiny/PLPViewer/modules/discrimination.R +++ /dev/null @@ -1,510 +0,0 @@ -discriminationViewer <- function(id) { - - ns <- shiny::NS(id) - - shiny::div( - - # summary table - shiny::fluidRow( - shinydashboard::box( - status = 'info', - width = 12, - title = 'Summary', - solidHeader = TRUE, - shiny::p('Click on one of these rows to view corresponding plots:'), - DT::dataTableOutput(ns('summaryTable')) - ) - ), - - - shiny::fluidRow( - shinydashboard::box( - status = 'info', - title = shiny::actionLink( - ns("rocHelp"), - "ROC Plot", - icon = icon("info") - ), - solidHeader = TRUE, - shinycssloaders::withSpinner( - 
plotly::plotlyOutput(ns('roc')) - ) - ), - shinydashboard::box( - status = 'info', - title = shiny::actionLink( - ns("prcHelp"), - "Precision recall plot", - icon = icon("info") - ), - solidHeader = TRUE, - side = "right", - shinycssloaders::withSpinner( - plotly::plotlyOutput(ns('pr')) - ) - ) - ), - - shiny::fluidRow( - shinydashboard::box( - status = 'info', - title = shiny::actionLink( - ns("f1Help"), - "F1 Score Plot", - icon = icon("info") - ), - solidHeader = TRUE, - shinycssloaders::withSpinner( - plotly::plotlyOutput(ns('f1')) - ) - ), - shinydashboard::box( - status = 'info', - title = shiny::actionLink( - ns("boxHelp"), - "Box Plot", - icon = icon("info") - ), - solidHeader = TRUE, - side = "right", - shinycssloaders::withSpinner( - shiny::plotOutput(ns('box')) - ) - ) - ), - - shiny::fluidRow( - shinydashboard::box( - status = 'info', - title = shiny::actionLink( - ns("predDistHelp"), - "Prediction Score Distribution", - icon = icon("info") - ), - solidHeader = TRUE, - shinycssloaders::withSpinner( - shiny::plotOutput(ns('preddist')) - ) - ), - shinydashboard::box( - status = 'info', - title = shiny::actionLink( - ns("prefDistHelp"), - "Preference Score Distribution", - icon = icon("info") - ), - solidHeader = TRUE, - side = "right", - shinycssloaders::withSpinner( - shiny::plotOutput(ns('prefdist')) - ) - ) - ) - ) -} - -discriminationServer <- function(id, plpResult) { - shiny::moduleServer( - id, - function(input, output, session) { - - sumTable <- shiny::reactive({ - data <- plpResult()$performanceEvaluation$evaluationStatistics - for(i in 1:ncol(data)){ - data[,i] <- unlist(data[,i]) - } - data$value <- as.double(as.character(data$value)) - data$value <- format(data$value, digits = 4, scientific = F) - ind <- data$metric %in% c('AUROC', - '95% lower AUROC', - '95% upper AUROC', - 'AUPRC' - - ) - - tidyr::pivot_wider( - data = data[ind,], - names_from = 'metric', - values_from = 'value' - ) - #reshape2::dcast(data[ind,], evaluation ~ metric, 
value.var = 'value') - - }) - - - output$summaryTable <- DT::renderDataTable({ - if(is.null(plpResult()$performanceEvaluation)){ - DT::datatable(NULL) - } else{ - result <- sumTable() - row.names(result) <- NULL - DT::datatable(result,selection = 'single') - } - }) - - - plots <- shiny::reactive({ - - result <- list(roc = tryCatch({rocPlot(eval = plpResult()$performanceEvaluation)}, - error = function(cond){ - list(train = emptyPlot(title = 'No performanceEvaluation')) - }), - pr = tryCatch({prPlot(eval = plpResult()$performanceEvaluation)}, - error = function(cond){ - list(train = emptyPlot(title = 'No performanceEvaluation')) - }), - f1 = tryCatch({f1Plot(eval = plpResult()$performanceEvaluation)}, - error = function(cond){ - list(train = emptyPlot(title = 'No performanceEvaluation')) - }), - prefpdf = tryCatch({plotPreferencePDF(plpResult()$performanceEvaluation)}, - error = function(cond){ - NULL - }), - predpdf = tryCatch({plotPredictedPDF(plpResult()$performanceEvaluation)}, - error = function(cond){ - NULL - }), - box = tryCatch({plotPredictionDistribution(plpResult()$performanceEvaluation)}, - error = function(cond){ - NULL - }) - ) - - return(result) - } - ) - - output$roc <- plotly::renderPlotly({ - type <- trimws(sumTable()$evaluation[input$summaryTable_rows_selected]) - tryCatch({plots()$roc[[type]]}, error = function(err){emptyPlot(title = err)}) - }) - - output$pr <- plotly::renderPlotly({ - type <- trimws(sumTable()$evaluation[input$summaryTable_rows_selected]) - tryCatch({plots()$pr[[type]]}, error = function(err){emptyPlot(title = err)}) - }) - - output$f1 <- plotly::renderPlotly({ - type <- trimws(sumTable()$evaluation[input$summaryTable_rows_selected]) - tryCatch({plots()$f1[[type]]}, error = function(err){emptyPlot(title = err)}) - }) - - # preference plot - output$prefdist <- shiny::renderPlot({ - type <- trimws(sumTable()$evaluation[input$summaryTable_rows_selected]) - tryCatch({plots()$prefpdf[[type]]}, error = function(err){emptyPlot(title = 
err)}) - }) - - output$preddist <- shiny::renderPlot({ - type <- trimws(sumTable()$evaluation[input$summaryTable_rows_selected]) - tryCatch({plots()$predpdf[[type]]}, error = function(err){emptyPlot(title = err)}) - }) - - output$box <- shiny::renderPlot({ - type <- trimws(sumTable()$evaluation[input$summaryTable_rows_selected]) - tryCatch({plots()$box[[type]]}, error = function(err){emptyPlot(title = err)}) - }) - - shiny::observeEvent(input$rocHelp, { - showInfoBox("ROC Help", "html/rocHelp.html") - }) - shiny::observeEvent(input$prcHelp, { - showInfoBox("PRC Help", "html/prcHelp.html") - }) - shiny::observeEvent(input$f1Help, { - showInfoBox("F1 Score Plot Help", "html/f1Help.html") - }) - shiny::observeEvent(input$boxHelp, { - showInfoBox("Box Plot Help", "html/boxHelp.html") - }) - shiny::observeEvent(input$predDistHelp, { - showInfoBox("Predicted Risk Distribution Help", "html/predDistHelp.html") - }) - shiny::observeEvent(input$prefDistHelp, { - showInfoBox("Preference Score Distribution Help", "html/prefDistHelp.html") - }) - - - - } - ) -} - - - -# pltting -rocPlot <- function(eval, type){ - - types <- unique(eval$thresholdSummary$evaluation) - rocobject <- list() - length(rocobject) <- length(types) - names(rocobject) <- types - - for(type in types){ - data <- eval$thresholdSummary[eval$thresholdSummary$evaluation%in%type,] - - rocobject[[type]] <- plotly::plot_ly(x = 1-c(0,data$specificity,1)) %>% - plotly::add_lines(y = c(1,data$sensitivity,0),name = "hv", - text = paste('Risk Threshold:',c(0,data$predictionThreshold,1)), - line = list(shape = "hv", - color = 'rgb(22, 96, 167)'), - fill = 'tozeroy') %>% - plotly::add_trace(x= c(0,1), y = c(0,1),mode = 'lines', - line = list(dash = "dash"), color = I('black'), - type='scatter') %>% - plotly::layout(title = "ROC Plot", - xaxis = list(title = "1-specificity"), - yaxis = list (title = "Sensitivity"), - showlegend = FALSE) - } - return(rocobject) -} - -prPlot <- function(eval, type){ - types <- 
unique(eval$thresholdSummary$evaluation) - probject <- list() - length(probject) <- length(types) - names(probject) <- types - - for(type in types){ - data <- eval$thresholdSummary[eval$thresholdSummary$evaluation%in%type,] - - popAv <- data$trueCount[1]/(data$trueCount[1] + data$falseCount[1]) - probject[[type]] <- plotly::plot_ly(x = data$sensitivity) %>% - plotly::add_lines(y = data$positivePredictiveValue, name = "hv", - text = paste('Risk Threshold:',data$predictionThreshold), - line = list(shape = "hv", - color = 'rgb(22, 96, 167)'), - fill = 'tozeroy') %>% - plotly::add_trace(x= c(0,1), y = c(popAv,popAv),mode = 'lines', - line = list(dash = "dash"), color = I('red'), - type='scatter') %>% - plotly::layout(title = "PR Plot", - xaxis = list(title = "Recall"), - yaxis = list (title = "Precision"), - showlegend = FALSE) - - } - return(probject) -} - -f1Plot <- function(eval, type){ - types <- unique(eval$thresholdSummary$evaluation) - f1object <- list() - length(f1object) <- length(types) - names(f1object) <- types - - for(type in types){ - data <- eval$thresholdSummary[eval$thresholdSummary$evaluation%in%type,] - - f1object[[type]] <- plotly::plot_ly(x = data$predictionThreshold) %>% - plotly::add_lines(y = data$f1Score, name = "hv", - text = paste('Risk Threshold:',data$predictionThreshold), - line = list(shape = "hv", - color = 'rgb(22, 96, 167)'), - fill = 'tozeroy') %>% - plotly::layout(title = "F1-Score Plot", - xaxis = list(title = "Prediction Threshold"), - yaxis = list (title = "F1-Score"), - showlegend = FALSE) - - } - return(f1object) -} - - - - - - -# adding plots from PLP temporarily as shiny deploy doesnt have PatientLevelPrediction - -plotPredictedPDF <- function(evaluation, fileName=NULL){ - - if(!is.null(evaluation$thresholdSummary$evaluation)){ - types <- unique(evaluation$thresholdSummary$evaluation) - } else{ - evaluation$thresholdSummary$evaluation <- 'na' - types <- 'na' - } - - plotResult <- list() - length(plotResult) <- length(types) - 
names(plotResult) <- types - - for(type in types){ - - ind <- 1:nrow(evaluation$thresholdSummary) - if(!is.null(evaluation$thresholdSummary$evaluation)){ - ind <- evaluation$thresholdSummary$evaluation == type - } - - - x<- evaluation$thresholdSummary[ind,c('predictionThreshold','truePositiveCount','trueNegativeCount', - 'falsePositiveCount','falseNegativeCount')] - x<- x[order(x$predictionThreshold,-x$truePositiveCount, -x$falsePositiveCount),] - x$out <- c(x$truePositiveCount[-length(x$truePositiveCount)]-x$truePositiveCount[-1], x$truePositiveCount[length(x$truePositiveCount)]) - x$nout <- c(x$falsePositiveCount[-length(x$falsePositiveCount)]-x$falsePositiveCount[-1], x$falsePositiveCount[length(x$falsePositiveCount)]) - - vals <- c() - for(i in 1:length(x$predictionThreshold)){ - if(i!=length(x$predictionThreshold)){ - upper <- x$predictionThreshold[i+1]} else {upper <- min(x$predictionThreshold[i]+0.01,1)} - val <- x$predictionThreshold[i]+runif(x$out[i])*(upper-x$predictionThreshold[i]) - vals <- c(val, vals) - } - vals[!is.na(vals)] - - vals2 <- c() - for(i in 1:length(x$predictionThreshold)){ - if(i!=length(x$predictionThreshold)){ - upper <- x$predictionThreshold[i+1]} else {upper <- min(x$predictionThreshold[i]+0.01,1)} - val2 <- x$predictionThreshold[i]+runif(x$nout[i])*(upper-x$predictionThreshold[i]) - vals2 <- c(val2, vals2) - } - vals2[!is.na(vals2)] - - x <- rbind(data.frame(variable=rep('outcome',length(vals)), value=vals), - data.frame(variable=rep('No outcome',length(vals2)), value=vals2) - ) - - plot <- ggplot2::ggplot(x, ggplot2::aes(x=value, - group=variable, - fill=variable)) + - ggplot2::geom_density(ggplot2::aes(x=value, fill=variable), alpha=.3) + - ggplot2::scale_x_continuous("Prediction Threshold")+#, limits=c(0,1)) + - ggplot2::scale_y_continuous("Density") + - ggplot2::guides(fill=ggplot2::guide_legend(title="Class")) - - plotResult[[type]] <- plot -} -return(plotResult) -} - - - - -plotPreferencePDF <- function(evaluation, 
fileName=NULL){ - - if(!is.null(evaluation$thresholdSummary$evaluation)){ - types <- unique(evaluation$thresholdSummary$evaluation) - } else{ - evaluation$thresholdSummary$evaluation <- 'na' - types <- 'na' - } - - plotResult <- list() - length(plotResult) <- length(types) - names(plotResult) <- types - - for(type in types){ - - ind <- 1:nrow(evaluation$thresholdSummary) - if(!is.null(evaluation$thresholdSummary$evaluation)){ - ind <- evaluation$thresholdSummary$evaluation == type - } - - x <- evaluation$thresholdSummary[ind,c('preferenceThreshold','truePositiveCount','trueNegativeCount', - 'falsePositiveCount','falseNegativeCount')] - x <- x[order(x$preferenceThreshold,-x$truePositiveCount, x$trueNegativeCount),] - x$out <- c(x$truePositiveCount[-length(x$truePositiveCount)]-x$truePositiveCount[-1], x$truePositiveCount[length(x$truePositiveCount)]) - x$nout <- c(x$falsePositiveCount[-length(x$falsePositiveCount)]-x$falsePositiveCount[-1], x$falsePositiveCount[length(x$falsePositiveCount)]) - - vals <- c() - for(i in 1:length(x$preferenceThreshold)){ - if(i!=length(x$preferenceThreshold)){ - upper <- x$preferenceThreshold[i+1]} else {upper <- 1} - val <- x$preferenceThreshold[i]+runif(x$out[i])*(upper-x$preferenceThreshold[i]) - vals <- c(val, vals) - } - vals[!is.na(vals)] - - vals2 <- c() - for(i in 1:length(x$preferenceThreshold)){ - if(i!=length(x$preferenceThreshold)){ - upper <- x$preferenceThreshold[i+1]} else {upper <- 1} - val2 <- x$preferenceThreshold[i]+runif(x$nout[i])*(upper-x$preferenceThreshold[i]) - vals2 <- c(val2, vals2) - } - vals2[!is.na(vals2)] - - x <- rbind(data.frame(variable=rep('outcome',length(vals)), value=vals), - data.frame(variable=rep('No outcome',length(vals2)), value=vals2) - ) - - plot <- ggplot2::ggplot(x, ggplot2::aes(x=value, - group=variable, - fill=variable)) + - ggplot2::geom_density(ggplot2::aes(x=value, fill=variable), alpha=.3) + - ggplot2::scale_x_continuous("Preference Threshold")+#, limits=c(0,1)) + - 
ggplot2::scale_y_continuous("Density") + - ggplot2::guides(fill=ggplot2::guide_legend(title="Class")) - - plotResult[[type]] <- plot - -} - -return(plotResult) -} - -plotPredictionDistribution <- function(evaluation){ - - if(!is.null(evaluation$predictionDistribution$evaluation)){ - types <- unique(evaluation$predictionDistribution$evaluation) - } else{ - evaluation$predictionDistribution$evaluation <- 'na' - types <- 'na' - } - - plotResult <- list() - length(plotResult) <- length(types) - names(plotResult) <- types - - for(type in types){ - - ind <- 1:nrow(evaluation$predictionDistribution) - if(!is.null(evaluation$predictionDistribution$evaluation)){ - ind <- evaluation$predictionDistribution$evaluation == type - } - x<- evaluation$predictionDistribution[ind,] - - #(x=Class, y=predictedProbabllity sequence: min->P05->P25->Median->P75->P95->max) - - non05 <- x$P05PredictedProbability[x$class==0] - non95 <- x$P95PredictedProbability[x$class==0] - one05 <- x$P05PredictedProbability[x$class==1] - one95 <- x$P95PredictedProbability[x$class==1] - - plot <- ggplot2::ggplot(x, - ggplot2::aes( - x=as.factor(class), - ymin=MinPredictedProbability, - lower=P25PredictedProbability, - middle=MedianPredictedProbability, - upper=P75PredictedProbability, - ymax=MaxPredictedProbability, - color=as.factor(class) - ) - ) + - ggplot2::coord_flip() + - ggplot2::geom_boxplot(stat="identity") + - ggplot2::scale_x_discrete("Class") + - ggplot2::scale_y_continuous("Predicted Probability") + - ggplot2::theme(legend.position="none") + - ggplot2::geom_segment(ggplot2::aes(x = 0.9, y = non05, - xend = 1.1, yend = non05), color='red') + - ggplot2::geom_segment(ggplot2::aes(x = 0.9, y = non95, - xend = 1.1, yend = non95), color='red') + - ggplot2::geom_segment(ggplot2::aes(x = 1.9, y = one05, - xend = 2.1, yend = one05)) + - ggplot2::geom_segment(ggplot2::aes(x = 1.9, y = one95, - xend = 2.1, yend = one95)) - - plotResult[[type]] <- plot - - } - - return(plotResult) -} - diff --git 
a/inst/shiny/PLPViewer/modules/download.R b/inst/shiny/PLPViewer/modules/download.R deleted file mode 100644 index 56f352895..000000000 --- a/inst/shiny/PLPViewer/modules/download.R +++ /dev/null @@ -1,122 +0,0 @@ -downloadViewer <- function(id) { - - ns <- shiny::NS(id) - - shiny::div( - - shinydashboard::box( - title = "Development R Package", - status = 'info', - solidHeader = T, - shiny::p("Click here to download an R package that contains all the settings requires to replicate the model development using any OMOP CDM database."), - shiny::actionButton( - inputId = ns('downloadPackageDev'), - label = "Download Development" - ) - ), - - shinydashboard::box( - title = "Validation R Package", - status = 'info', - solidHeader = T, - shiny::p("Click here to download an R package that contains all the settings requires to validate the existing model using any OMOP CDM database."), - shiny::actionButton( - inputId = ns('downloadPackageVal'), - label = "Download Validation" - ) - - ) - ) -} - -downloadServer <- function(id) { - shiny::moduleServer( - id, - function(input, output, session) { - - shiny::observeEvent( - input$downloadPackageDev, - { - dir.create(file.path('/Users/jreps/Downloads', 'devPackage'), recursive = T) - #Hydra::hydrate(specifications = specifications, outputFolder = outputPackageLocation) - createPackage <- tryCatch( - {downLoadSkeleton( - outputFolder = file.path('/Users/jreps/Downloads'), - packageName = 'devPackage', - skeletonType = 'SkeletonPredictionStudy' - )#'SkeletonPredictionValidationStudy' - }, error = function(e){return(NULL)} - ) - - if(!is.null(createPackage)){ - createPackage <- tryCatch( - {replaceName( - packageLocation = file.path('/Users/jreps/Downloads', 'devPackage'), - packageName = 'devPackage', - skeletonType = 'SkeletonPredictionStudy') - }, - error = function(e){return(NULL)} - ) - } - - - }) - - } - ) -} - -### DOWNLOAD - -downLoadSkeleton <- function( - outputFolder, - packageName, - skeletonType = 
'SkeletonPredictionStudy' -){ - utils::download.file( - url = paste0("https://github.com/ohdsi/",skeletonType,"/archive/master.zip"), - destfile = file.path(outputFolder, "package.zip") - ) - # unzip the .zip file - utils::unzip( - zipfile = file.path(outputFolder, "package.zip"), - exdir = outputFolder - ) - file.rename( - from = file.path(outputFolder, paste0(skeletonType, '-master')), - to = file.path(outputFolder, packageName) - ) - unlink(file.path(outputFolder, "package.zip")) - return(file.path(outputFolder, packageName)) -} - -# change name -replaceName <- function( - packageLocation = getwd(), - packageName = 'ValidateRCRI', - skeletonType = 'SkeletonPredictionValidationStudy' -){ - - filesToRename <- c(paste0(skeletonType,".Rproj"),paste0("R/",skeletonType,".R")) - for(f in filesToRename){ - ParallelLogger::logInfo(paste0('Renaming ', f)) - fnew <- gsub(skeletonType, packageName, f) - file.rename(from = file.path(packageLocation,f), to = file.path(packageLocation,fnew)) - } - - filesToEdit <- c( - file.path(packageLocation,"DESCRIPTION"), - file.path(packageLocation,"README.md"), - file.path(packageLocation,"extras/CodeToRun.R"), - dir(file.path(packageLocation,"R"), full.names = T) - ) - for( f in filesToEdit ){ - ParallelLogger::logInfo(paste0('Editing ', f)) - x <- readLines(f) - y <- gsub( skeletonType, packageName, x ) - cat(y, file=f, sep="\n") - - } - - return(packageName) -} diff --git a/inst/shiny/PLPViewer/modules/netBenefit.R b/inst/shiny/PLPViewer/modules/netBenefit.R deleted file mode 100644 index 9ad7cc132..000000000 --- a/inst/shiny/PLPViewer/modules/netBenefit.R +++ /dev/null @@ -1,132 +0,0 @@ -nbViewer <- function(id) { - ns <- shiny::NS(id) - - shiny::div( - - shiny::fluidRow( - shinydashboard::box( - status = 'info', - width = 12, - title = 'Select net benefit type to view:', - solidHeader = TRUE, - shiny::uiOutput(ns('nbSelect')) - ) - ), - - shiny::fluidRow( - shinydashboard::box( - status = 'info', - width = 6, - title = 'Net Benefit 
Plot', - solidHeader = TRUE, - side = "right", - shinycssloaders::withSpinner( - shiny::plotOutput(ns('nbPlot')) - ) - ), - - shinydashboard::box( - status = 'info', - width = 6, - title = 'Summary', - solidHeader = TRUE, - DT::dataTableOutput(ns('nbTable')) - ) - ) - ) -} - -nbServer <- function(id, plpResult) { - shiny::moduleServer( - id, - function(input, output, session) { - - output$nbSelect = shiny::renderUI({ - shiny::selectInput( - inputId = session$ns('nbSelectInput'), - label = 'Type:', - choices = unique(plpResult()$performanceEvaluation$thresholdSummary$evaluation), - multiple = F, - selectize=FALSE - ) - }) - - output$nbTable <- DT::renderDataTable({ - if(is.null(plpResult()$performanceEvaluation)){ - return(NULL) - } else{ - result <- extractNetBenefit( - performanceEvaluation = plpResult()$performanceEvaluation, - type=trimws(input$nbSelectInput) - ) - unique(result) - result$treatAll <- format(result$treatAll, digits = 2, scientific = F) - result$netBenefit <- format(result$netBenefit, digits = 2, scientific = F) - result - } - }) - - output$nbPlot <- shiny::renderPlot({ - if(is.null(plpResult()$performanceEvaluation)){ - return(NULL) - } else{ - result <- extractNetBenefit( - performanceEvaluation = plpResult()$performanceEvaluation, - type=trimws(input$nbSelectInput) - ) - result <- unique(result) - ind <- !is.na(result$netBenefit) & is.finite(result$netBenefit) & !is.null(result$netBenefit) & is.finite(result$pt) - - df2 <- tidyr::pivot_longer( - data = result, - cols = colnames(result)[colnames(result) != 'pt'], - names_to = 'variable', - values_to = 'value' - ) - - - ggplot2::ggplot( - df2, - ggplot2::aes(x=pt, - y=value, - group=variable, - color = variable - ) - ) + - ggplot2::geom_line(ggplot2::aes(linetype=variable))+ - ggplot2::geom_point(ggplot2::aes(shape=variable)) - } - }) - - } - ) -} - - - -extractNetBenefit <- function(performanceEvaluation, type=NULL, modelId=NULL){ - data <- performanceEvaluation$thresholdSummary - - 
if(!is.null(type)){ - if(!is.null(data$evaluation[1])){ - data <- data[data$evaluation==type,] - } - } - - pt <- data$predictionThreshold - TP <- data$truePositiveCount - FP <- data$falsePositiveCount - n <- data$positiveCount + data$negativeCount - - treatAll <- data$trueCount/n-data$falseCount/n*(pt/(1-pt)) - - if(!is.null(modelId[1])){ - netbenefit <- data.frame(modelId=modelId, pt=pt, netBenefit=TP/n-(FP/n)*(pt/(1-pt)), - treatAll=treatAll) - }else{ - netbenefit <- data.frame(pt=pt, netBenefit=TP/n-(FP/n)*(pt/(1-pt)), - treatAll=treatAll) - } - - return(netbenefit) -} diff --git a/inst/shiny/PLPViewer/modules/settings.R b/inst/shiny/PLPViewer/modules/settings.R deleted file mode 100644 index 4abbd26e2..000000000 --- a/inst/shiny/PLPViewer/modules/settings.R +++ /dev/null @@ -1,120 +0,0 @@ -settingsViewer <- function(id) { - ns <- shiny::NS(id) - shiny::div( - shiny::h3('Model Settings: ', - shiny::a("help", href="https://ohdsi.github.io/PatientLevelPrediction/reference/index.html", target="_blank") - ), - DT::dataTableOutput(ns('modelTable')), - - shiny::h3('Population Settings: ', - shiny::a("help", href="https://ohdsi.github.io/PatientLevelPrediction/reference/createStudyPopulation.html", target="_blank") - ), - DT::dataTableOutput(ns('populationTable')), - - shiny::h3('Covariate Settings: ', - shiny::a("help", href="http://ohdsi.github.io/FeatureExtraction/reference/createCovariateSettings.html", target="_blank") - ), - DT::dataTableOutput(ns('covariateTable')), - - shiny::h3("Hyper-parameters"), - DT::dataTableOutput(ns('hpTable')), - shiny::h3("Attrition"), - DT::dataTableOutput(ns('attritionTable')) - ) -} - -setingsServer <- function(id, plpResult) { - shiny::moduleServer( - id, - function(input, output, session) { - - # input tables - output$modelTable <- DT::renderDataTable( - formatModSettings(plpResult()$model$settings$modelSettings ) - ) - output$covariateTable <- DT::renderDataTable( - formatCovSettings(plpResult()$model$settings$covariateSettings) 
- ) - output$populationTable <- DT::renderDataTable( - formatPopSettings(plpResult()$model$settings$populationSettings) - ) - - output$hpTable <- DT::renderDataTable( - DT::datatable( - as.data.frame( - plpResult()$model[[ - which( - names(plpResult()$model) %in% c('validationDetails','trainDetails') - ) - ]]$hyperParamSearch - ), - options = list(scrollX = TRUE), - colnames = 'Fold AUROC' - ) - ) - - output$attritionTable <- DT::renderDataTable( - plpResult()$model[[ - which( - names(plpResult()$model) %in% c('validationDetails','trainDetails') - ) - ]]$attrition - ) - - - } - ) -} - - -# helpers -# format modelSettings -formatModSettings <- function(modelSettings){ - modelset <- data.frame(Setting = c(names(modelSettings$finalModelParameters)), - Value = c(unlist(lapply(modelSettings$finalModelParameters, - function(x) paste0(x, collapse=',')))) - ) - row.names(modelset) <- NULL - return(modelset) -} - -# format covariateSettings -formatCovSettings <- function(covariateSettings){ - - if(class(covariateSettings)=='covariateSettings'){ - covariateSettings <- list(covariateSettings) - } - - #code for when multiple covariateSettings - covariates <- c() - for(i in 1:length(covariateSettings)){ - covariatesTemp <- data.frame( - fun = attr(covariateSettings[[i]],'fun'), - setting = i, - covariateName = names(covariateSettings[[i]]), - SettingValue = unlist( - lapply( - covariateSettings[[i]], - function(x) paste0(x, collapse='-') - ) - ) - ) - covariates <- rbind(covariates,covariatesTemp) - } - row.names(covariates) <- NULL - return(covariates) -} - -# format populationSettings -formatPopSettings <- function(populationSettings){ - population <- populationSettings - population$attrition <- NULL # remove the attrition as result and not setting - population <- data.frame(Setting = names(population), - Value = unlist(lapply(population, - function(x) paste0(x, - collapse='-'))) - ) - row.names(population) <- NULL - return(population) -} - diff --git 
a/inst/shiny/PLPViewer/modules/summaryTable.R b/inst/shiny/PLPViewer/modules/summaryTable.R deleted file mode 100644 index 161828e9e..000000000 --- a/inst/shiny/PLPViewer/modules/summaryTable.R +++ /dev/null @@ -1,89 +0,0 @@ -summaryViewer <- function(id) { - ns <- shiny::NS(id) - DT::dataTableOutput(ns('summaryTable')) -} - -summaryServer <- function( - id, - summaryTable, - cNamesExclude = c('studyId','Analysis','analysisId','resultId','researcherId','addExposureDaysToStart','addExposureDaysToEnd', 'plpResultLocation', 'plpResultLoad') -) { - shiny::moduleServer( - id, - function(input, output, session) { - - # check if this makes drpdwn filter - summaryTable$T <- as.factor(summaryTable$T) - summaryTable$O <- as.factor(summaryTable$O) - - output$summaryTable <- DT::renderDataTable( - DT::datatable( - summaryTable[,!colnames(summaryTable)%in%cNamesExclude], - rownames= FALSE, - selection = 'single', - filter = 'top', - extensions = 'Buttons', - options = list( - dom = 'Blfrtip' , - buttons = c(I('colvis'), 'copy', 'excel', 'pdf' ), - scrollX = TRUE - #pageLength = 100, lengthMenu=c(10, 50, 100,200) - ), - - container = htmltools::withTags( - table( - class = 'display', - thead( - #tags$th(title=active_columns[i], colnames(data)[i]) - tr(apply( - data.frame( - colnames = c( - 'Dev', - 'Val', - 'T', - 'O', - 'Model', - 'Covariate setting', - 'TAR', - 'AUROC', - 'AUPRC', - 'T Size', - 'O Count', - 'Val (%)', - 'O Incidence (%)', - 'timeStamp' - ), - labels = c('Database used to develop the model', - 'Database used to evaluate model', - 'Target population - the patients you want to predict risk for', - 'Outcome - what you want to predict', - 'Model type', - 'Id for the covariate/settings used', - 'Time-at-risk period', - 'Area under the reciever operating characteristics (test or validation)', - 'Area under the precision recall curve (test or validation)', - 'Target population size in the data', - 'Outcome count in the data', - 'The percentage of data used to evaluate 
the model', - 'Percentage of target population that have outcome during time-at-risk', - 'date and time of execution') - ), 1, - function(x) th(title=x[2], x[1]) - ) - ) - ) - ) - ) - - ) - ) - - selectedRow <- shiny::reactive({ - input$summaryTable_rows_selected - }) - - return(selectedRow) - - } - ) -} diff --git a/inst/shiny/PLPViewer/modules/validation.R b/inst/shiny/PLPViewer/modules/validation.R deleted file mode 100644 index 8c30c17e5..000000000 --- a/inst/shiny/PLPViewer/modules/validation.R +++ /dev/null @@ -1,477 +0,0 @@ -validationViewer <- function(id) { - ns <- shiny::NS(id) - - shiny::div( - style = "font-size:70%", - shiny::p('Select one or more rows to generate comparison ROC and calibration plots'), - DT::dataTableOutput(ns('validationTable')), - - shiny::fluidRow( - shinydashboard::box( - status = 'info', - title = "Roc Plot", - solidHeader = TRUE, - shinycssloaders::withSpinner(shiny::plotOutput(ns('valRoc'))) - ), - shinydashboard::box( - status = 'info', - title = "Calibration Plot", - solidHeader = TRUE, - side = "right", - shinycssloaders::withSpinner(shiny::plotOutput(ns('valCal'))) - ) - ) - ) -} - -validationServer <- function( - id, - plpResult, - result, - validation, - inputType, - useDatabase, - summaryTable, - resultRow, - con, - mySchema, - connectionDetails, - targetDialect = NULL, - myTableAppend = NULL -) { - shiny::moduleServer( - id, - function(input, output, session) { - - if (useDatabase == F){ - validationTable <- shiny::reactive( - dplyr::filter( - summaryTable, - Analysis == summaryTable[resultRow(),'Analysis'] - ) - ) - } - else{ - # validationTable <- shiny::reactive(getValSummary(con, mySchema, summaryTable[filterIndex(),'Analysis'][trueRow()])) - validationTable <- shiny::reactive( - getValSummary( - con, - mySchema, - modelId = summaryTable[resultRow(),'Analysis'], - targetDialect = targetDialect, - myTableAppend = myTableAppend - ) - ) - } - - #shiny::reactive({print(validationTable())}) - #output$validationTable <- 
DT::renderDataTable(dplyr::select(validationTable(),c(Analysis, Dev, Val, AUC)), rownames= FALSE) - output$validationTable <- DT::renderDataTable({ - - if(nrow(validationTable())>0){ - - cind <- c('Analysis','T','O', 'Val', 'AUROC','calibrationInLarge intercept', 'T Size', 'O Count','Val (%)')%in%colnames(validationTable()) - validationTable()[,c('Analysis','T','O', 'Val', 'AUROC','calibrationInLarge intercept', 'T Size', 'O Count','Val (%)')[cind]] - } else{ - NULL - } - }, - escape = FALSE, - filter = 'top', - extensions = 'Buttons', - options = list( - dom = 'Blfrtip', - scrollX = TRUE - ), - rownames= FALSE - ) #options = list(filter = 'top')) - - # need to modify this for non-database results! - valtemplist <- list() - valResult <- shiny::reactive({ - - valTable <- validationTable()[input$validationTable_rows_selected,,] - if(nrow(valTable)>0){ - names <- valTable[, "Val"] - Ts <- valTable[, "T"] - Os <- valTable[, "O"] - for (i in 1:nrow(valTable)){ - - #make i reactive - iReact <- shiny::reactiveVal(i) - - valtemplist[[i]] <- getPlpResult( - result, - validation, - valTable, - inputType, - iReact, - val = T, - mySchema = mySchema, - connectionDetails = connectionDetails, - targetDialect = targetDialect, - myTableAppend = myTableAppend - ) - } - list( - results = valtemplist, - databaseName = names, - Ts=Ts, - Os=Os - ) - }else{ - list( - results = list(list()), - databaseName = '', - Ts='', - Os='' - ) - } - }) - - output$valRoc <- shiny::renderPlot({ - - if(is.null(valResult()$results[[1]]$performanceEvaluation)){ - return(NULL) - } else{ - plotRocs( - evaluationList = valResult()$results, - modelNames = paste0(1:length(valResult()$Ts),':',substr(valResult()$Ts,1,5),'-',substr(valResult()$Os,1,5),'-', substr(valResult()$databaseName,1,5)) - ) - } - }) - output$valCal <- shiny::renderPlot({ - - if(is.null(valResult()$results[[1]]$performanceEvaluation)){ - return(NULL) - } else{ - plotCalsSmooth( - evaluationList = valResult()$results, - modelNames = 
paste0(1:length(valResult()$Ts),':',substr(valResult()$Ts,1,5),'-',substr(valResult()$Os,1,5),'-', substr(valResult()$databaseName,1,5)) - ) - } - - }) - - } - ) -} - - - -# helper for multiple roc plots -plotRocs <- function( - evaluationList, - modelNames, - type= NULL, - fileName=NULL -){ - if(class(evaluationList)!='list'){ - stop('Need to enter a list') - } - - if("thresholdSummary" %in% names(evaluationList[[1]]) ){ - evaluationList <- evaluationList - } else if("performanceEvaluation" %in% names(evaluationList[[1]]) ){ - evaluationList <- lapply(evaluationList, function(x) x$performanceEvaluation) - } else { - stop('Wrong evaluationList') - } - - if(missing(modelNames)){ - modelNames <- paste0('Model ', 1:length(evaluationList)) - } - - createSteps <- function( - evaluation, - type, - name - ){ - - if(is.null(type)){ - if(length(unique(evaluation$thresholdSummary$evaluation)) > 1){ - ind <- evaluation$thresholdSummary$evaluation%in%c('Test','validation') - x<- evaluation$thresholdSummary[ind,c('falsePositiveRate','sensitivity')] - } else { - x<- evaluation$thresholdSummary[,c('falsePositiveRate','sensitivity')] - } - } else { - ind <- evaluation$thresholdSummary$evaluation == type - x <- evaluation$thresholdSummary[ind,c('falsePositiveRate','sensitivity')] - } - - x <- x[order(x$falsePositiveRate, x$sensitivity),] - - # add the bit to get the step - stepsExtra <- cbind(x[-1,1], x[-nrow(x),2]) - colnames(stepsExtra) <- colnames(x) - x <- rbind(c(1,1), x, stepsExtra, c(0,0)) - x <- x[order(x$falsePositiveRate, x$sensitivity),] - - x$model <- name - return(x) - } - - stepVals <- lapply( - 1:length(evaluationList), - function(i){ - createSteps( - evaluationList[[i]], - type=type[i], - name=modelNames[i] - ) - } - ) - data <- do.call(rbind, stepVals) - - plot <- ggplot2::ggplot( - data = data, - ggplot2::aes( - x = falsePositiveRate, - y = sensitivity, - color = model - ) - ) + - ggplot2::geom_polygon( - ggplot2::aes(fill = model), - alpha = 0.2 - ) + - 
ggplot2::geom_line(size = 1) + - ggplot2::geom_abline( - intercept = 0, - slope = 1, - linetype = 2 - ) + - ggplot2::scale_x_continuous( - "1 - specificity", - limits=c(0,1) - ) + - ggplot2::scale_y_continuous( - "Sensitivity", - limits=c(0,1) - ) + - ggplot2::scale_color_discrete(name = 'Result') + - ggplot2::scale_fill_discrete(guide = FALSE) - - if (!is.null(fileName)){ - ggplot2::ggsave(fileName, plot, width = 5, height = 4.5, dpi = 400) - } - - return(plot) -} - -plotCals <- function( - evaluationList, - modelNames, - type = NULL, - fileName = NULL -){ - - if("calibrationSummary" %in% names(evaluationList[[1]]) ){ - evaluationList <- evaluationList - }else if("performanceEvaluation" %in% names(evaluationList[[1]]) ){ - evaluationList <- lapply(evaluationList, function(x) x$performanceEvaluation) - } else{ - stop('Wrong evaluationList') - } - - if(missing(modelNames)){ - modelNames <- paste0('Model ', 1:length(evaluationList)) - } - - calVal <- function( - evaluation, - type, - name - ){ - - if(is.null(type)){ - if(length(unique(evaluation$calibrationSummary$evaluation)) > 1){ - ind <- evaluation$calibrationSummary$evaluation%in%c('Test','validation') - x<- evaluation$calibrationSummary[ind,c('averagePredictedProbability','observedIncidence','PersonCountAtRisk')] - } else{ - x<- evaluation$calibrationSummary[,c('averagePredictedProbability','observedIncidence','PersonCountAtRisk')] - } - } else{ - ind <- evaluation$calibrationSummary$evaluation == type - x<- evaluation$calibrationSummary[ind,c('averagePredictedProbability','observedIncidence','PersonCountAtRisk')] - } - - cis <- apply( - x, - 1, - function(x){ - stats::binom.test( - x[2]*x[3], - x[3], - alternative = c("two.sided"), - conf.level = 0.95 - )$conf.int - } - ) - x$lci <- cis[1,] - x$uci <- cis[2,] - x$model <- name - return(x) - } - - calVal <- lapply(1:length(evaluationList), function(i) calVal(evaluationList[[i]], type=type[i], name=modelNames[i])) - data <- do.call(rbind, calVal) - - maxes <- 
max(max(data$averagePredictedProbability), max(data$observedIncidence))*1.1 - - limits <- ggplot2::aes( - ymax = uci, - ymin= lci - ) - - plot <- ggplot2::ggplot( - data = data, - ggplot2::aes( - x = averagePredictedProbability, - y = observedIncidence, - color = model - ) - ) + - ggplot2::geom_point(size=2) + - ggplot2::geom_errorbar(limits) + - ggplot2::geom_line() + - ggplot2::geom_abline( - intercept = 0, - slope = 1, - linetype = 5, - size=0.4, - show.legend = TRUE - ) + - ggplot2::scale_x_continuous("Average Predicted Probability") + - ggplot2::scale_y_continuous("Observed Fraction With Outcome") + - ggplot2::coord_cartesian( - xlim = c(0, maxes), - ylim = c(0,maxes) - ) + - ggplot2::scale_color_discrete(name = 'Result') - - if (!is.null(fileName)){ - ggplot2::ggsave(fileName, plot, width = 5, height = 4.5, dpi = 400) - } - - return(plot) -} - - - -plotCalsSmooth <- function( - evaluationList, - modelNames, - type = NULL - ){ - - if("calibrationSummary" %in% names(evaluationList[[1]]) ){ - evaluationList <- evaluationList - }else if("performanceEvaluation" %in% names(evaluationList[[1]]) ){ - evaluationList <- lapply(evaluationList, function(x) x$performanceEvaluation) - } else{ - stop('Wrong evaluationList') - } - - if(missing(modelNames)) - modelNames <- paste0('Model ', 1:length(evaluationList)) - - calVal <- function( - evaluation, - type, - name - ){ - - if(is.null(type)){ - if(length(unique(evaluation$calibrationSummary$evaluation)) > 1){ - ind <- evaluation$calibrationSummary$evaluation %in% c('Test','validation') - data <- evaluation$calibrationSummary[ind,c('averagePredictedProbability','observedIncidence','PersonCountAtRisk')] - } else{ - data <- evaluation$calibrationSummary[,c('averagePredictedProbability','observedIncidence','PersonCountAtRisk')] - } - } else{ - ind <- evaluation$calibrationSummary$evaluation==type - data <- evaluation$calibrationSummary[ind,c('averagePredictedProbability','observedIncidence','PersonCountAtRisk')] - } - - maxes 
<- max(max(data$averagePredictedProbability), max(data$observedIncidence))*1.1 - - fit <- stats::loess(data$observedIncidence ~ data$averagePredictedProbability, degree = 1) - smoothData <- data.frame( - p = seq(0,maxes,0.0001), - y = stats::predict(fit, seq(0,maxes,0.0001)), - model = name - ) - smoothData <- smoothData[!is.na(smoothData$y),] - - return(smoothData) - } - - getVal <- function( - evaluation, - type, - name - ){ - - if(is.null(type)){ - if(length(unique(evaluation$calibrationSummary$evaluation)) > 1){ - ind <- evaluation$calibrationSummary$evaluation %in% c('Test','validation') - data <- evaluation$calibrationSummary[ind, c('averagePredictedProbability','observedIncidence')] - } else{ - data <- evaluation$calibrationSummary[, c('averagePredictedProbability','observedIncidence')] - } - } else{ - ind <- evaluation$calibrationSummary$evaluation == type - data <- evaluation$calibrationSummary[ind, c('averagePredictedProbability','observedIncidence')] - } - - - values <- data.frame( - p = data$averagePredictedProbability, - y = data$observedIncidence, - model = name - ) - - values <- values[seq(1, nrow(values), 10),] - - return(values) - } - - calVal<- lapply(1:length(evaluationList), function(i) calVal(evaluationList[[i]], type=type[i], name=modelNames[i])) - smoothData <- do.call(rbind, calVal) - - values <- do.call(rbind,lapply(1:length(evaluationList), function(i) getVal(evaluationList[[i]], type=type[i], name=modelNames[i]))) - - plot <- ggplot2::ggplot( - data = smoothData, - ggplot2::aes( - x = .data$p, - y = .data$y, - color = .data$model - ) - ) + - ggplot2::geom_line( - ggplot2::aes( - linetype = "Loess"), - size = 1, - show.legend = T - ) + - ggplot2::geom_abline( - intercept = 0, - slope = 1, - linetype = 5, - size=0.4, - show.legend = TRUE, - color = "black" - ) + - ggplot2::geom_point(data = values) + - ggplot2::labs( - x = "Average Predicted Probability", - y = "Observed Fraction With Outcome" - ) - #ggplot2::scale_color_discrete(name = 
'Result') - - return(plot) -} - diff --git a/inst/shiny/PLPViewer/processing.R b/inst/shiny/PLPViewer/processing.R deleted file mode 100644 index 3c04c4b77..000000000 --- a/inst/shiny/PLPViewer/processing.R +++ /dev/null @@ -1,297 +0,0 @@ -settingsNames <- c('analysisId','modelSettings','covariateSetting', 'targetName', 'outcomeName', - 'populationSetting','modelSettingName') - -getSummary <- function(result,inputType,validation){ - if(inputType == 'plpResult' || inputType == 'plpNoClass'){ - - sumTab <- getSummaryFromObject(result) - - if(!is.null(validation)){ # what about a list of validations? - sumTab <- rbind(sumTab, getSummaryFromObject(validation)) - } - - } else if( inputType == 'file') { - sumTab <- summaryPlpAnalyses(result) - } - - #remove empty rows - emptyInd <- is.na(sumTab[,'AUROC']) - if(sum(emptyInd)>0){ - sumTab <- sumTab[!emptyInd,] - } - - columnsOfInt <- c('analysisId', - 'devDatabase', 'valDatabase', - 'targetName', 'outcomeName', - 'modelSettingName','covariateSetting', - 'TAR', 'AUROC','AUPRC', - 'populationSize', 'outcomeCount', - 'valPercent', 'incidence', - 'timeStamp', 'plpResultLocation', 'plpResultLoad' - ) - - # add missing columns - if(sum(!columnsOfInt %in% colnames(sumTab))>0){ - missInd <- columnsOfInt[!columnsOfInt %in% colnames(sumTab)] - for(i in 1:length(missInd)){ - sumTab[,missInd[i]] <- 'NA' - } - } - - sumTab <- sumTab[,columnsOfInt] - - colnames(sumTab) <- c( - 'Analysis','Dev', 'Val', 'T', 'O','Model','covariateSetting', - 'TAR', 'AUROC','AUPRC', 'T Size','O Count','% used for Eval','O Incidence (%)', - 'timeStamp', 'plpResultLocation', 'plpResultLoad' - ) - - return(sumTab) -} - - -getSummaryFromObject <- function(result, analysisId = NULL){ - - timeV <- ifelse(is.null(result$executionSummary$ExecutionDateTime), '2000-01-01', result$executionSummary$ExecutionDateTime) - - TAR <- getTAR(result$model$settings$populationSettings) - eval <- as.data.frame(result$performanceEvaluation$evaluationStatistics) - - for(i in 
1:ncol(eval)){ - eval[,i] <- unlist(eval[,i]) - } - - if(is.null(eval$evaluation)){ - eval$evaluation <- 'Test' - } - if(length(unique(eval$evaluation)) == 1){ - eval$evaluation <- 'Test' - } - - eval <- tidyr::pivot_wider( - data = eval %>% - dplyr::mutate(variable = paste(.data$evaluation, .data$metric, sep = '_')) %>% - dplyr::select(-.data$evaluation, -.data$metric), - names_from = 'variable', - values_from = 'value' - ) - - #eval <- reshape2::dcast( - # data = eval, - # formula = . ~ evaluation + metric, - # value.var = 'value' - # ) - - AUC <- paste0( - signif(as.double(eval$Test_AUROC),3), - ' (', - signif(as.double(eval$`Test_95% lower AUROC`),3), - '-', - signif(as.double(eval$`Test_95% upper AUROC`),3), - ')' - ) - - AUPRC <- signif(as.double(eval$Test_AUPRC),3) - - populationSize <- ifelse(is.null(eval$Train_populationSize), 0, as.double(eval$Train_populationSize)) + as.double(eval$Test_populationSize) - outcomeCount <- ifelse(is.null(eval$Train_outcomeCount), 0, as.double(eval$Train_outcomeCount)) + as.double(eval$Test_outcomeCount) - valPercent <- round(as.double(eval$Test_populationSize)/populationSize*100) - incidence <- signif(100*outcomeCount/populationSize ,3) - - if(!is.null(result$model$trainDetails)){ - devDatabase <- ifelse(is.null(result$model$trainDetails$cdmDatabaseSchema),'Missing',result$model$trainDetails$cdmDatabaseSchema) - valDatabase <- devDatabase - } else{ - devDatabase <- ifelse(is.null(result$model$validationDetails$developmentDatabase),'Missing',result$model$validationDetails$developmentDatabase) - valDatabase <- ifelse(is.null(result$model$validationDetails$cdmDatabaseSchema),'Missing',result$model$validationDetails$cdmDatabaseSchema) -} - allRes <- data.frame(analysisId = ifelse(is.null(analysisId), ifelse(is.null(result$analysisRef$analysisId), 'None', result$analysisRef$analysisId), analysisId), - devDatabase = devDatabase, - valDatabase = valDatabase, - cohortName = 'T', # needed? - outcomeName = 'O', # needed? 
- modelSettingName = result$model$settings$modelSettings$model, - #covariateSetting = 1, - TAR = TAR, - AUROC = AUC, - AUPRC = AUPRC, - populationSize = populationSize, - outcomeCount = outcomeCount, - valPercent = valPercent, - incidence = incidence, - timeStamp = timeV, - plpResultLocation = 'NULL', - plpResultLoad = 'NULL' - ) - - return(allRes) -} - - - -summaryPlpAnalyses <- function(analysesLocation){ - # loads the analyses and validations to get summaries - #======================================== - settings <- utils::read.csv(file.path(analysesLocation,'settings.csv')) - settings <- settings[,!colnames(settings)%in%c('plpDataFolder','studyPopFile','plpResultFolder')] - settings$analysisId <- gsub('Analysis_','', settings$analysisId) # fixing if Analysis_id in settings - settings$analysisId <- paste0('Analysis_', settings$analysisId) - - analysisIds <- dir(file.path(analysesLocation), recursive = F, full.names = T) - analysisIds <- analysisIds[grep('Analysis_',analysisIds)] - - devPerformance <- do.call(rbind,lapply(file.path(analysisIds), getPerformance)) - - # updated this - devPerformance <- merge(settings[,settingsNames[settingsNames %in% colnames(settings)]], - devPerformance[, !colnames(devPerformance) %in% c('cohortName','outcomeName')[c('cohortName','outcomeName') %in% colnames(settings)]], by='analysisId', all.x=T) - - validationLocation <- file.path(analysesLocation,'Validation') - if(length(dir(validationLocation))>0){ - valPerformances <- c() - valDatabases <- dir(validationLocation, recursive = F, full.names = T) - if(length(grep('plplog.txt', valDatabases))>0){ - valDatabases <- valDatabases[-grep('plplog.txt', valDatabases)] - } - for(valDatabase in valDatabases){ - - valAnalyses <- dir(valDatabase, recursive = F, full.names = T) - valAnalyses <- valAnalyses[grep('Analysis_', valAnalyses)] - valPerformance <- do.call(rbind,lapply(file.path(valAnalyses), function(x) getPerformance(x))) - - valSettings <- settings[,settingsNames[settingsNames 
%in% colnames(settings)]] # removed TAR bits - valPerformance <- merge( - valSettings, - valPerformance[, !colnames(valPerformance) %in% c('cohortName','outcomeName')[c('cohortName','outcomeName') %in% colnames(settings)]], - by='analysisId' - ) - valPerformance <- valPerformance[,colnames(devPerformance)] # make sure same order - valPerformances <- rbind(valPerformances, valPerformance) - } - - if(ncol(valPerformances)==ncol(devPerformance)){ - allPerformance <- rbind(devPerformance,valPerformances) - } else{ - stop('Issue with dev and val performance data.frames') - } - } else { - allPerformance <- devPerformance - } - - return(allPerformance) -} - -getPerformance <- function(analysisLocation){ - - getType <- function(analysisLocation){ - - if(file.exists(file.path(analysisLocation, 'plpResult.rds'))){ - return('rds') - } - - if(file.exists(file.path(analysisLocation, 'validationResult.rds'))){ - return('rds') - } - - if(dir.exists(file.path(analysisLocation, 'plpResult'))){ - return('runPlp') - } - - if(dir.exists(file.path(analysisLocation, 'validationResult'))){ - return('runPlp') - } - - - if(dir.exists(file.path(analysisLocation, 'performanceEvaluation'))){ - return('csv') - } - - return('none') - - } - - analysisId <- strsplit(analysisLocation, '/')[[1]] - analysisId <- analysisId[length(analysisId)] - - type <- getType(analysisLocation) # csv, rds, runPlp - print(type) - - if(type == 'csv'){ - - require(PatientLevelPrediction) - res <- PatientLevelPrediction::loadPlpShareable(file.path(analysisLocation)) - result <- getSummaryFromObject(result = res, analysisId = analysisId) - location <- file.path(analysisLocation) - plpResultLoad <- 'loadPlpShareable' - - } else if(type == 'rds'){ - - # read rds here - res <- readRDS(file.path(analysisLocation,'plpResult.rds')) - result <- getSummaryFromObject(result = res, analysisId = analysisId) - location <- file.path(analysisLocation, 'plpResult.rds') - plpResultLoad <- 'readRDS' - - } else if(type == 'runPlp'){ - - 
location <- file.path(analysisLocation, 'plpResult') - if(!dir.exists(location)){ - location <- file.path(analysisLocation, 'validationResult') - } - - require(PatientLevelPrediction) - res <- loadPlpResult(location) - result <- getSummaryFromObject(result = res, analysisId = analysisId) - plpResultLoad <- 'loadPlpResult' - - } else{ - # return empty result - analysisId <- strsplit(analysisLocation, '/')[[1]] - result <- data.frame( - AnalysisId = analysisId, - devDatabase = 'missing', - valDatabase = 'missing', - targetName = 'T', # NEEDED? - outcomeName = 'O', # NEEDED? - modelSettingName = 'none', - #covariateSetting = 1, - TAR = '?', - AUROC = 0.000, - AUPRC = 0, - populationSize = 0, - outcomeCount = 0, - valPercent = 0, - incidence = 0, - timeStamp = as.Date('1900-01-01'), - plpResultLocation = '', - plpResultLoad = 'loadPlpResult' - ) - location <- '' - plpResultLoad <- 'loadPlpResult' - - } - - result$plpResultLocation <- location - result$plpResultLoad <- plpResultLoad - - #remove settings - result[,!colnames(result) %in% settingsNames] - - return(result) -} - - -getTAR <- function(x){ - starttar <- unique(x$startAnchor) - if(is.null(starttar)){ - starttar <- ifelse(unique(x$addExposureDaysToStart), 'cohort end','cohort start') - } - endtar <- unique(x$endAnchor) - if(is.null(endtar)){ - endtar <- ifelse(unique(x$addExposureDaysToEnd), 'cohort end','cohort start') - } - TAR <- paste0('(',starttar,' + ',unique(x$riskWindowStart),') - (', endtar,' + ',unique(x$riskWindowEnd),')') - return(TAR) - #return('cohort start + 1 - cohort start + 365') -} - diff --git a/inst/shiny/PLPViewer/server.R b/inst/shiny/PLPViewer/server.R deleted file mode 100644 index afe359d6a..000000000 --- a/inst/shiny/PLPViewer/server.R +++ /dev/null @@ -1,169 +0,0 @@ -# @file server.R -# -# Copyright 2021 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may 
not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -source("helpers.R") -source("emptyPlot.R") - -source("modules/summaryTable.R") -source("modules/covariateSummary.R") -source("modules/settings.R") -source("modules/cutoff.R") -source("modules/discrimination.R") -source("modules/calibration.R") -source("modules/netBenefit.R") -source("modules/validation.R") -source("modules/download.R") - -server <- shiny::shinyServer(function(input, output, session) { - session$onSessionEnded(shiny::stopApp) - - #============= - # sidebar menu - #============= - if(useDatabase == F){ - - output$sidebarMenu <- shinydashboard::renderMenu(shinydashboard::sidebarMenu(id ='menu', - addInfo(shinydashboard::menuItem("Description", tabName = "Description", icon = shiny::icon("home")), "DescriptionInfo"), - addInfo(shinydashboard::menuItem("Results", tabName = "Summary", icon = shiny::icon("table")), "SummaryInfo"), - #addInfo(shinydashboard::menuItem("Log", tabName = "Log", icon = shiny::icon("list")), "LogInfo"), - addInfo(shinydashboard::menuItem("Data Info", tabName = "DataInfo", icon = shiny::icon("database")), "DataInfoInfo"), - addInfo(shinydashboard::menuItem("Help", tabName = "Help", icon = shiny::icon("info")), "HelpInfo") - )) - } else { - - shiny::observe({ - studyId <- shiny::parseQueryString(session$clientData$url_search)[['studyId']] - - print(paste0('StudyId: ', studyId)) - if(!is.null(studyId)){ - summaryTable <- summaryTable[summaryTable$studyId == studyId, ] - } - - }) - - output$sidebarMenu <- shinydashboard::renderMenu(shinydashboard::sidebarMenu(id 
='menu', - addInfo(shinydashboard::menuItem("Description", tabName = "Description", icon = shiny::icon("home")), "DescriptionInfo"), - addInfo(shinydashboard::menuItem("Results", tabName = "Summary", icon = shiny::icon("table")), "SummaryInfo"), - addInfo(shinydashboard::menuItem("Data Info", tabName = "DataInfo", icon = shiny::icon("database")), "DataInfoInfo"), - addInfo(shinydashboard::menuItem("Help", tabName = "Help", icon = shiny::icon("info")), "HelpInfo") - )) - } - - # =========================================== - # RESULT viewer - # =========================================== - - # use the summary module to select a result via row selection - resultRow <- summaryServer('sumTab', summaryTable) - - # change to single model explore tab when summary table row is selected - shiny::observeEvent(resultRow(), { - shiny::updateTabsetPanel(session, "allView", selected = "Explore Selected Model") - }) - - # this loads all the results - plpResult <- shiny::reactive({getPlpResult(result, - validation, - summaryTable, - inputType, - val = F, - resultRow, - mySchema = mySchema, - connectionDetails = connectionDetails, - targetDialect = targetDialect, - myTableAppend = myTableAppend)}) - - - # =========================================== - # Single Result Exploring Modules - # =========================================== - - covariateSummaryServer('covariateSummary', - plpResult, - summaryTable, - resultRow, - mySchema, - con, - inputSingleView = input$singleView, - myTableAppend = myTableAppend, - targetDialect = targetDialect) - - setingsServer('settings', - plpResult) - - cutoffServer('cutoff', - plpResult) - - discriminationServer('discrimination', - plpResult) - - calibrationServer('calibration', - plpResult) - - nbServer('netBenefit', - plpResult) - - validationServer('validation', - result, - validation, - plpResult = plpResult, - inputType = inputType, - useDatabase = useDatabase, - summaryTable = summaryTable, - resultRow = resultRow, - con = con, - mySchema = 
mySchema, - connectionDetails = connectionDetails, - myTableAppend = myTableAppend, - targetDialect = targetDialect) - - - downloadServer('download') - #======================= - # get researcher info - #======================= - output$researcherInfo <- shiny::renderTable(plpResult()$researcherInfo) - - # HELPER INFO - shiny::observeEvent(input$DescriptionInfo, { - showInfoBox("Description", "html/Description.html") - }) - shiny::observeEvent(input$SummaryInfo, { - showInfoBox("Summary", "html/Summary.html") - }) - shiny::observeEvent(input$PerformanceInfo, { - showInfoBox("Performance", "html/Performance.html") - }) - shiny::observeEvent(input$ModelInfo, { - showInfoBox("Model", "html/Model.html") - }) - shiny::observeEvent(input$LogInfo, { - showInfoBox("Log", "html/Log.html") - }) - shiny::observeEvent(input$SettingsInfo, { - showInfoBox("Settings", "html/Settings.html") - }) - shiny::observeEvent(input$DataInfoInfo, { - showInfoBox("DataInfo", "html/DataInfo.html") - }) - shiny::observeEvent(input$HelpInfo, { - showInfoBox("HelpInfo", "html/Help.html") - }) - - -}) diff --git a/inst/shiny/PLPViewer/ui.R b/inst/shiny/PLPViewer/ui.R deleted file mode 100644 index ebee9550f..000000000 --- a/inst/shiny/PLPViewer/ui.R +++ /dev/null @@ -1,163 +0,0 @@ -# @file Ui.R -# -# Copyright 2021 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -source("modules/summaryTable.R") -source("modules/covariateSummary.R") -source("modules/settings.R") -source("modules/cutoff.R") -source("modules/discrimination.R") -source("modules/calibration.R") -source("modules/netBenefit.R") -source("modules/validation.R") -source("modules/download.R") - -addInfo <- function(item, infoId) { - infoTag <- tags$small( - class = "badge pull-right action-button", - style = "padding: 1px 6px 2px 6px; background-color: steelblue;", - type = "button", - id = infoId, - "i" - ) - item$children[[1]]$children <- append(item$children[[1]]$children, list(infoTag)) - return(item) -} - -ui <- shinydashboard::dashboardPage( - skin = 'black', - - shinydashboard::dashboardHeader( - title = "PLP Viewer", - tags$li( - div( - img( - src = 'logo.png', - title = "OHDSI PLP", - height = "40px", - width = "40px"), - style = "padding-top:0px; padding-bottom:0px;" - ), - class = "dropdown" - ) - ), - - shinydashboard::dashboardSidebar( - shinydashboard::sidebarMenuOutput("sidebarMenu") - ), # end sidebar - - shinydashboard::dashboardBody( - shinydashboard::tabItems( - - # help tab - shinydashboard::tabItem( - tabName = "Help", - shiny::h2("Information"), - shiny::p("Click on a row to explore the results for that model. 
When you wish to explore a different model, then select the new result row and the tabs will be updated."), - shiny::a("Demo Video", href = 'https://youtu.be/StpV40yl1UE', target='_blank') - ), - - # First tab content - shinydashboard::tabItem( - tabName = "Description", - shiny::includeMarkdown(path = pathToMd) - ), - shinydashboard::tabItem( - tabName = "DataInfo", - shiny::includeMarkdown(path = "./www/dataInfo.md") - ), - shinydashboard::tabItem( - tabName = "Summary", - # do this inside tabs: - shiny::tabsetPanel( - id = 'allView', - shiny::tabPanel( - "All Models Summary", - summaryViewer('sumTab') - ), - - shiny::tabPanel( - "Explore Selected Model", - - shiny::tabsetPanel( - id = 'singleView', - shiny::tabPanel( - "Development Settings", - settingsViewer('settings') - ), - - shiny::tabPanel( - "Model", - covariateSummaryViewer('covariateSummary') - ), - - shiny::tabPanel( - "Threshold Dependant", - cutoffViewer('cutoff') - ), - - shiny::tabPanel( - "Discrimination", - discriminationViewer('discrimination') - ), - - shiny::tabPanel( - "Calibration", - calibrationViewer('calibration') - ), - - shiny::tabPanel( - "Net Benefit", - nbViewer('netBenefit') - ), - - shiny::tabPanel( - "Validation", - validationViewer('validation') - ), - - shiny::tabPanel( - "Developer Info", - shinydashboard::box(status = 'info', - title = "Developer Info", - solidHeader = TRUE, - side = "right", - shiny::tableOutput('researcherInfo') - ) - ), - - shiny::tabPanel( - "Download Model", - downloadViewer('download') - ) - - ) - ) - - - - - ) - ) - - - - - - ) - ) -) diff --git a/inst/shiny/PLPViewer/www/about.png b/inst/shiny/PLPViewer/www/about.png deleted file mode 100644 index 878a509e2..000000000 Binary files a/inst/shiny/PLPViewer/www/about.png and /dev/null differ diff --git a/inst/shiny/PLPViewer/www/custom.css b/inst/shiny/PLPViewer/www/custom.css deleted file mode 100644 index 26730bbd9..000000000 --- a/inst/shiny/PLPViewer/www/custom.css +++ /dev/null @@ -1,81 +0,0 @@ 
-.plotly.html-widget.html-widget-output.shiny-bound-output.js-plotly-plot { - z-index: 22; - position: relative; -} - -.plotlybars { - padding: 0 10px; - vertical-align: bottom; - width: 100%; - height: 100%; - overflow: hidden; - position: relative; - box-sizing: border-box; -} - -.plotlybars-wrapper { - width: 165px; - height: 100px; - margin: 0 auto; - left: 0; - right: 0; - position: absolute; - z-index: 1; -} - -.plotlybars-text { - color: #447adb; - font-family: 'Open Sans', verdana, arial, sans-serif; - font-size: 80%; - text-align: center; - margin-top: 5px; -} - -.plotlybars-bar { - background-color: #447adb; - height: 100%; - width: 13.3%; - position: absolute; - - -webkit-transform: translateZ(0); - transform: translateZ(0); - - animation-duration: 2s; - animation-iteration-count: infinite; - animation-direction: normal; - animation-timing-function: linear; - - -webkit-animation-duration: 2s; - -webkit-animation-iteration-count: infinite; - -webkit-animation-direction: normal; - -webkit-animation-timing-function: linear; -} - -.b1 { left: 0%; top: 88%; animation-name: b1; -webkit-animation-name: b1; } -.b2 { left: 14.3%; top: 76%; animation-name: b2; -webkit-animation-name: b2; } -.b3 { left: 28.6%; top: 16%; animation-name: b3; -webkit-animation-name: b3; } -.b4 { left: 42.9%; top: 40%; animation-name: b4; -webkit-animation-name: b4; } -.b5 { left: 57.2%; top: 26%; animation-name: b5; -webkit-animation-name: b5; } -.b6 { left: 71.5%; top: 67%; animation-name: b6; -webkit-animation-name: b6; } -.b7 { left: 85.8%; top: 89%; animation-name: b7; -webkit-animation-name: b7; } - -@keyframes b1 { 0% { top: 88%; } 44% { top: 0%; } 94% { top: 100%; } 100% { top: 88%; } } -@-webkit-keyframes b1 { 0% { top: 88%; } 44% { top: 0%; } 94% { top: 100%; } 100% { top: 88%; } } - -@keyframes b2 { 0% { top: 76%; } 38% { top: 0%; } 88% { top: 100%; } 100% { top: 76%; } } -@-webkit-keyframes b2 { 0% { top: 76%; } 38% { top: 0%; } 88% { top: 100%; } 100% { top: 76%; } } - 
-@keyframes b3 { 0% { top: 16%; } 8% { top: 0%; } 58% { top: 100%; } 100% { top: 16%; } } -@-webkit-keyframes b3 { 0% { top: 16%; } 8% { top: 0%; } 58% { top: 100%; } 100% { top: 16%; } } - -@keyframes b4 { 0% { top: 40%; } 20% { top: 0%; } 70% { top: 100%; } 100% { top: 40%; } } -@-webkit-keyframes b4 { 0% { top: 40%; } 20% { top: 0%; } 70% { top: 100%; } 100% { top: 40%; } } - -@keyframes b5 { 0% { top: 26%; } 13% { top: 0%; } 63% { top: 100%; } 100% { top: 26%; } } -@-webkit-keyframes b5 { 0% { top: 26%; } 13% { top: 0%; } 63% { top: 100%; } 100% { top: 26%; } } - -@keyframes b6 { 0% { top: 67%; } 33.5% { top: 0%; } 83% { top: 100%; } 100% { top: 67%; } } -@-webkit-keyframes b6 { 0% { top: 67%; } 33.5% { top: 0%; } 83% { top: 100%; } 100% { top: 67%; } } - -@keyframes b7 { 0% { top: 89%; } 44.5% { top: 0%; } 94.5% { top: 100%; } 100% { top: 89%; } } -@-webkit-keyframes b7 { 0% { top: 89%; } 44.5% { top: 0%; } 94.5% { top: 100%; } 100% { top: 89%; } } diff --git a/inst/shiny/PLPViewer/www/dataInfo.md b/inst/shiny/PLPViewer/www/dataInfo.md deleted file mode 100644 index aacec89c0..000000000 --- a/inst/shiny/PLPViewer/www/dataInfo.md +++ /dev/null @@ -1,26 +0,0 @@ -# Data Information - -# - -The following databases were used in this study: - - - -| Database | Name | Country | Type | Years -|--------|----------|-----|-----|-----| -| -*OptumDoD* | Optum® De-Identified Clinformatic Data Mart Database – Date of Death (DOD)     | USA     | Claims     | 2000-2019 | -| -*CCAE*     | IBM MarketScan® Commercial Database     | USA     | Claims | 2000-2019 | -| -*MDCD*     | IBM MarketScan® Multi-State Medicaid Database     | USA     | Claims | 2006-2019 | -| -*MDCR*     | IBM MarketScan® Medicare Supplemental Database     | USA     | Claims | 2000-2019 | -| -*JMDC*     | Japan Medical Data Center     | Japan     | Claims     | 2000-2019 | -| -*optumEhr*     | Optum® de-identified Electronic Health Record Dataset     | USA | EHR     | 2006-2019 | -| - - -All databases 
obtained IRB approval or used deidentified data that was considered exempt from IRB approval. diff --git a/inst/shiny/PLPViewer/www/favicon.ico b/inst/shiny/PLPViewer/www/favicon.ico deleted file mode 100644 index 849a1fa44..000000000 Binary files a/inst/shiny/PLPViewer/www/favicon.ico and /dev/null differ diff --git a/inst/shiny/PLPViewer/www/libraryDescription.md b/inst/shiny/PLPViewer/www/libraryDescription.md deleted file mode 100644 index e9f4976e0..000000000 --- a/inst/shiny/PLPViewer/www/libraryDescription.md +++ /dev/null @@ -1,21 +0,0 @@ -### OHDSI PatientLevelPrediction Model Repository - -This shiny application is an interface for viewing, uploading and downloading healthcare prediction models. - -Our aim is to improve the reproducibility and transparency of prediction models. Reproducibiliy is improved by the ability to access code that can be run against any OMOP common data model data to implement and evaluate any model in this repository. Transparency is improved by providing all the details for the model development and performance within the app. This will help us decide which models to trust by examining extensive external validation results. - -### How to use? - -Click on the 'Library' option in the left-hand menu to explore the models in the repository. You can select a model to explore further by selecting the corresponding row in the library table. This will then populate the settings/performance/model views. - -### How to download a model? - -Click on the 'download development' to download an R package that can be executed on any OMOP common data model data to replicate the model development process in your data. Click the 'download validation' to download an R package that can be executed on any OMOP common data model data to validate the choosen model in your data. You can then upload the new model or validation results to the repo. - -### How to upload results? 
- -Click the 'upload' left-hand menu option, fill out the form and click submit. The results will be reviewed and you will recieve and email once the results are in the repository. - -### Questions? - -Please send any questions to: ...@...com diff --git a/inst/shiny/PLPViewer/www/logo.png b/inst/shiny/PLPViewer/www/logo.png deleted file mode 100644 index c6307af6b..000000000 Binary files a/inst/shiny/PLPViewer/www/logo.png and /dev/null differ diff --git a/inst/shiny/PLPViewer/www/shinyAbout.md b/inst/shiny/PLPViewer/www/shinyAbout.md deleted file mode 100644 index 9d7af1821..000000000 --- a/inst/shiny/PLPViewer/www/shinyAbout.md +++ /dev/null @@ -1,24 +0,0 @@ -Introduction: - -Development Status: - -Below is the abstract of the manuscript that summarizes the findings: - -Background: - -Methods: - -Results: - -Discussion: - - -Below are links for study-related artifacts that have been made available as part of this study: - -**Protocol:** - -### Packages ### - -- OHDSI model development: -- OHDSI model validation: -- Existing model validation: diff --git a/inst/shiny/PLPViewer/www/shinyDescription.md b/inst/shiny/PLPViewer/www/shinyDescription.md deleted file mode 100644 index 40ea11e43..000000000 --- a/inst/shiny/PLPViewer/www/shinyDescription.md +++ /dev/null @@ -1,49 +0,0 @@ -### PLP Study Title - -**Development Status: Under Development** - - - -### Information - - - -This shiny application contains the results of [add information about the study]. - -During manuscript development and the subsequent review period, these results are considered under embargo and should not be disclosed without explicit permission and consent from the authors. - - - -Below are links for study-related artifacts that have been made available as part of this study: - - - -**Protocol:** [link](https://...) 
- - - -### Abstract - - -Below is the abstract of the manuscript that summarizes the findings: - - - -**Background:** - - -**Methods:** - -**Findings:** - - -**Interpretation:** - - - -### Study Packages - -- Model validation: [link](https://github.com/ohdsi-studies/...) - - - diff --git a/inst/shinyConfig.json b/inst/shinyConfig.json new file mode 100644 index 000000000..f2a674932 --- /dev/null +++ b/inst/shinyConfig.json @@ -0,0 +1,32 @@ +{ + "shinyModules": [ + { + "id": "about", + "tabName": "About", + "tabText": "About", + "shinyModulePackage": "OhdsiShinyModules", + "uiFunction": "aboutViewer", + "serverFunction": "aboutServer", + "databaseConnectionKeyService": null, + "databaseConnectionKeyUsername": null, + "infoBoxFile": "aboutHelperFile()", + "icon": "info", + "keyring": true, + "order": 1 + }, + { + "id": "prediction", + "tabName": "Prediction", + "tabText": "Prediction", + "shinyModulePackage": "OhdsiShinyModules", + "uiFunction": "predictionViewer", + "serverFunction": "predictionServer", + "databaseConnectionKeyService": "resultDatabaseDetails", + "databaseConnectionKeyUsername": "prediction", + "infoBoxFile": "predictionHelperFile()", + "icon": "chart-line", + "keyring": false, + "order": 2 + } + ] +} diff --git a/inst/shinyConfigUpdate.json b/inst/shinyConfigUpdate.json new file mode 100644 index 000000000..95756dc68 --- /dev/null +++ b/inst/shinyConfigUpdate.json @@ -0,0 +1,26 @@ +{ + "shinyModules": [ + { + "id": "about", + "tabName": "About", + "tabText": "About", + "shinyModulePackage": "OhdsiShinyModules", + "uiFunction": "aboutViewer", + "serverFunction": "aboutServer", + "infoBoxFile": "aboutHelperFile()", + "icon": "info", + "order": 1 + }, + { + "id": "prediction", + "tabName": "Prediction", + "tabText": "Prediction", + "shinyModulePackage": "OhdsiShinyModules", + "uiFunction": "patientLevelPredictionViewer", + "serverFunction": "patientLevelPredictionServer", + "infoBoxFile": "patientLevelPredictionHelperFile()", + "icon": "chart-line", + 
"order": 2 + } + ] +} diff --git a/inst/sql/postgresql/Migration_1-store_version.sql b/inst/sql/postgresql/Migration_1-store_version.sql new file mode 100644 index 000000000..bd4ab3a61 --- /dev/null +++ b/inst/sql/postgresql/Migration_1-store_version.sql @@ -0,0 +1,16 @@ +-- Database migrations for verion 6.0.10 +-- This migration updates the schema: + -- 1. to store the patient level prediction version + -- 2. Add a migrations table for supporting database migrations + +{DEFAULT @package_version = package_version} +{DEFAULT @migration = migration} +{DEFAULT @table_prefix = ''} + +-- Create table indicating version number of ddl +DROP TABLE IF EXISTS @database_schema.@table_prefix@package_version; + +--HINT DISTRIBUTE ON RANDOM +CREATE TABLE @database_schema.@table_prefix@package_version ( + version_number VARCHAR(50) PRIMARY KEY +); \ No newline at end of file diff --git a/inst/sql/sql_server/CreateCohorts.sql b/inst/sql/sql_server/CreateCohorts.sql index 141d9da92..20279339f 100644 --- a/inst/sql/sql_server/CreateCohorts.sql +++ b/inst/sql/sql_server/CreateCohorts.sql @@ -17,7 +17,7 @@ limitations under the License. {DEFAULT @cohort_database_schema = 'CDM_SIM' } {DEFAULT @cohort_table = 'drug_era' } {DEFAULT @cdm_version = '5'} -{DEFAULT @cohort_id = '' } +{DEFAULT @target_id = '' } {DEFAULT @study_start_date = '' } {DEFAULT @study_end_date = '' } {DEFAULT @first_only = FALSE} @@ -77,15 +77,15 @@ FROM ( -- first_only } FROM ( -- raw_cohorts SELECT subject_id, - @cohort_id AS cohort_definition_id, + @target_id AS cohort_definition_id, cohort_start_date, cohort_end_date FROM @cohort_database_schema.@cohort_table cohort_table {@cdm_version == "4"} ? { - WHERE cohort_concept_id IN (@cohort_id) + WHERE cohort_concept_id IN (@target_id) } : { - WHERE cohort_definition_id IN (@cohort_id) + WHERE cohort_definition_id IN (@target_id) } ) raw_cohorts {@first_only} ? 
{ diff --git a/inst/sql/sql_server/GetCohorts.sql b/inst/sql/sql_server/GetCohorts.sql index e9189b2b1..54d4423ea 100644 --- a/inst/sql/sql_server/GetCohorts.sql +++ b/inst/sql/sql_server/GetCohorts.sql @@ -3,9 +3,9 @@ SELECT cast(row_id as int) row_id, subject_id, {@cdm_version == "4"} ? { - cohort_concept_id AS cohort_id, + cohort_concept_id AS target_id, } : { - cohort_definition_id AS cohort_id, + cohort_definition_id AS target_id, } cohort_start_date, days_from_obs_start, diff --git a/inst/sql/sql_server/PlpResultTables.sql b/inst/sql/sql_server/PlpResultTables.sql index 11597b82d..feb5cf2ed 100644 --- a/inst/sql/sql_server/PlpResultTables.sql +++ b/inst/sql/sql_server/PlpResultTables.sql @@ -1,31 +1,39 @@ --- do we need this -CREATE TABLE @my_schema.@string_to_appendstudies ( - study_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, - study_name varchar(100), - study_description varchar(1000) +-- this links the PLP cohort_definition_id to the COHORT_DEFINITION +CREATE TABLE @my_schema.@string_to_appendcohorts ( + cohort_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, -- + cohort_definition_id int NOT NULL, -- the atlas id check type + cohort_name VARCHAR(MAX) NOT NULL ); -CREATE TABLE @my_schema.@string_to_appendcohorts ( - cohort_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, - atlas_id bigint, - cohort_name char(100) NOT NULL, - cohort_json VARCHAR(MAX) NOT NULL +-- NEW - needs to match cohort generator COHORT_DEFINITION +CREATE TABLE @my_schema.@string_to_appendCOHORT_DEFINITION ( + cohort_definition_id int, -- check type + cohort_name VARCHAR(MAX) NOT NULL, + description VARCHAR(MAX), + json VARCHAR(MAX), + sql_command VARCHAR(MAX) ); -CREATE TABLE @my_schema.@string_to_appendresearchers ( - researcher_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, - researcher_name char(100) NOT NULL, - researcher_email char(100) NOT NULL, - researcher_affiliation char(250) NOT NULL +-- link the database_id in the results with the 
database_meta_data_id +CREATE TABLE @my_schema.@string_to_appenddatabase_details ( -- DATABASE_META_DATA + database_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, + database_meta_data_id varchar(MAX) -- databaseId strategus ); -CREATE TABLE @my_schema.@string_to_appenddatabase_details ( - database_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, - database_name char(100) NOT NULL, - database_acronym char(20) NOT NULL, - database_version int NOT NULL, - database_description char(1000) NOT NULL, - database_type char(20) NOT NULL +-- NEW - needs to match stragegus DATABASE_META_DATA +CREATE TABLE @my_schema.@string_to_appendDATABASE_META_DATA ( + database_id varchar(MAX) PRIMARY KEY, + cdm_source_name varchar(MAX) NOT NULL, + cdm_source_abbreviation varchar(MAX) NOT NULL, + CDM_HOLDER varchar(MAX), + SOURCE_DESCRIPTION varchar(MAX), + SOURCE_DOCUMENTATION_REFERENCE varchar(MAX), + CDM_ETL_REFERENCE varchar(MAX), + SOURCE_RELEASE_DATE varchar(MAX), -- not date due to sqlite and consistency + CDM_RELEASE_DATE varchar(MAX), -- not date due to sqlite and consistency + CDM_VERSION varchar(MAX), + VOCABULARY_VERSION varchar(MAX), + MAX_OBS_PERIOD_END_DATE varchar(MAX) -- not date due to sqlite and consistency ); CREATE TABLE @my_schema.@string_to_appendtars ( @@ -91,11 +99,9 @@ CREATE TABLE @my_schema.@string_to_appendmodel_designs ( split_setting_id int NOT NULL, -- new feature_engineering_setting_id int NOT NULL, -- new tidy_covariates_setting_id int NOT NULL, -- new - researcher_id int NOT NULL, FOREIGN KEY (target_id) REFERENCES @my_schema.@string_to_appendcohorts(cohort_id), FOREIGN KEY (outcome_id) REFERENCES @my_schema.@string_to_appendcohorts(cohort_id), - FOREIGN KEY (researcher_id) REFERENCES @my_schema.@string_to_appendresearchers(researcher_id), - FOREIGN KEY (tar_id) REFERENCES @my_schema.@string_to_appendtars(tar_id), + FOREIGN KEY (tar_id) REFERENCES @my_schema.@string_to_appendtars(tar_id), FOREIGN KEY (population_setting_id) REFERENCES 
@my_schema.@string_to_appendpopulation_settings(population_setting_id), FOREIGN KEY (model_setting_id) REFERENCES @my_schema.@string_to_appendmodel_settings(model_setting_id), FOREIGN KEY (covariate_setting_id) REFERENCES @my_schema.@string_to_appendcovariate_settings(covariate_setting_id), @@ -106,45 +112,72 @@ CREATE TABLE @my_schema.@string_to_appendmodel_designs ( FOREIGN KEY (tidy_covariates_setting_id) REFERENCES @my_schema.@string_to_appendtidy_covariates_settings(tidy_covariates_setting_id) -- new ); --- diagnostics holder (will add more tables) +-- diagnostics holder CREATE TABLE @my_schema.@string_to_appenddiagnostics( diagnostic_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, model_design_id int, - researcher_id int, database_id int NOT NULL, - execution_date_time DATETIME2, - FOREIGN KEY (researcher_id) REFERENCES @my_schema.@string_to_appendresearchers(researcher_id), + execution_date_time VARCHAR(100), FOREIGN KEY (model_design_id) REFERENCES @my_schema.@string_to_appendmodel_designs(model_design_id), FOREIGN KEY (database_id) REFERENCES @my_schema.@string_to_appenddatabase_details(database_id) ); - +CREATE TABLE @my_schema.@string_to_appenddiagnostic_summary( + diagnostic_id int NOT NULL, + probast_id varchar(50), + result_value varchar(50), + FOREIGN KEY (diagnostic_id) REFERENCES @my_schema.@string_to_appenddiagnostics(diagnostic_id) +); +CREATE TABLE @my_schema.@string_to_appenddiagnostic_predictors( -- call this kmplot + diagnostic_id int NOT NULL, + days_to_event int, + outcome_at_time int, + observed_at_start_of_day bigint, + probast_id varchar(50), + input_type varchar(50), + FOREIGN KEY (diagnostic_id) REFERENCES @my_schema.@string_to_appenddiagnostics(diagnostic_id) +); +CREATE TABLE @my_schema.@string_to_appenddiagnostic_participants( + diagnostic_id int NOT NULL, + design varchar(50), + metric varchar(50), + value float, + probast_id varchar(50), + FOREIGN KEY (diagnostic_id) REFERENCES 
@my_schema.@string_to_appenddiagnostics(diagnostic_id) +); +CREATE TABLE @my_schema.@string_to_appenddiagnostic_outcomes( + diagnostic_id int NOT NULL, + xvalue int, + outcome_percent float, + aggregation varchar(50), + probast_id varchar(50), + input_type varchar(50), + FOREIGN KEY (diagnostic_id) REFERENCES @my_schema.@string_to_appenddiagnostics(diagnostic_id) +); +CREATE TABLE @my_schema.@string_to_appenddiagnostic_designs( + diagnostic_id int NOT NULL, + probast_id varchar(50), + value varchar(50), + FOREIGN KEY (diagnostic_id) REFERENCES @my_schema.@string_to_appenddiagnostics(diagnostic_id) +); +-- end diagnostics -- results CREATE TABLE @my_schema.@string_to_appendmodels( model_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, - researcher_id int, analysis_id varchar(50), model_design_id int, database_id int NOT NULL, - hyper_param_search VARCHAR(MAX), -- new this contains the hyperparameter performances - plp_model_file char(50) NOT NULL, - execution_date_time DATETIME2, + model_type VARCHAR(50), + plp_model_file text NOT NULL, -- reference to saved model location + train_details VARCHAR(MAX), -- new this contains all the trainDetails + preprocessing VARCHAR(MAX), -- new this contains the preprocessing required + execution_date_time VARCHAR(100), training_time VARCHAR(100), -- previously new intercept float, - require_dense_matrix char(1), -- new - FOREIGN KEY (researcher_id) REFERENCES @my_schema.@string_to_appendresearchers(researcher_id), FOREIGN KEY (model_design_id) REFERENCES @my_schema.@string_to_appendmodel_designs(model_design_id), FOREIGN KEY (database_id) REFERENCES @my_schema.@string_to_appenddatabase_details(database_id) ); --- do we need this? -CREATE TABLE @my_schema.@string_to_appendstudy_models ( - study_id int, - model_id int, - FOREIGN KEY (study_id) REFERENCES @my_schema.@string_to_appendstudies(study_id), - FOREIGN KEY (model_id) REFERENCES @my_schema.@string_to_appendmodels(model_id) -); - -- make this relcaibration specific? 
CREATE TABLE @my_schema.@string_to_appendrecalibrations ( recalibration_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, @@ -156,42 +189,44 @@ CREATE TABLE @my_schema.@string_to_appendrecalibrations ( FOREIGN KEY (recalibrated_model_id) REFERENCES @my_schema.@string_to_appendmodels(model_id) ); -CREATE TABLE @my_schema.@string_to_appendresults ( - result_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, - model_id int NOT NULL, - researcher_id int NOT NULL, - database_id int NOT NULL, +CREATE TABLE @my_schema.@string_to_appendperformances ( + performance_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, + model_design_id int NOT NULL, + development_database_id int NOT NULL, + validation_database_id int NOT NULL, target_id int NOT NULL, outcome_id int NOT NULL, tar_id int NOT NULL, plp_data_setting_id int NOT NULL, -- added population_setting_id int NOT NULL, - execution_date_time DATETIME2, + model_development int NOT NULL, -- added + execution_date_time VARCHAR(100), plp_version char(10), - FOREIGN KEY (model_id) REFERENCES @my_schema.@string_to_appendmodels(model_id), - FOREIGN KEY (researcher_id) REFERENCES @my_schema.@string_to_appendresearchers(researcher_id), - FOREIGN KEY (database_id) REFERENCES @my_schema.@string_to_appenddatabase_details(database_id), + FOREIGN KEY (model_design_id) REFERENCES @my_schema.@string_to_appendmodel_designs(model_design_id), + FOREIGN KEY (development_database_id) REFERENCES @my_schema.@string_to_appenddatabase_details(database_id), + FOREIGN KEY (validation_database_id) REFERENCES @my_schema.@string_to_appenddatabase_details(database_id), FOREIGN KEY (target_id) REFERENCES @my_schema.@string_to_appendcohorts(cohort_id), FOREIGN KEY (outcome_id) REFERENCES @my_schema.@string_to_appendcohorts(cohort_id), FOREIGN KEY (tar_id) REFERENCES @my_schema.@string_to_appendtars(tar_id), FOREIGN KEY (plp_data_setting_id) REFERENCES @my_schema.@string_to_appendplp_data_settings(plp_data_setting_id), -- new FOREIGN 
KEY (population_setting_id) REFERENCES @my_schema.@string_to_appendpopulation_settings(population_setting_id) + ); -- new CREATE TABLE @my_schema.@string_to_appendattrition ( - result_id int NOT NULL, + performance_id int NOT NULL, outcome_id int, description varchar(1000), - target_count int, + target_count int, -- is this still target? unique_people int, outcomes int, - FOREIGN KEY (result_id) REFERENCES @my_schema.@string_to_appendresults(result_id) + FOREIGN KEY (performance_id) REFERENCES @my_schema.@string_to_appendperformances(performance_id) ); CREATE TABLE @my_schema.@string_to_appendprediction_distribution ( --distribution_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, - result_id int NOT NULL, + performance_id int NOT NULL, evaluation char(10), class_label int, person_count int, @@ -204,12 +239,12 @@ CREATE TABLE @my_schema.@string_to_appendprediction_distribution ( p_75_predicted_probability float, p_95_predicted_probability float, max_predicted_probability float, - FOREIGN KEY (result_id) REFERENCES @my_schema.@string_to_appendresults(result_id) + FOREIGN KEY (performance_id) REFERENCES @my_schema.@string_to_appendperformances(performance_id) ); CREATE TABLE @my_schema.@string_to_appendcovariate_summary( --cov_sum_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, - result_id int NOT NULL, + performance_id int NOT NULL, covariate_id bigint NOT NULL, covariate_name VARCHAR(1000) NOT NULL, concept_id int NOT NULL, @@ -224,11 +259,11 @@ CREATE TABLE @my_schema.@string_to_appendcovariate_summary( with_outcome_covariate_mean float NOT NULL, with_outcome_covariate_st_dev float NOT NULL, standardized_mean_diff float NOT NULL, - FOREIGN KEY (result_id) REFERENCES @my_schema.@string_to_appendresults(result_id) + FOREIGN KEY (performance_id) REFERENCES @my_schema.@string_to_appendperformances(performance_id) ); CREATE TABLE @my_schema.@string_to_appendthreshold_summary( --threshold_summary_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY 
KEY, - result_id int NOT NULL, + performance_id int NOT NULL, evaluation char(10), prediction_threshold float, preference_threshold float, @@ -253,12 +288,12 @@ CREATE TABLE @my_schema.@string_to_appendthreshold_summary( positive_likelihood_ratio float, negative_likelihood_ratio float, diagnostic_odds_ratio float, - FOREIGN KEY (result_id) REFERENCES @my_schema.@string_to_appendresults(result_id) + FOREIGN KEY (performance_id) REFERENCES @my_schema.@string_to_appendperformances(performance_id) ); CREATE TABLE @my_schema.@string_to_appendcalibration_summary( --calibration_summary_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, - result_id int NOT NULL, + performance_id int NOT NULL, evaluation char(10), prediction_threshold float, person_count_at_risk int, @@ -271,21 +306,21 @@ CREATE TABLE @my_schema.@string_to_appendcalibration_summary( p_75_predicted_probability float, max_predicted_probability float, observed_incidence float, - FOREIGN KEY (result_id) REFERENCES @my_schema.@string_to_appendresults(result_id) + FOREIGN KEY (performance_id) REFERENCES @my_schema.@string_to_appendperformances(performance_id) ); CREATE TABLE @my_schema.@string_to_appendevaluation_statistics ( --evaluation_stat_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, - result_id int NOT NULL, + performance_id int NOT NULL, evaluation char(10), metric varchar(50), value float, - FOREIGN KEY (result_id) REFERENCES @my_schema.@string_to_appendresults(result_id) + FOREIGN KEY (performance_id) REFERENCES @my_schema.@string_to_appendperformances(performance_id) ); CREATE TABLE @my_schema.@string_to_appenddemographic_summary( --demographic_summary_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, - result_id int NOT NULL, + performance_id int NOT NULL, evaluation char(10), age_group char(20), gen_group char(20), @@ -298,5 +333,6 @@ CREATE TABLE @my_schema.@string_to_appenddemographic_summary( p_50_predicted_probability float, p_75_predicted_probability float, 
max_predicted_probability float, - FOREIGN KEY (result_id) REFERENCES @my_schema.@string_to_appendresults(result_id) + FOREIGN KEY (performance_id) REFERENCES @my_schema.@string_to_appendperformances(performance_id) ); + diff --git a/inst/sql/sql_server/UpdateVersionNumber.sql b/inst/sql/sql_server/UpdateVersionNumber.sql new file mode 100644 index 000000000..216233f88 --- /dev/null +++ b/inst/sql/sql_server/UpdateVersionNumber.sql @@ -0,0 +1,5 @@ +{DEFAULT @package_version = package_version} +{DEFAULT @version_number = '6.0.10'} + +DELETE FROM @database_schema.@table_prefix@package_version; +INSERT INTO @database_schema.@table_prefix@package_version (version_number) VALUES ('@version_number'); diff --git a/inst/sql/sql_server/migrations/Migration_1-store_version.sql b/inst/sql/sql_server/migrations/Migration_1-store_version.sql new file mode 100644 index 000000000..bd4ab3a61 --- /dev/null +++ b/inst/sql/sql_server/migrations/Migration_1-store_version.sql @@ -0,0 +1,16 @@ +-- Database migrations for verion 6.0.10 +-- This migration updates the schema: + -- 1. to store the patient level prediction version + -- 2. 
Add a migrations table for supporting database migrations + +{DEFAULT @package_version = package_version} +{DEFAULT @migration = migration} +{DEFAULT @table_prefix = ''} + +-- Create table indicating version number of ddl +DROP TABLE IF EXISTS @database_schema.@table_prefix@package_version; + +--HINT DISTRIBUTE ON RANDOM +CREATE TABLE @database_schema.@table_prefix@package_version ( + version_number VARCHAR(50) PRIMARY KEY +); \ No newline at end of file diff --git a/inst/sql/sqlite/PlpResultTables.sql b/inst/sql/sqlite/PlpResultTables.sql new file mode 100644 index 000000000..93bd63ac5 --- /dev/null +++ b/inst/sql/sqlite/PlpResultTables.sql @@ -0,0 +1,338 @@ +-- this links the PLP cohort_definition_id to the COHORT_DEFINITION +CREATE TABLE @my_schema.@string_to_appendcohorts ( + cohort_id INTEGER PRIMARY KEY AUTOINCREMENT, -- + cohort_definition_id int NOT NULL, -- the atlas id check type + cohort_name VARCHAR(MAX) NOT NULL +); + +-- NEW - needs to match cohort generator COHORT_DEFINITION +CREATE TABLE @my_schema.@string_to_appendCOHORT_DEFINITION ( + cohort_definition_id int NOT NULL, -- check type + cohort_name VARCHAR(MAX) NOT NULL, + description VARCHAR(MAX), + json VARCHAR(MAX), + sql_command VARCHAR(MAX) +); + +-- link the database_id in the results with the database_meta_data_id +CREATE TABLE @my_schema.@string_to_appenddatabase_details ( -- DATABASE_META_DATA + database_id INTEGER PRIMARY KEY AUTOINCREMENT, + database_meta_data_id varchar(MAX) -- databaseId strategus +); + +-- NEW - needs to match stragegus DATABASE_META_DATA +CREATE TABLE @my_schema.@string_to_appendDATABASE_META_DATA ( + database_id varchar(MAX) PRIMARY KEY, + cdm_source_name varchar(MAX) NOT NULL, + cdm_source_abbreviation varchar(MAX) NOT NULL, + CDM_HOLDER varchar(MAX), + SOURCE_DESCRIPTION varchar(MAX), + SOURCE_DOCUMENTATION_REFERENCE varchar(MAX), + CDM_ETL_REFERENCE varchar(MAX), + SOURCE_RELEASE_DATE varchar(MAX), -- not date due to sqlite and consistency + CDM_RELEASE_DATE 
varchar(MAX), -- not date due to sqlite and consistency + CDM_VERSION varchar(MAX), + VOCABULARY_VERSION varchar(MAX), + MAX_OBS_PERIOD_END_DATE varchar(MAX) -- not date due to sqlite and consistency +); + + +CREATE TABLE @my_schema.@string_to_appendtars ( + tar_id INTEGER PRIMARY KEY AUTOINCREMENT, + tar_start_day int NOT NULL, + tar_start_anchor VARCHAR(20) NOT NULL, + tar_end_day int NOT NULL, + tar_end_anchor VARCHAR(20) NOT NULL +); + +CREATE TABLE @my_schema.@string_to_appendpopulation_settings( + population_setting_id INTEGER PRIMARY KEY AUTOINCREMENT, + population_settings_json VARCHAR(MAX) NOT NULL +); + +CREATE TABLE @my_schema.@string_to_appendcovariate_settings( + covariate_setting_id INTEGER PRIMARY KEY AUTOINCREMENT, + covariate_settings_json VARCHAR(MAX) NOT NULL +); + +CREATE TABLE @my_schema.@string_to_appendmodel_settings( + model_setting_id INTEGER PRIMARY KEY AUTOINCREMENT, + model_type VARCHAR(50), + model_settings_json VARCHAR(MAX) +); + +CREATE TABLE @my_schema.@string_to_appendsplit_settings( -- was training_settings + split_setting_id INTEGER PRIMARY KEY AUTOINCREMENT, + split_settings_json VARCHAR(MAX) +); + +CREATE TABLE @my_schema.@string_to_appendplp_data_settings( -- new + plp_data_setting_id INTEGER PRIMARY KEY AUTOINCREMENT, + plp_data_settings_json VARCHAR(MAX) +); + +CREATE TABLE @my_schema.@string_to_appendfeature_engineering_settings( -- new + feature_engineering_setting_id INTEGER PRIMARY KEY AUTOINCREMENT, + feature_engineering_settings_json VARCHAR(MAX) +); + +CREATE TABLE @my_schema.@string_to_appendtidy_covariates_settings( -- new + tidy_covariates_setting_id INTEGER PRIMARY KEY AUTOINCREMENT, + tidy_covariates_settings_json VARCHAR(MAX) +); + +CREATE TABLE @my_schema.@string_to_appendsample_settings( -- new + sample_setting_id INTEGER PRIMARY KEY AUTOINCREMENT, + sample_settings_json VARCHAR(MAX) +); + +CREATE TABLE @my_schema.@string_to_appendmodel_designs ( + model_design_id INTEGER PRIMARY KEY AUTOINCREMENT, + 
--model_name CHAR(50) NOT NULL, + target_id int NOT NULL, + outcome_id int NOT NULL, + tar_id int NOT NULL, + plp_data_setting_id int NOT NULL, -- new + population_setting_id int NOT NULL, + model_setting_id int NOT NULL, + covariate_setting_id int NOT NULL, + sample_setting_id int NOT NULL, -- new + split_setting_id int NOT NULL, -- new + feature_engineering_setting_id int NOT NULL, -- new + tidy_covariates_setting_id int NOT NULL, -- new + FOREIGN KEY (target_id) REFERENCES @string_to_appendcohorts(cohort_id), + FOREIGN KEY (outcome_id) REFERENCES @string_to_appendcohorts(cohort_id), + FOREIGN KEY (tar_id) REFERENCES @string_to_appendtars(tar_id), + FOREIGN KEY (population_setting_id) REFERENCES @string_to_appendpopulation_settings(population_setting_id), + FOREIGN KEY (model_setting_id) REFERENCES @string_to_appendmodel_settings(model_setting_id), + FOREIGN KEY (covariate_setting_id) REFERENCES @string_to_appendcovariate_settings(covariate_setting_id), + FOREIGN KEY (sample_setting_id) REFERENCES @string_to_appendsample_settings(sample_setting_id), -- new + FOREIGN KEY (split_setting_id) REFERENCES @string_to_appendsplit_settings(split_setting_id), -- new + FOREIGN KEY (plp_data_setting_id) REFERENCES @string_to_appendplp_data_settings(plp_data_setting_id), -- new + FOREIGN KEY (feature_engineering_setting_id) REFERENCES @string_to_appendfeature_engineering_settings(feature_engineering_setting_id), -- new + FOREIGN KEY (tidy_covariates_setting_id) REFERENCES @string_to_appendtidy_covariates_settings(tidy_covariates_setting_id) -- new +); + +-- diagnostics holder +CREATE TABLE @my_schema.@string_to_appenddiagnostics( + diagnostic_id INTEGER PRIMARY KEY AUTOINCREMENT, + model_design_id int, + database_id int NOT NULL, + execution_date_time VARCHAR(100), + FOREIGN KEY (model_design_id) REFERENCES @string_to_appendmodel_designs(model_design_id), + FOREIGN KEY (database_id) REFERENCES @string_to_appenddatabase_details(database_id) +); +CREATE TABLE 
@my_schema.@string_to_appenddiagnostic_summary( + diagnostic_id int NOT NULL, + probast_id varchar(50), + result_value varchar(50), + FOREIGN KEY (diagnostic_id) REFERENCES @string_to_appenddiagnostics(diagnostic_id) +); +CREATE TABLE @my_schema.@string_to_appenddiagnostic_predictors( -- call this kmplot + diagnostic_id int NOT NULL, + days_to_event int, + outcome_at_time int, + observed_at_start_of_day bigint, + probast_id varchar(50), + input_type varchar(50), + FOREIGN KEY (diagnostic_id) REFERENCES @string_to_appenddiagnostics(diagnostic_id) +); +CREATE TABLE @my_schema.@string_to_appenddiagnostic_participants( + diagnostic_id int NOT NULL, + design varchar(50), + metric varchar(50), + value float, + probast_id varchar(50), + FOREIGN KEY (diagnostic_id) REFERENCES @string_to_appenddiagnostics(diagnostic_id) +); +CREATE TABLE @my_schema.@string_to_appenddiagnostic_outcomes( + diagnostic_id int NOT NULL, + xvalue int, + outcome_percent float, + aggregation varchar(50), + probast_id varchar(50), + input_type varchar(50), + FOREIGN KEY (diagnostic_id) REFERENCES @string_to_appenddiagnostics(diagnostic_id) +); +CREATE TABLE @my_schema.@string_to_appenddiagnostic_designs( + diagnostic_id int NOT NULL, + probast_id varchar(50), + value varchar(50), + FOREIGN KEY (diagnostic_id) REFERENCES @string_to_appenddiagnostics(diagnostic_id) +); +-- end diagnostics + +-- results +CREATE TABLE @my_schema.@string_to_appendmodels( + model_id INTEGER PRIMARY KEY AUTOINCREMENT, + analysis_id varchar(50), + model_design_id int, + database_id int NOT NULL, + model_type VARCHAR(50), + plp_model_file VARCHAR(MAX) NOT NULL, -- reference to saved model location + train_details VARCHAR(MAX), -- new this contains all the trainDetails + preprocessing VARCHAR(MAX), -- new this contains the preprocessing required + execution_date_time VARCHAR(100), + training_time VARCHAR(100), -- previously new + intercept float, + FOREIGN KEY (model_design_id) REFERENCES 
@string_to_appendmodel_designs(model_design_id), + FOREIGN KEY (database_id) REFERENCES @string_to_appenddatabase_details(database_id) +); + +-- make this relcaibration specific? +CREATE TABLE @my_schema.@string_to_appendrecalibrations ( + recalibration_id INTEGER PRIMARY KEY AUTOINCREMENT, + original_model_id int NOT NULL, + recalibrated_model_id int NOT NULL, + recalibration_type varchar(15), + recalibration_json varchar(MAX), + FOREIGN KEY (original_model_id) REFERENCES @string_to_appendmodels(model_id), + FOREIGN KEY (recalibrated_model_id) REFERENCES @string_to_appendmodels(model_id) +); + +CREATE TABLE @my_schema.@string_to_appendperformances ( + performance_id INTEGER PRIMARY KEY AUTOINCREMENT, + model_design_id int NOT NULL, + development_database_id int NOT NULL, + validation_database_id int NOT NULL, + target_id int NOT NULL, + outcome_id int NOT NULL, + tar_id int NOT NULL, + plp_data_setting_id int NOT NULL, -- added + population_setting_id int NOT NULL, + model_development int NOT NULL, -- added + execution_date_time VARCHAR(100), + plp_version char(10), + FOREIGN KEY (model_design_id) REFERENCES @string_to_appendmodels_designs(model_design_id), + FOREIGN KEY (development_database_id) REFERENCES @string_to_appenddatabase_details(database_id), + FOREIGN KEY (validation_database_id) REFERENCES @string_to_appenddatabase_details(database_id), + FOREIGN KEY (target_id) REFERENCES @string_to_appendcohorts(cohort_id), + FOREIGN KEY (outcome_id) REFERENCES @string_to_appendcohorts(cohort_id), + FOREIGN KEY (tar_id) REFERENCES @string_to_appendtars(tar_id), + FOREIGN KEY (plp_data_setting_id) REFERENCES @string_to_appendplp_data_settings(plp_data_setting_id), -- new + FOREIGN KEY (population_setting_id) REFERENCES @string_to_appendpopulation_settings(population_setting_id) +); + +-- new +CREATE TABLE @my_schema.@string_to_appendattrition ( + performance_id int NOT NULL, + outcome_id int, + description varchar(1000), + target_count int, -- is this still target? 
+ unique_people int, + outcomes int, + FOREIGN KEY (performance_id) REFERENCES @string_to_appendperformances(performance_id) +); + +CREATE TABLE @my_schema.@string_to_appendprediction_distribution ( + --distribution_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, + performance_id int NOT NULL, + evaluation VARCHAR(10), + class_label int, + person_count int, + average_predicted_probability float, + st_dev_predicted_probability float, + min_predicted_probability float, + p_05_predicted_probability float, + p_25_predicted_probability float, + median_predicted_probability float, + p_75_predicted_probability float, + p_95_predicted_probability float, + max_predicted_probability float, + FOREIGN KEY (performance_id) REFERENCES @string_to_appendperformances(performance_id) +); + +CREATE TABLE @my_schema.@string_to_appendcovariate_summary( + --cov_sum_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, + performance_id int NOT NULL, + covariate_id bigint NOT NULL, + covariate_name VARCHAR(1000) NOT NULL, + concept_id int NOT NULL, + covariate_value float NOT NULL, + covariate_count int NOT NULL, + covariate_mean float NOT NULL, + covariate_st_dev float NOT NULL, + with_no_outcome_covariate_count int NOT NULL, + with_no_outcome_covariate_mean float NOT NULL, + with_no_outcome_covariate_st_dev float NOT NULL, + with_outcome_covariate_count int NOT NULL, + with_outcome_covariate_mean float NOT NULL, + with_outcome_covariate_st_dev float NOT NULL, + standardized_mean_diff float NOT NULL, + FOREIGN KEY (performance_id) REFERENCES @string_to_appendperformances(performance_id) +); +CREATE TABLE @my_schema.@string_to_appendthreshold_summary( + --threshold_summary_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, + performance_id int NOT NULL, + evaluation VARCHAR(10), + prediction_threshold float, + preference_threshold float, + positive_count int, + negative_count int, + true_count int, + false_count int, + true_positive_count int, + true_negative_count 
int, + false_positive_count int, + false_negative_count int, + f_1_score float, + accuracy float, + sensitivity float, + false_negative_rate float, + false_positive_rate float, + specificity float, + positive_predictive_value float, + false_discovery_rate float, + negative_predictive_value float, + false_omission_rate float, + positive_likelihood_ratio float, + negative_likelihood_ratio float, + diagnostic_odds_ratio float, + FOREIGN KEY (performance_id) REFERENCES @string_to_appendperformances(performance_id) +); + +CREATE TABLE @my_schema.@string_to_appendcalibration_summary( + --calibration_summary_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, + performance_id int NOT NULL, + evaluation VARCHAR(10), + prediction_threshold float, + person_count_at_risk int, + person_count_with_outcome int, + average_predicted_probability float, + st_dev_predicted_probability float, + min_predicted_probability float, + p_25_predicted_probability float, + median_predicted_probability float, + p_75_predicted_probability float, + max_predicted_probability float, + observed_incidence float, + FOREIGN KEY (performance_id) REFERENCES @string_to_appendperformances(performance_id) +); + +CREATE TABLE @my_schema.@string_to_appendevaluation_statistics ( + --evaluation_stat_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, + performance_id int NOT NULL, + evaluation VARCHAR(10), + metric varchar(50), + value float, + FOREIGN KEY (performance_id) REFERENCES @string_to_appendperformances(performance_id) +); + +CREATE TABLE @my_schema.@string_to_appenddemographic_summary( + --demographic_summary_id int GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY, + performance_id int NOT NULL, + evaluation VARCHAR(10), + age_group VARCHAR(20), + gen_group VARCHAR(20), + person_count_at_risk int, + person_count_with_outcome int, + average_predicted_probability float, + st_dev_predicted_probability float, + min_predicted_probability float, + p_25_predicted_probability float, + 
p_50_predicted_probability float, + p_75_predicted_probability float, + max_predicted_probability float, + FOREIGN KEY (performance_id) REFERENCES @string_to_appendperformances(performance_id) +); + diff --git a/inst/sql/sqlite/migrations/Migration_1-store_version.sql b/inst/sql/sqlite/migrations/Migration_1-store_version.sql new file mode 100644 index 000000000..bd4ab3a61 --- /dev/null +++ b/inst/sql/sqlite/migrations/Migration_1-store_version.sql @@ -0,0 +1,16 @@ +-- Database migrations for verion 6.0.10 +-- This migration updates the schema: + -- 1. to store the patient level prediction version + -- 2. Add a migrations table for supporting database migrations + +{DEFAULT @package_version = package_version} +{DEFAULT @migration = migration} +{DEFAULT @table_prefix = ''} + +-- Create table indicating version number of ddl +DROP TABLE IF EXISTS @database_schema.@table_prefix@package_version; + +--HINT DISTRIBUTE ON RANDOM +CREATE TABLE @database_schema.@table_prefix@package_version ( + version_number VARCHAR(50) PRIMARY KEY +); \ No newline at end of file diff --git a/man/MapIds.Rd b/man/MapIds.Rd new file mode 100644 index 000000000..e10d16ba3 --- /dev/null +++ b/man/MapIds.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Formatting.R +\name{MapIds} +\alias{MapIds} +\title{Map covariate and row Ids so they start from 1} +\usage{ +MapIds(covariateData, cohort = NULL, mapping = NULL) +} +\arguments{ +\item{covariateData}{a covariateData object} + +\item{cohort}{if specified rowIds restricted to the ones in cohort} + +\item{mapping}{A pre defined mapping to use} +} +\description{ +this functions takes covariate data and a cohort/population and remaps +the covariate and row ids, restricts to pop and saves/creates mapping +} diff --git a/man/PatientLevelPrediction.Rd b/man/PatientLevelPrediction.Rd index 560e9be95..8bc15fc71 100644 --- a/man/PatientLevelPrediction.Rd +++ b/man/PatientLevelPrediction.Rd @@ -2,8 +2,32 @@ 
% Please edit documentation in R/PatientLevelPrediction.R \docType{package} \name{PatientLevelPrediction} +\alias{PatientLevelPrediction-package} \alias{PatientLevelPrediction} \title{PatientLevelPrediction} \description{ A package for running predictions using data in the OMOP CDM } +\seealso{ +Useful links: +\itemize{ + \item \url{https://ohdsi.github.io/PatientLevelPrediction} + \item \url{https://github.com/OHDSI/PatientLevelPrediction} + \item Report bugs at \url{https://github.com/OHDSI/PatientLevelPrediction/issues} +} + +} +\author{ +\strong{Maintainer}: Jenna Reps \email{jreps@its.jnj.com} + +Authors: +\itemize{ + \item Martijn Schuemie + \item Marc Suchard + \item Patrick Ryan + \item Peter Rijnbeek + \item Egill Fridgeirsson +} + +} +\keyword{internal} diff --git a/man/addDiagnosePlpToDatabase.Rd b/man/addDiagnosePlpToDatabase.Rd new file mode 100644 index 000000000..483575654 --- /dev/null +++ b/man/addDiagnosePlpToDatabase.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/uploadToDatabaseDiagnostics.R +\name{addDiagnosePlpToDatabase} +\alias{addDiagnosePlpToDatabase} +\title{Insert a diagnostic result into a PLP result schema database} +\usage{ +addDiagnosePlpToDatabase( + diagnosePlp, + connectionDetails, + databaseSchemaSettings, + cohortDefinitions, + databaseList = NULL, + overWriteIfExists = T +) +} +\arguments{ +\item{diagnosePlp}{An object of class \code{diagnosePlp}} + +\item{connectionDetails}{A connection details created by using the +function \code{createConnectionDetails} in the +\code{DatabaseConnector} package.} + +\item{databaseSchemaSettings}{A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables} + +\item{cohortDefinitions}{A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet()} + +\item{databaseList}{(Optional) If you wish to overwrite the settings in the plp object use \code{createdatabaseList} to 
specify the databases} + +\item{overWriteIfExists}{(default: T) Whether to delete existing results and overwrite them} +} +\value{ +Returns NULL but uploads the diagnostic into the database schema specified in databaseSchemaSettings +} +\description{ +This function inserts a diagnostic result into the result schema +} +\details{ +This function can be used to upload a diagnostic result into a database +} diff --git a/man/addMultipleDiagnosePlpToDatabase.Rd b/man/addMultipleDiagnosePlpToDatabase.Rd new file mode 100644 index 000000000..3d679c026 --- /dev/null +++ b/man/addMultipleDiagnosePlpToDatabase.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/uploadToDatabaseDiagnostics.R +\name{addMultipleDiagnosePlpToDatabase} +\alias{addMultipleDiagnosePlpToDatabase} +\title{Insert mutliple diagnosePlp results saved to a directory into a PLP result schema database} +\usage{ +addMultipleDiagnosePlpToDatabase( + connectionDetails, + databaseSchemaSettings, + cohortDefinitions, + databaseList = NULL, + resultLocation +) +} +\arguments{ +\item{connectionDetails}{A connection details created by using the +function \code{createConnectionDetails} in the +\code{DatabaseConnector} package.} + +\item{databaseSchemaSettings}{A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables} + +\item{cohortDefinitions}{(list) A list of cohortDefinitions (each list must contain: name, id)} + +\item{databaseList}{(Optional) ...} + +\item{resultLocation}{The location of the diagnostic results} +} +\value{ +Returns NULL but uploads multiple diagnosePlp results into the database schema specified in databaseSchemaSettings +} +\description{ +This function inserts diagnosePlp results into the result schema +} +\details{ +This function can be used to upload diagnosePlp results into a database +} diff --git a/man/addMultipleRunPlpToDatabase.Rd b/man/addMultipleRunPlpToDatabase.Rd new file mode 100644 
index 000000000..696fc81b3 --- /dev/null +++ b/man/addMultipleRunPlpToDatabase.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/uploadToDatabase.R +\name{addMultipleRunPlpToDatabase} +\alias{addMultipleRunPlpToDatabase} +\title{Populate the PatientLevelPrediction results tables} +\usage{ +addMultipleRunPlpToDatabase( + connectionDetails, + databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = "main"), + cohortDefinitions, + databaseList = NULL, + resultLocation = NULL, + resultLocationVector, + modelSaveLocation +) +} +\arguments{ +\item{connectionDetails}{A connection details created by using the +function \code{createConnectionDetails} in the +\code{DatabaseConnector} package.} + +\item{databaseSchemaSettings}{A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables} + +\item{cohortDefinitions}{A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet()} + +\item{databaseList}{(Optional) A list created by \code{createDatabaseList} to specify the databases} + +\item{resultLocation}{(string) location of directory where the main package results were saved} + +\item{resultLocationVector}{(only used when resultLocation is missing) a vector of locations with development or validation results} + +\item{modelSaveLocation}{The location of the file system for saving the models in a subdirectory} +} +\value{ +Returns NULL but uploads all the results in resultLocation to the PatientLevelPrediction result tables in resultSchema +} +\description{ +This function formats and uploads results that have been generated via an ATLAS prediction package into a database +} +\details{ +This function can be used upload PatientLevelPrediction results into a database +} diff --git a/man/addRunPlpToDatabase.Rd b/man/addRunPlpToDatabase.Rd new file mode 100644 index 000000000..c77dc3eaf --- /dev/null +++ b/man/addRunPlpToDatabase.Rd @@ -0,0 +1,39 @@ +% 
Generated by roxygen2: do not edit by hand +% Please edit documentation in R/uploadToDatabase.R +\name{addRunPlpToDatabase} +\alias{addRunPlpToDatabase} +\title{Function to add the run plp (development or validation) to database} +\usage{ +addRunPlpToDatabase( + runPlp, + connectionDetails, + databaseSchemaSettings, + cohortDefinitions, + modelSaveLocation, + databaseList = NULL +) +} +\arguments{ +\item{runPlp}{An object of class \code{runPlp} or class \code{externalValidatePlp}} + +\item{connectionDetails}{A connection details created by using the +function \code{createConnectionDetails} in the +\code{DatabaseConnector} package.} + +\item{databaseSchemaSettings}{A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables} + +\item{cohortDefinitions}{A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet()} + +\item{modelSaveLocation}{The location of the directory that models will be saved to} + +\item{databaseList}{(Optional) If you want to change the database name then used \code{createDatabaseList} to specify the database settings but use the same cdmDatabaseId was model development/validation} +} +\value{ +Returns a data.frame with the database details +} +\description{ +This function adds a runPlp or external validation result into a database +} +\details{ +This function is used when inserting results into the PatientLevelPrediction database results schema +} diff --git a/man/computeGridPerformance.Rd b/man/computeGridPerformance.Rd new file mode 100644 index 000000000..35a1b9ee2 --- /dev/null +++ b/man/computeGridPerformance.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SklearnClassifier.R +\name{computeGridPerformance} +\alias{computeGridPerformance} +\title{Computes grid performance with a specified performance function} +\usage{ +computeGridPerformance(prediction, param, performanceFunct = "computeAuc") +} +\arguments{ 
+\item{prediction}{a dataframe with predictions and outcomeCount per rowId} + +\item{param}{a list of hyperparameters} + +\item{performanceFunct}{a string specifying which performance function to use +. Default ``'compute_AUC'``} +} +\value{ +A list with overview of the performance +} +\description{ +Computes grid performance with a specified performance function +} diff --git a/man/configurePython.Rd b/man/configurePython.Rd index 9d014fa94..99e8d79aa 100644 --- a/man/configurePython.Rd +++ b/man/configurePython.Rd @@ -4,12 +4,14 @@ \alias{configurePython} \title{Sets up a virtual environment to use for PLP (can be conda or python)} \usage{ -configurePython(envname = "PLP", envtype = NULL) +configurePython(envname = "PLP", envtype = NULL, condaPythonVersion = "3.11") } \arguments{ \item{envname}{A string for the name of the virtual environment (default is 'PLP')} \item{envtype}{An option for specifying the environment as'conda' or 'python'. If NULL then the default is 'conda' for windows users and 'python' for non-windows users} + +\item{condaPythonVersion}{String, Python version to use when creating a conda environment} } \description{ Sets up a virtual environment to use for PLP (can be conda or python) diff --git a/man/createDatabaseDetails.Rd b/man/createDatabaseDetails.Rd index fdcd1ae40..f848843cd 100644 --- a/man/createDatabaseDetails.Rd +++ b/man/createDatabaseDetails.Rd @@ -8,14 +8,16 @@ createDatabaseDetails( connectionDetails, cdmDatabaseSchema, cdmDatabaseName, + cdmDatabaseId, tempEmulationSchema = cdmDatabaseSchema, cohortDatabaseSchema = cdmDatabaseSchema, cohortTable = "cohort", outcomeDatabaseSchema = cdmDatabaseSchema, outcomeTable = "cohort", - cohortId = NULL, + targetId = NULL, outcomeIds = NULL, - cdmVersion = 5 + cdmVersion = 5, + cohortId = NULL ) } \arguments{ @@ -28,7 +30,9 @@ instance. Requires read permissions to this database. 
On SQL Server, this should specifiy both the database and the schema, so for example 'cdm_instance.dbo'.} -\item{cdmDatabaseName}{A string with a shareable name of the database (this will be shown to OHDSI researchers if the results get transported)} +\item{cdmDatabaseName}{A string with the name of the database - this is used in the shiny app and when externally validating models to name the result list and to specify the folder name when saving validation results (defaults to cdmDatabaseSchema if not specified)} + +\item{cdmDatabaseId}{A string with a unique identifier for the database and version - this is stored in the plp object for future reference and used by the shiny app (defaults to cdmDatabaseSchema if not specified)} \item{tempEmulationSchema}{For dmbs like Oracle only: the name of the database schema where you want all temporary tables to be managed. Requires @@ -50,12 +54,13 @@ this database.} outcomeTable has format of COHORT table: COHORT_DEFINITION_ID, SUBJECT_ID, COHORT_START_DATE, COHORT_END_DATE.} -\item{cohortId}{An integer specifying the cohort id for the target cohort} +\item{targetId}{An integer specifying the cohort id for the target cohort} \item{outcomeIds}{A single integer or vector of integers specifying the cohort ids for the outcome cohorts} -\item{cdmVersion}{Define the OMOP CDM version used: currently support "4" and -"5".} +\item{cdmVersion}{Define the OMOP CDM version used: currently support "4" and "5".} + +\item{cohortId}{(depreciated: use targetId) old input for the target cohort id} } \value{ A list with the the database specific settings (this is used by the runMultiplePlp function and the skeleton packages) diff --git a/man/createDatabaseList.Rd b/man/createDatabaseList.Rd new file mode 100644 index 000000000..4e9d5393c --- /dev/null +++ b/man/createDatabaseList.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/uploadToDatabase.R +\name{createDatabaseList} 
+\alias{createDatabaseList} +\title{Create a list with the database details and database meta data entries} +\usage{ +createDatabaseList(cdmDatabaseSchemas, cdmDatabaseNames, databaseRefIds = NULL) +} +\arguments{ +\item{cdmDatabaseSchemas}{(string vector) A vector of the cdmDatabaseSchemas used in the study - if the schemas are not unique per database please also specify databaseRefId} + +\item{cdmDatabaseNames}{Sharable names for the databases} + +\item{databaseRefIds}{(string vector) Unique database identifiers - what you specified as cdmDatabaseId in \code{PatientLevelPrediction::createDatabaseDetails()} when developing the models} +} +\value{ +Returns a data.frame with the database details +} +\description{ +This function creates a list with the database details and database meta data entries used in the study +} +\details{ +This function is used when inserting database details into the PatientLevelPrediction database results schema +} diff --git a/man/createDatabaseSchemaSettings.Rd b/man/createDatabaseSchemaSettings.Rd new file mode 100644 index 000000000..cd58c7da2 --- /dev/null +++ b/man/createDatabaseSchemaSettings.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/uploadToDatabase.R +\name{createDatabaseSchemaSettings} +\alias{createDatabaseSchemaSettings} +\title{Create the PatientLevelPrediction database result schema settings} +\usage{ +createDatabaseSchemaSettings( + resultSchema = "main", + tablePrefix = "", + targetDialect = "sqlite", + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), + cohortDefinitionSchema = resultSchema, + tablePrefixCohortDefinitionTables = tablePrefix, + databaseDefinitionSchema = resultSchema, + tablePrefixDatabaseDefinitionTables = tablePrefix +) +} +\arguments{ +\item{resultSchema}{(string) The name of the database schema with the result tables.} + +\item{tablePrefix}{(string) A string that appends to the PatientLevelPrediction result tables} + 
+\item{targetDialect}{(string) The database management system being used} + +\item{tempEmulationSchema}{(string) The temp schema used when the database management system is oracle} + +\item{cohortDefinitionSchema}{(string) The name of the database schema with the cohort definition tables (defaults to resultSchema).} + +\item{tablePrefixCohortDefinitionTables}{(string) A string that appends to the cohort definition tables} + +\item{databaseDefinitionSchema}{(string) The name of the database schema with the database definition tables (defaults to resultSchema).} + +\item{tablePrefixDatabaseDefinitionTables}{(string) A string that appends to the database definition tables} +} +\value{ +Returns a list of class 'plpDatabaseResultSchema' with all the database settings +} +\description{ +This function specifies where the results schema is and lets you pick a different schema for the cohorts and databases +} +\details{ +This function can be used to specify the database settings used to upload PatientLevelPrediction results into a database +} diff --git a/man/createDefaultSplitSetting.Rd b/man/createDefaultSplitSetting.Rd index a3b67f2e1..b9020bd45 100644 --- a/man/createDefaultSplitSetting.Rd +++ b/man/createDefaultSplitSetting.Rd @@ -3,7 +3,7 @@ \name{createDefaultSplitSetting} \alias{createDefaultSplitSetting} \title{Create the settings for defining how the plpData are split into test/validation/train sets using -default splitting functions (either random stratified by outcome, time or subject splitting).} +default splitting functions (either random stratified by outcome, time or subject splitting)} \usage{ createDefaultSplitSetting( testFraction = 0.25, @@ -24,9 +24,9 @@ If not set train is equal to 1 - test} \item{nfold}{(numeric) An integer > 1 specifying the number of folds used in cross validation} \item{type}{(character) Choice of: \itemize{ -\item{'stratified'}{ Each data point is randomly assigned into the test or a train fold set but this is done stratified such 
that the outcome rate is consistent in each partition } -\item{'time')}{ Older data are assigned into the training set and newer data are assigned into the test set} -\item{'subject'}{ Data are partitioned by subject, if a subject is in the data more than once, all the data points for the subject are assigned either into the test data or into the train data (not both).} +\item'stratified' Each data point is randomly assigned into the test or a train fold set but this is done stratified such that the outcome rate is consistent in each partition +\item'time' Older data are assigned into the training set and newer data are assigned into the test set +\item'subject' Data are partitioned by subject, if a subject is in the data more than once, all the data points for the subject are assigned either into the test data or into the train data (not both). }} } \value{ @@ -34,7 +34,7 @@ An object of class \code{splitSettings} } \description{ Create the settings for defining how the plpData are split into test/validation/train sets using -default splitting functions (either random stratified by outcome, time or subject splitting). 
+default splitting functions (either random stratified by outcome, time or subject splitting) } \details{ Returns an object of class \code{splitSettings} that specifies the splitting function that will be called and the settings diff --git a/man/createFeatureEngineeringSettings.Rd b/man/createFeatureEngineeringSettings.Rd index c43b4cd2d..0b7a0a8d8 100644 --- a/man/createFeatureEngineeringSettings.Rd +++ b/man/createFeatureEngineeringSettings.Rd @@ -8,7 +8,7 @@ createFeatureEngineeringSettings(type = "none") } \arguments{ \item{type}{(character) Choice of: \itemize{ -\item{'none'}{ No feature engineering - this is the default } +\item'none' No feature engineering - this is the default }} } \value{ diff --git a/man/createLearningCurve.Rd b/man/createLearningCurve.Rd index b8013559f..e6c113a13 100644 --- a/man/createLearningCurve.Rd +++ b/man/createLearningCurve.Rd @@ -15,7 +15,7 @@ createLearningCurve( populationSettings = createStudyPopulationSettings(), splitSettings = createDefaultSplitSetting(), trainFractions = c(0.25, 0.5, 0.75), - trainEvents = c(500, 1000, 1500), + trainEvents = NULL, sampleSettings = createSampleSettings(), featureEngineeringSettings = createFeatureEngineeringSettings(), preprocessSettings = createPreprocessSettings(minFraction = 0.001, normalize = T), @@ -37,17 +37,12 @@ data extracted from the CDM.} \item{modelSettings}{An object of class \code{modelSettings} created using one of the function: \itemize{ -\item{setLassoLogisticRegression()}{ A lasso logistic regression model} -\item{setGradientBoostingMachine()}{ A gradient boosting machine} -\item{setAdaBoost()}{ An ada boost model} -\item{setRandomForest()}{ A random forest model} -\item{setDecisionTree()}{ A decision tree model} -\item{setCovNN())}{ A convolutional neural network model} -\item{setCIReNN()}{ A recurrent neural network model} -\item{setMLP()}{ A neural network model} -\item{setDeepNN()}{ A deep neural network model} -\item{setKNN()}{ A KNN model} - +\item 
\code{setLassoLogisticRegression()} A lasso logistic regression model +\item \code{setGradientBoostingMachine()} A gradient boosting machine +\item \code{setAdaBoost()} An ada boost model +\item \code{setRandomForest()} A random forest model +\item \code{setDecisionTree()} A decision tree model +\item \code{setKNN()} A KNN model }} \item{saveDirectory}{The path to the directory where the results will be saved (if NULL uses working directory)} @@ -70,7 +65,7 @@ Therefore, it is recommended to provide \code{trainEvents} rather than \code{trainFractions}. Note, providing \code{trainEvents} will override your input to \code{trainFractions}. The format should be as follows: \itemize{ - \item{ \code{c(500, 1000, 1500) } - a list of training events} + \item \code{c(500, 1000, 1500) } - a list of training events }} \item{sampleSettings}{An object of type \code{sampleSettings} that specifies any under/over sampling to be done. diff --git a/man/createLogSettings.Rd b/man/createLogSettings.Rd index 0ff82b6f1..83476c541 100644 --- a/man/createLogSettings.Rd +++ b/man/createLogSettings.Rd @@ -9,12 +9,12 @@ createLogSettings(verbosity = "DEBUG", timeStamp = T, logName = "runPlp Log") \arguments{ \item{verbosity}{Sets the level of the verbosity. If the log level is at or higher in priority than the logger threshold, a message will print. 
The levels are: \itemize{ -\item{DEBUG}{Highest verbosity showing all debug statements} -\item{TRACE}{Showing information about start and end of steps} -\item{INFO}{Show informative information (Default)} -\item{WARN}{Show warning messages} -\item{ERROR}{Show error messages} -\item{FATAL}{Be silent except for fatal errors} +\item DEBUG Highest verbosity showing all debug statements +\item TRACE Showing information about start and end of steps +\item INFO Show informative information (Default) +\item WARN Show warning messages +\item ERROR Show error messages +\item FATAL Be silent except for fatal errors }} \item{timeStamp}{If TRUE a timestamp will be added to each logging statement. Automatically switched on for TRACE level.} diff --git a/man/createModelDesign.Rd b/man/createModelDesign.Rd index dbae17f6e..593a0d8c9 100644 --- a/man/createModelDesign.Rd +++ b/man/createModelDesign.Rd @@ -14,6 +14,8 @@ createModelDesign( sampleSettings = NULL, preprocessSettings = NULL, modelSettings = NULL, + splitSettings = createDefaultSplitSetting(type = "stratified", testFraction = 0.25, + trainFraction = 0.75, splitSeed = 123, nfold = 3), runCovariateSummary = T ) } @@ -36,6 +38,8 @@ createModelDesign( \item{modelSettings}{The model settings such as \code{setLassoLogisticRegression()}} +\item{splitSettings}{The train/validation/test splitting used by all analyses created using \code{createDefaultSplitSetting()}} + \item{runCovariateSummary}{Whether to run the covariateSummary} } \value{ diff --git a/man/createPlpResultTables.Rd b/man/createPlpResultTables.Rd index 765b870cb..8bdd6fa6c 100644 --- a/man/createPlpResultTables.Rd +++ b/man/createPlpResultTables.Rd @@ -1,41 +1,39 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/uploadPlpDbResults.R +% Please edit documentation in R/uploadToDatabase.R \name{createPlpResultTables} \alias{createPlpResultTables} \title{Create the results tables to store PatientLevelPrediction models and results into a 
database} \usage{ createPlpResultTables( - conn, - resultSchema, + connectionDetails, targetDialect = "postgresql", - deleteExistingTables = T, + resultSchema, + deleteTables = T, createTables = T, - stringAppendToTables = "", + tablePrefix = "", tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), testFile = NULL ) } \arguments{ -\item{conn}{A connection to a database created by using the -function \code{connect} in the -\code{DatabaseConnector} package.} - -\item{resultSchema}{The name of the database schema that the result tables will be created.} +\item{connectionDetails}{The database connection details} \item{targetDialect}{The database management system being used} -\item{deleteExistingTables}{If true any existing tables matching the PatientLevelPrediction result tables names will be deleted} +\item{resultSchema}{The name of the database schema that the result tables will be created.} + +\item{deleteTables}{If true any existing tables matching the PatientLevelPrediction result tables names will be deleted} \item{createTables}{If true the PatientLevelPrediction result tables will be created} -\item{stringAppendToTables}{A string that appends to the PatientLevelPrediction result tables} +\item{tablePrefix}{A string that appends to the PatientLevelPrediction result tables} \item{tempEmulationSchema}{The temp schema used when the database management system is oracle} \item{testFile}{(used for testing) The location of an sql file with the table creation code} } \value{ -Returns NULL but creates the required tables into the specified database schema. +Returns NULL but creates the required tables into the specified database schema(s). 
} \description{ This function executes a large set of SQL statements to create tables that can store models and results diff --git a/man/createPreprocessSettings.Rd b/man/createPreprocessSettings.Rd index 17ae28208..6ce8ff1f7 100644 --- a/man/createPreprocessSettings.Rd +++ b/man/createPreprocessSettings.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/PreprocessingData.R \name{createPreprocessSettings} \alias{createPreprocessSettings} -\title{Create the settings for preprocessing the trainData using \code{ }.} +\title{Create the settings for preprocessing the trainData.} \usage{ createPreprocessSettings( minFraction = 0.001, @@ -21,7 +21,7 @@ createPreprocessSettings( An object of class \code{preprocessingSettings} } \description{ -Create the settings for preprocessing the trainData using \code{ }. +Create the settings for preprocessing the trainData. } \details{ Returns an object of class \code{preprocessingSettings} that specifies how to preprocess the training data diff --git a/man/createSampleSettings.Rd b/man/createSampleSettings.Rd index 48855ee98..9c8d0b918 100644 --- a/man/createSampleSettings.Rd +++ b/man/createSampleSettings.Rd @@ -13,9 +13,9 @@ createSampleSettings( } \arguments{ \item{type}{(character) Choice of: \itemize{ -\item{'none'}{ No sampling is applied - this is the default } -\item{'underSample')}{Undersample the non-outcome class to make the data more ballanced} -\item{'overSample'}{Oversample the outcome class by adding in each outcome multiple times} +\item 'none' No sampling is applied - this is the default +\item 'underSample' Undersample the non-outcome class to make the data more ballanced +\item 'overSample' Oversample the outcome class by adding in each outcome multiple times }} \item{numberOutcomestoNonOutcomes}{(numeric) An numeric specifying the require number of non-outcomes per outcome} diff --git a/man/createSplineSettings.Rd b/man/createSplineSettings.Rd new file mode 100644 index 000000000..36a9ae09d --- /dev/null +++ 
b/man/createSplineSettings.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FeatureEngineering.R +\name{createSplineSettings} +\alias{createSplineSettings} +\title{Create the settings for adding a spline for continuous variables} +\usage{ +createSplineSettings(continousCovariateId, knots, analysisId = 683) +} +\arguments{ +\item{continousCovariateId}{The covariateId to apply splines to} + +\item{knots}{Either number of knots of vector of split values} + +\item{analysisId}{The analysisId to use for the spline covariates} +} +\value{ +An object of class \code{featureEngineeringSettings} +} +\description{ +Create the settings for adding a spline for continuous variables +} +\details{ +Returns an object of class \code{featureEngineeringSettings} that specifies the sampling function that will be called and the settings +} diff --git a/man/createStratifiedImputationSettings.Rd b/man/createStratifiedImputationSettings.Rd new file mode 100644 index 000000000..8332506dd --- /dev/null +++ b/man/createStratifiedImputationSettings.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FeatureEngineering.R +\name{createStratifiedImputationSettings} +\alias{createStratifiedImputationSettings} +\title{Create the settings for adding a spline for continuous variables} +\usage{ +createStratifiedImputationSettings(covariateId, ageSplits = NULL) +} +\arguments{ +\item{covariateId}{The covariateId that needs imputed values} + +\item{ageSplits}{A vector of age splits in years to create age groups} +} +\value{ +An object of class \code{featureEngineeringSettings} +} +\description{ +Create the settings for adding a spline for continuous variables +} +\details{ +Returns an object of class \code{featureEngineeringSettings} that specifies how to do stratified imputation +} diff --git a/man/createTempModelLoc.Rd b/man/createTempModelLoc.Rd new file mode 100644 index 000000000..916c955ec --- 
/dev/null +++ b/man/createTempModelLoc.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HelperFunctions.R +\name{createTempModelLoc} +\alias{createTempModelLoc} +\title{Create a temporary model location} +\usage{ +createTempModelLoc() +} +\description{ +Create a temporary model location +} diff --git a/man/createValidationDesign.Rd b/man/createValidationDesign.Rd new file mode 100644 index 000000000..f54b6aa78 --- /dev/null +++ b/man/createValidationDesign.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ExternalValidatePlp.R +\name{createValidationDesign} +\alias{createValidationDesign} +\title{createValidationDesign - Define the validation design for external validation} +\usage{ +createValidationDesign( + targetId, + outcomeId, + populationSettings, + restrictPlpDataSettings, + plpModelList, + recalibrate = NULL, + runCovariateSummary = TRUE +) +} +\arguments{ +\item{targetId}{The targetId of the target cohort to validate on} + +\item{outcomeId}{The outcomeId of the outcome cohort to validate on} + +\item{populationSettings}{A list of population restriction settings created by \code{createPopulationSettings}} + +\item{restrictPlpDataSettings}{A list of plpData restriction settings created by \code{createRestrictPlpDataSettings}} + +\item{plpModelList}{A list of plpModels objects created by \code{runPlp} or a path to such objects} + +\item{recalibrate}{A vector of characters specifying the recalibration method to apply,} + +\item{runCovariateSummary}{whether to run the covariate summary for the validation data} +} +\description{ +createValidationDesign - Define the validation design for external validation +} diff --git a/man/diagnoseMultiplePlp.Rd b/man/diagnoseMultiplePlp.Rd new file mode 100644 index 000000000..a8a1c6ff7 --- /dev/null +++ b/man/diagnoseMultiplePlp.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in 
R/DiagnosePlp.R +\name{diagnoseMultiplePlp} +\alias{diagnoseMultiplePlp} +\title{Run a list of predictions diagnoses} +\usage{ +diagnoseMultiplePlp( + databaseDetails = createDatabaseDetails(), + modelDesignList = list(createModelDesign(targetId = 1, outcomeId = 2, modelSettings = + setLassoLogisticRegression()), createModelDesign(targetId = 1, outcomeId = 3, + modelSettings = setLassoLogisticRegression())), + cohortDefinitions = NULL, + logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName = + "diagnosePlp Log"), + saveDirectory = getwd() +) +} +\arguments{ +\item{databaseDetails}{The database settings created using \code{createDatabaseDetails()}} + +\item{modelDesignList}{A list of model designs created using \code{createModelDesign()}} + +\item{cohortDefinitions}{A list of cohort definitions for the target and outcome cohorts} + +\item{logSettings}{The setting specifying the logging for the analyses created using \code{createLogSettings()}} + +\item{saveDirectory}{Name of the folder where all the outputs will written to.} +} +\value{ +A data frame with the following columns: \tabular{ll}{ \verb{analysisId} \tab The unique identifier +for a set of analysis choices.\cr \verb{targetId} \tab The ID of the target cohort populations.\cr +\verb{outcomeId} \tab The ID of the outcomeId.\cr \verb{dataLocation} \tab The location where the plpData was saved + \cr \verb{the settings ids} \tab The ids for all other settings used for model development.\cr } +} +\description{ +Run a list of predictions diagnoses +} +\details{ +This function will run all specified prediction design diagnoses as defined using . 
+} diff --git a/man/diagnosePlp.Rd b/man/diagnosePlp.Rd new file mode 100644 index 000000000..e62e999ea --- /dev/null +++ b/man/diagnosePlp.Rd @@ -0,0 +1,84 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DiagnosePlp.R +\name{diagnosePlp} +\alias{diagnosePlp} +\title{diagnostic - Investigates the prediction problem settings - use before training a model} +\usage{ +diagnosePlp( + plpData = NULL, + outcomeId, + analysisId, + populationSettings, + splitSettings = createDefaultSplitSetting(), + sampleSettings = createSampleSettings(), + saveDirectory = NULL, + featureEngineeringSettings = createFeatureEngineeringSettings(), + modelSettings = setLassoLogisticRegression(), + logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName = + "diagnosePlp Log"), + preprocessSettings = createPreprocessSettings() +) +} +\arguments{ +\item{plpData}{An object of type \code{plpData} - the patient level prediction +data extracted from the CDM. Can also include an initial population as +plpData$popualtion.} + +\item{outcomeId}{(integer) The ID of the outcome.} + +\item{analysisId}{(integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.} + +\item{populationSettings}{An object of type \code{populationSettings} created using \code{createStudyPopulationSettings} that +specifies how the data class labels are defined and addition any exclusions to apply to the +plpData cohort} + +\item{splitSettings}{An object of type \code{splitSettings} that specifies how to split the data into train/validation/test. +The default settings can be created using \code{createDefaultSplitSetting}.} + +\item{sampleSettings}{An object of type \code{sampleSettings} that specifies any under/over sampling to be done. 
+The default is none.} + +\item{saveDirectory}{The path to the directory where the results will be saved (if NULL uses working directory)} + +\item{featureEngineeringSettings}{An object of \code{featureEngineeringSettings} specifying any feature engineering to be learned (using the train data)} + +\item{modelSettings}{An object of class \code{modelSettings} created using one of the function: +\itemize{ +\item setLassoLogisticRegression() A lasso logistic regression model +\item setGradientBoostingMachine() A gradient boosting machine +\item setAdaBoost() An ada boost model +\item setRandomForest() A random forest model +\item setDecisionTree() A decision tree model +\item setKNN() A KNN model + +}} + +\item{logSettings}{An object of \code{logSettings} created using \code{createLogSettings} +specifying how the logging is done} + +\item{preprocessSettings}{An object of \code{preprocessSettings}. This setting specifies the minimum fraction of +target population who must have a covariate for it to be included in the model training +and whether to normalise the covariates before training} +} +\value{ +An object containing the model or location where the model is save, the data selection settings, the preprocessing +and training settings as well as various performance measures obtained by the model. + +\item{distribution}{list for each O of a data.frame containing: i) Time to observation end distribution, ii) Time from observation start distribution, iii) Time to event distribution and iv) Time from last prior event to index distribution (only for patients in T who have O before index) } +\item{incident}{list for each O of incidence of O in T during TAR} +\item{characterization}{list for each O of Characterization of T, TnO, Tn~O} +} +\description{ +This function runs a set of prediction diagnoses to help pick a suitable T, O, TAR and determine +whether the prediction problem is worth executing. 
+} +\details{ +Users can define set of Ts, Os, databases and population settings. A list of data.frames containing details such as +follow-up time distribution, time-to-event information, characteriszation details, time from last prior event, +observation time distribution. +} +\examples{ +\dontrun{ +#******** EXAMPLE 1 ********* +} +} diff --git a/man/diagnostic.Rd b/man/diagnostic.Rd deleted file mode 100644 index c9cd53063..000000000 --- a/man/diagnostic.Rd +++ /dev/null @@ -1,59 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Diagnostics.R -\name{diagnostic} -\alias{diagnostic} -\title{diagnostic - Investigates the prediction problem settings - use before training a model} -\usage{ -diagnostic( - plpData = NULL, - cdmDatabaseName = "none", - cohortName, - outcomeNames, - databaseDetails, - restrictPlpDataSettings, - populationSettings, - outputFolder = NULL, - minCellCount = 5 -) -} -\arguments{ -\item{plpData}{The data object to do the diagnostic on - if NULL you need to specify the connection settings below} - -\item{cdmDatabaseName}{The name of the database being diagnosed} - -\item{cohortName}{Name of the target cohort} - -\item{outcomeNames}{Vector of outcome names} - -\item{databaseDetails}{(only used is plpData is NULL) The database details created using \code{createDatabaseDetails}} - -\item{restrictPlpDataSettings}{(only used is plpData is NULL) The restrictPlpDataSettings created using \code{createRestrictPlpDataSettings}} - -\item{populationSettings}{The population setting details created using \code{createPopulationSettings}} - -\item{outputFolder}{Location to save results for shiny app} - -\item{minCellCount}{The minimum count that will be displayed} -} -\value{ -An object containing the model or location where the model is save, the data selection settings, the preprocessing -and training settings as well as various performance measures obtained by the model. 
- -\item{distribution}{list for each O of a data.frame containing: i) Time to observation end distribution, ii) Time from observation start distribution, iii) Time to event distribution and iv) Time from last prior event to index distribution (only for patients in T who have O before index) } -\item{incident}{list for each O of incidence of O in T during TAR} -\item{characterization}{list for each O of Characterization of T, TnO, Tn~O} -} -\description{ -This function runs a set of prediction diagnoses to help pick a suitable T, O, TAR and determine -whether the prediction problem is worth executing. -} -\details{ -Users can define set of Ts, Os, databases and population settings. A list of data.frames containing details such as -follow-up time distribution, time-to-event information, characteriszation details, time from last prior event, -observation time distribution. -} -\examples{ -\dontrun{ -#******** EXAMPLE 1 ********* -} -} diff --git a/man/externalValidateDbPlp.Rd b/man/externalValidateDbPlp.Rd index 9582ba332..ad3755bb5 100644 --- a/man/externalValidateDbPlp.Rd +++ b/man/externalValidateDbPlp.Rd @@ -22,7 +22,7 @@ externalValidateDbPlp( \item{settings}{A settings object of class \code{validationSettings} created using \code{createValidationSettings}} -\item{logSettings}{An object of \code{logSettings} created using \code{createLogSettings} +\item{logSettings}{An object of \code{logSettings} created using \code{createLogSettings} specifying how the logging is done} \item{outputFolder}{The directory to save the validation results to (subfolders are created per database in validationDatabaseDetails)} @@ -34,6 +34,6 @@ A list containing the performance for each validation_schema This function extracts data using a user specified connection and cdm_schema, applied the model and then calcualtes the performance } \details{ -Users need to input a trained model (the output of runPlp()) and new database connections. 
The function will return a list of length equal to the +Users need to input a trained model (the output of runPlp()) and new database connections. The function will return a list of length equal to the number of cdm_schemas input with the performance on the new data } diff --git a/man/extractDatabaseToCsv.Rd b/man/extractDatabaseToCsv.Rd new file mode 100644 index 000000000..98b4a6f08 --- /dev/null +++ b/man/extractDatabaseToCsv.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SaveLoadPlp.R +\name{extractDatabaseToCsv} +\alias{extractDatabaseToCsv} +\title{Exports all the results from a database into csv files} +\usage{ +extractDatabaseToCsv( + conn = NULL, + connectionDetails, + databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = "main"), + csvFolder, + minCellCount = 5, + sensitiveColumns = getPlpSensitiveColumns(), + fileAppend = NULL +) +} +\arguments{ +\item{conn}{The connection to the database with the results} + +\item{connectionDetails}{The connectionDetails for the result database} + +\item{databaseSchemaSettings}{The result database schema settings} + +\item{csvFolder}{Location to save the csv files} + +\item{minCellCount}{The min value to show in cells that are sensitive (values less than this value will be replaced with -1)} + +\item{sensitiveColumns}{A named list (name of table columns belong to) with a list of columns to apply the minCellCount to.} + +\item{fileAppend}{If set to a string this will be appended to the start of the csv file names} +} +\description{ +Exports all the results from a database into csv files +} +\details{ +Extracts the results from a database into a set of csv files +} diff --git a/man/fitPlp.Rd b/man/fitPlp.Rd index 8ed1a5e98..5f52edb8b 100644 --- a/man/fitPlp.Rd +++ b/man/fitPlp.Rd @@ -4,7 +4,7 @@ \alias{fitPlp} \title{fitPlp} \usage{ -fitPlp(trainData, modelSettings, search = "grid", analysisId) +fitPlp(trainData, modelSettings, search = "grid", 
analysisId, analysisPath) } \arguments{ \item{trainData}{An object of type \code{TrainData} created using \code{splitData} @@ -12,27 +12,27 @@ data extracted from the CDM.} \item{modelSettings}{An object of class \code{modelSettings} created using one of the function: \itemize{ -\item{logisticRegressionModel()}{ A lasso logistic regression model} -\item{GBMclassifier()}{ A gradient boosting machine} -\item{RFclassifier()}{ A random forest model} -\item{GLMclassifier ()}{ A generalised linear model} -\item{KNNclassifier()}{ A KNN model} +\item setLassoLogisticRegression() A lasso logistic regression model +\item setGradientBoostingMachine() A gradient boosting machine +\item setRandomForest() A random forest model +\item setKNN() A KNN model }} \item{search}{The search strategy for the hyper-parameter selection (currently not used)} \item{analysisId}{The id of the analysis} + +\item{analysisPath}{The path of the analysis} } \value{ An object of class \code{plpModel} containing: \item{model}{The trained prediction model} -\item{modelLoc}{The path to where the model is saved (if saved)} -\item{trainAuc}{The AUC obtained on the training set} -\item{trainCalibration}{The calibration obtained on the training set} -\item{modelSettings}{A list specifiying the model, preprocessing, outcomeId and cohortId} -\item{metaData}{The model meta data} -\item{trainingTime}{The time taken to train the classifier} +\item{preprocessing}{The preprocessing required when applying the model} +\item{prediction}{The cohort data.frame with the predicted risk column added} +\item{modelDesign}{A list specifiying the modelDesign settings used to fit the model} +\item{trainDetails}{The model meta data} +\item{covariateImportance}{The covariate importance for the model} } \description{ Train various models using a default parameter gird search or user specified parameters diff --git a/man/getCohortCovariateData.Rd b/man/getCohortCovariateData.Rd index 1014cad53..5c0c515f3 100644 --- 
a/man/getCohortCovariateData.Rd +++ b/man/getCohortCovariateData.Rd @@ -12,8 +12,9 @@ getCohortCovariateData( cohortTable = "#cohort_person", rowIdField = "row_id", aggregated, - cohortId, - covariateSettings + cohortIds, + covariateSettings, + ... ) } \arguments{ @@ -31,9 +32,11 @@ getCohortCovariateData( \item{aggregated}{whether the covariate should be aggregated} -\item{cohortId}{cohort id for the target population cohort} +\item{cohortIds}{cohort id for the target cohort} \item{covariateSettings}{settings for the covariate cohorts and time periods} + +\item{...}{additional arguments from FeatureExtraction} } \value{ The models will now be in the package diff --git a/man/insertCsvToDatabase.Rd b/man/insertCsvToDatabase.Rd new file mode 100644 index 000000000..3b9dd9015 --- /dev/null +++ b/man/insertCsvToDatabase.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ImportFromCsv.R +\name{insertCsvToDatabase} +\alias{insertCsvToDatabase} +\title{Function to insert results into a database from csvs} +\usage{ +insertCsvToDatabase( + csvFolder, + connectionDetails, + databaseSchemaSettings, + modelSaveLocation, + csvTableAppend = "" +) +} +\arguments{ +\item{csvFolder}{The location to the csv folder with the plp results} + +\item{connectionDetails}{A connection details for the plp results database that the csv results will be inserted into} + +\item{databaseSchemaSettings}{A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables to insert the csv results into} + +\item{modelSaveLocation}{The location to save any models from the csv folder - this should be the same location you picked when inserting other models into the database} + +\item{csvTableAppend}{A string that appends the csv file names} +} +\value{ +Returns a data.frame indicating whether the results were imported into the database +} +\description{ +This function converts a folder with csv results into plp 
objects and loads them into a plp result database +} +\details{ +The user needs to have plp csv results in a single folder and an existing plp result database +} diff --git a/man/insertModelDesignInDatabase.Rd b/man/insertModelDesignInDatabase.Rd new file mode 100644 index 000000000..7806d6c95 --- /dev/null +++ b/man/insertModelDesignInDatabase.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/uploadToDatabaseModelDesign.R +\name{insertModelDesignInDatabase} +\alias{insertModelDesignInDatabase} +\title{Insert a model design into a PLP result schema database} +\usage{ +insertModelDesignInDatabase( + object, + conn, + databaseSchemaSettings, + cohortDefinitions +) +} +\arguments{ +\item{object}{An object of class modelDesign, runPlp or externalValidatePlp} + +\item{conn}{A connection to a database created by using the +function \code{connect} in the +\code{DatabaseConnector} package.} + +\item{databaseSchemaSettings}{A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables} + +\item{cohortDefinitions}{A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet()} +} +\value{ +Returns NULL but uploads the model design into the database schema specified in databaseSchemaSettings +} +\description{ +This function inserts a model design and all the settings into the result schema +} +\details{ +This function can be used to upload a model design into a database +} diff --git a/man/insertResultsToSqlite.Rd b/man/insertResultsToSqlite.Rd new file mode 100644 index 000000000..0376c3b9e --- /dev/null +++ b/man/insertResultsToSqlite.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/uploadToDatabase.R +\name{insertResultsToSqlite} +\alias{insertResultsToSqlite} +\title{Create sqlite database with the results} +\usage{ +insertResultsToSqlite( + resultLocation, + cohortDefinitions, + databaseList = NULL, + 
sqliteLocation = file.path(resultLocation, "sqlite") +) +} +\arguments{ +\item{resultLocation}{(string) location of directory where the main package results were saved} + +\item{cohortDefinitions}{A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet()} + +\item{databaseList}{A list created by \code{createDatabaseList} to specify the databases} + +\item{sqliteLocation}{(string) location of directory where the sqlite database will be saved} +} +\value{ +Returns the location of the sqlite database file +} +\description{ +This function create an sqlite database with the PLP result schema and inserts all results +} +\details{ +This function can be used upload PatientLevelPrediction results into an sqlite database +} diff --git a/man/launchDiagnosticsExplorer.Rd b/man/launchDiagnosticsExplorer.Rd deleted file mode 100644 index cd6adf7f7..000000000 --- a/man/launchDiagnosticsExplorer.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ViewShinyPlp.R -\name{launchDiagnosticsExplorer} -\alias{launchDiagnosticsExplorer} -\title{Launch the Diagnostics Explorer Shiny app} -\usage{ -launchDiagnosticsExplorer(dataFolder, launch.browser = FALSE) -} -\arguments{ -\item{dataFolder}{A folder where the exported zip files with the results are stored. -Zip files containing results from multiple databases can be placed in the same -folder.} - -\item{launch.browser}{Should the app be launched in your default browser, or in a Shiny window. 
-Note: copying to clipboard will not work in a Shiny window.} -} -\description{ -Launch the Diagnostics Explorer Shiny app -} -\details{ -Launches a Shiny app that allows the user to explore the diagnostics -} diff --git a/man/listCartesian.Rd b/man/listCartesian.Rd new file mode 100644 index 000000000..4c6ffaba8 --- /dev/null +++ b/man/listCartesian.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SklearnClassifierHelpers.R +\name{listCartesian} +\alias{listCartesian} +\title{Cartesian product} +\usage{ +listCartesian(allList) +} +\arguments{ +\item{allList}{a list of lists} +} +\value{ +A list with all possible combinations from the input list of lists +} +\description{ +Computes the Cartesian product of all the combinations of elements in a list +} diff --git a/man/migrateDataModel.Rd b/man/migrateDataModel.Rd new file mode 100644 index 000000000..597d37637 --- /dev/null +++ b/man/migrateDataModel.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DatabaseMigration.R +\name{migrateDataModel} +\alias{migrateDataModel} +\title{Migrate Data model} +\usage{ +migrateDataModel(connectionDetails, databaseSchema, tablePrefix = "") +} +\arguments{ +\item{connectionDetails}{DatabaseConnector connection details object} + +\item{databaseSchema}{String schema where database schema lives} + +\item{tablePrefix}{(Optional) Use if a table prefix is used before table names (e.g. "cd_")} +} +\description{ +Migrate data from current state to next state + +It is strongly advised that you have a backup of all data (either sqlite files, a backup database (in the case you +are using a postgres backend) or have kept the csv/zip files from your data generation. 
+} diff --git a/man/outcomeSurvivalPlot.Rd b/man/outcomeSurvivalPlot.Rd index 0df4f85d7..39100718d 100644 --- a/man/outcomeSurvivalPlot.Rd +++ b/man/outcomeSurvivalPlot.Rd @@ -7,10 +7,10 @@ outcomeSurvivalPlot( plpData, outcomeId, - populationSettings = createStudyPopulationSettings(binary = T, includeAllOutcomes = - T, firstExposureOnly = FALSE, washoutPeriod = 0, removeSubjectsWithPriorOutcome = - TRUE, priorOutcomeLookback = 99999, requireTimeAtRisk = F, riskWindowStart = 1, - startAnchor = "cohort start", riskWindowEnd = 3650, endAnchor = "cohort start"), + populationSettings = createStudyPopulationSettings(binary = T, includeAllOutcomes = T, + firstExposureOnly = FALSE, washoutPeriod = 0, removeSubjectsWithPriorOutcome = TRUE, + priorOutcomeLookback = 99999, requireTimeAtRisk = F, riskWindowStart = 1, startAnchor + = "cohort start", riskWindowEnd = 3650, endAnchor = "cohort start"), riskTable = T, confInt = T, yLabel = "Fraction of those who are outcome free in target population" diff --git a/man/populatePlpResultTables.Rd b/man/populatePlpResultTables.Rd deleted file mode 100644 index 3519edecd..000000000 --- a/man/populatePlpResultTables.Rd +++ /dev/null @@ -1,97 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/uploadPlpDbResults.R -\name{populatePlpResultTables} -\alias{populatePlpResultTables} -\title{Populate the PatientLevelPrediction results tables} -\usage{ -populatePlpResultTables( - conn, - resultSchema, - stringAppendToTables = "", - targetDialect = "postgresql", - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), - packageName, - studyJsonList, - studyName = "", - studyDescription = "", - researcherName = "", - researcherEmail = "", - researcherOrg = "", - databaseName = NULL, - databaseAcronym = NULL, - databaseVersion = 1, - databaseDescription = NULL, - databaseType = NULL, - valDatabases = list(ccae = list(name = "CCAE", description = "", version = 1, type = - "US Claims")), - resultLocation 
= NULL, - resultPattern = "", - validationLocation = file.path(resultLocation, "Validation"), - addInternalValidation = T, - addExternalValidation = T, - gsubVal = NULL, - removePattern = NULL -) -} -\arguments{ -\item{conn}{A connection to a database created by using the -function \code{connect} in the -\code{DatabaseConnector} package.} - -\item{resultSchema}{(string) The name of the database schema with the result tables.} - -\item{stringAppendToTables}{(string) A string that appends to the PatientLevelPrediction result tables} - -\item{targetDialect}{(string) The database management system being used} - -\item{tempEmulationSchema}{(string) The temp schema used when the database management system is oracle} - -\item{packageName}{(string) The name of the ATLAS R package used to generate the results (this is used to extract cohort jsons)} - -\item{studyJsonList}{(list) A list of lists per cohort with the cohort_name, cohort_id and cohort_json} - -\item{studyName}{(string) A reference study name} - -\item{studyDescription}{(string) A description of the study} - -\item{researcherName}{(string) Name of the researcher who developed the study} - -\item{researcherEmail}{(string) Email of the researcher who developed the study} - -\item{researcherOrg}{(string) Organisation of the researcher who developed the study} - -\item{databaseName}{(string) name of the database used to develop the model/s} - -\item{databaseAcronym}{(string) acronym of the database used to develop the model/s} - -\item{databaseVersion}{(int) Version of the database used to develop the model/s} - -\item{databaseDescription}{(string) Description of the database used to develop the model/s} - -\item{databaseType}{(string) Type of the database used to develop the model/s (e.g., claims)} - -\item{valDatabases}{(list) A named list with details of the external validation databases. 
Needs to contain: name, description, version, type.} - -\item{resultLocation}{(string) location of directory where the main package results were saved} - -\item{resultPattern}{(string) A string to match to select models of interest} - -\item{validationLocation}{(string) location of directory where the validation package results were saved} - -\item{addInternalValidation}{(boolean) Whether the internval validation results should be uploaded} - -\item{addExternalValidation}{(boolean) Whether the externval validation results should be uploaded} - -\item{gsubVal}{(string) Remove patterns from the result name} - -\item{removePattern}{(string) Restrict to result names with this pattern} -} -\value{ -Returns NULL but uploads all the results in resultLocation to the PatientLevelPrediction result tables in resultSchema -} -\description{ -This function formats and uploads results that have been generated via an ATLAS prediction package into a database -} -\details{ -This function can be used upload PatientLevelPrediction results into a database -} diff --git a/man/runMultiplePlp.Rd b/man/runMultiplePlp.Rd index b60663784..22192b5a8 100644 --- a/man/runMultiplePlp.Rd +++ b/man/runMultiplePlp.Rd @@ -10,12 +10,11 @@ runMultiplePlp( setLassoLogisticRegression()), createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())), onlyFetchData = F, - splitSettings = createDefaultSplitSetting(type = "stratified", testFraction = 0.25, - trainFraction = 0.75, splitSeed = 123, nfold = 3), cohortDefinitions = NULL, logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName = "runPlp Log"), - saveDirectory = getwd() + saveDirectory = getwd(), + sqliteLocation = file.path(saveDirectory, "sqlite") ) } \arguments{ @@ -25,18 +24,18 @@ runMultiplePlp( \item{onlyFetchData}{Only fetches and saves the data object to the output folder without running the analysis.} -\item{splitSettings}{The train/validation/test splitting used by all analyses created 
using \code{createDefaultSplitSetting()}} - \item{cohortDefinitions}{A list of cohort definitions for the target and outcome cohorts} -\item{logSettings}{The setting spexcifying the logging for the analyses created using \code{createLogSettings()}} +\item{logSettings}{The setting specifying the logging for the analyses created using \code{createLogSettings()}} \item{saveDirectory}{Name of the folder where all the outputs will written to.} + +\item{sqliteLocation}{(optional) The location of the sqlite database with the results} } \value{ A data frame with the following columns: \tabular{ll}{ \verb{analysisId} \tab The unique identifier -for a set of analysis choices.\cr \verb{cohortId} \tab The ID of the target cohort populations.\cr -\verb{outcomeId} \tab The ID of the outcomeId.\cr \verb{dataLocation} \tab The location where the plpData was saved \cr \verb{evaluationFolder} \tab The name of file containing the evaluation saved as a csv +for a set of analysis choices.\cr \verb{targetId} \tab The ID of the target cohort populations.\cr +\verb{outcomeId} \tab The ID of the outcomeId.\cr \verb{dataLocation} \tab The location where the plpData was saved \cr \verb{the settings ids} \tab The ids for all other settings used for model development.\cr } } \description{ diff --git a/man/runPlp.Rd b/man/runPlp.Rd index 6d3b99db1..ad000f4b7 100644 --- a/man/runPlp.Rd +++ b/man/runPlp.Rd @@ -24,7 +24,8 @@ runPlp( } \arguments{ \item{plpData}{An object of type \code{plpData} - the patient level prediction -data extracted from the CDM.} +data extracted from the CDM. 
Can also include an initial population as +plpData$popualtion.} \item{outcomeId}{(integer) The ID of the outcome.} @@ -50,17 +51,12 @@ and whether to normalise the covariates before training} \item{modelSettings}{An object of class \code{modelSettings} created using one of the function: \itemize{ -\item{setLassoLogisticRegression()}{ A lasso logistic regression model} -\item{setGradientBoostingMachine()}{ A gradient boosting machine} -\item{setAdaBoost()}{ An ada boost model} -\item{setRandomForest()}{ A random forest model} -\item{setDecisionTree()}{ A decision tree model} -\item{setCovNN())}{ A convolutional neural network model} -\item{setCIReNN()}{ A recurrent neural network model} -\item{setMLP()}{ A neural network model} -\item{setDeepNN()}{ A deep neural network model} -\item{setKNN()}{ A KNN model} - +\item setLassoLogisticRegression() A lasso logistic regression model +\item setGradientBoostingMachine() A gradient boosting machine +\item setAdaBoost() An ada boost model +\item setRandomForest() A random forest model +\item setDecisionTree() A decision tree model +\item setKNN() A KNN model }} \item{logSettings}{An object of \code{logSettings} created using \code{createLogSettings} @@ -74,13 +70,12 @@ specifying how the logging is done} An object containing the following: \itemize{ - \item{inputSettings}{A list containing all the settings used to develop the model} - \item{model}{ The developed model of class \code{plpModel}} - \item{executionSummary}{ A list containing the hardward details, R package details and execution time} - \item{performanceEvaluation}{ Various internal performance metrics in sparse format} - \item{prediction}{ The plpData cohort table with the predicted risks added as a column (named value)} - \item{covariateSummary)}{ A characterization of the features for patients with and without the outcome during the time at risk} - \item{analysisRef}{ A list with details about the analysis} + \item model The developed model of class 
\code{plpModel} + \item executionSummary A list containing the hardward details, R package details and execution time + \item performanceEvaluation Various internal performance metrics in sparse format + \item prediction The plpData cohort table with the predicted risks added as a column (named value) + \item covariateSummary A characterization of the features for patients with and without the outcome during the time at risk + \item analysisRef A list with details about the analysis } } \description{ diff --git a/man/savePlpAnalysesJson.Rd b/man/savePlpAnalysesJson.Rd index 1be1cc384..742631e6f 100644 --- a/man/savePlpAnalysesJson.Rd +++ b/man/savePlpAnalysesJson.Rd @@ -8,12 +8,15 @@ savePlpAnalysesJson( modelDesignList = list(createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()), createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())), + cohortDefinitions = NULL, saveDirectory = NULL ) } \arguments{ \item{modelDesignList}{A list of modelDesigns created using \code{createModelDesign()}} +\item{cohortDefinitions}{A list of the cohortDefinitions (generally extracted from ATLAS)} + \item{saveDirectory}{The directory to save the modelDesignList settings} } \description{ diff --git a/man/setAdaBoost.Rd b/man/setAdaBoost.Rd index 0572f7828..971948d00 100644 --- a/man/setAdaBoost.Rd +++ b/man/setAdaBoost.Rd @@ -26,7 +26,7 @@ Create setting for AdaBoost with python DecisionTreeClassifier base estimator } \examples{ \dontrun{ -model.adaBoost <- setAdaBoost(nEstimators = list(10,50,200), learningRate = list(1, 0.5, 0.1), +model.adaBoost <- setAdaBoost(nEstimators = list(10,50,200), learningRate = list(1, 0.5, 0.1), algorithm = list('SAMME.R'), seed = sample(1000000,1) ) } diff --git a/man/setDecisionTree.Rd b/man/setDecisionTree.Rd index 7a32bcf8c..d977d0ee9 100644 --- a/man/setDecisionTree.Rd +++ b/man/setDecisionTree.Rd @@ -11,10 +11,10 @@ setDecisionTree( minSamplesSplit = list(2, 10), 
minSamplesLeaf = list(10, 50), minWeightFractionLeaf = list(0), - maxFeatures = list(100, "auto", NULL), + maxFeatures = list(100, "sqrt", NULL), maxLeafNodes = list(NULL), minImpurityDecrease = list(10^-7), - classWeight = list(NULL, "balanced"), + classWeight = list(NULL), seed = sample(1e+06, 1) ) } @@ -31,7 +31,7 @@ setDecisionTree( \item{minWeightFractionLeaf}{The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sampleWeight is not provided.} -\item{maxFeatures}{(list) The number of features to consider when looking for the best split (int/'auto'/NULL)} +\item{maxFeatures}{(list) The number of features to consider when looking for the best split (int/'sqrt'/NULL)} \item{maxLeafNodes}{(list) Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. (int/NULL)} diff --git a/man/setLassoLogisticRegression.Rd b/man/setLassoLogisticRegression.Rd index f3b1ce51b..533aa53b3 100644 --- a/man/setLassoLogisticRegression.Rd +++ b/man/setLassoLogisticRegression.Rd @@ -14,7 +14,8 @@ setLassoLogisticRegression( upperLimit = 20, lowerLimit = 0.01, tolerance = 2e-06, - maxIterations = 3000 + maxIterations = 3000, + priorCoefs = NULL ) } \arguments{ @@ -37,6 +38,8 @@ setLassoLogisticRegression( \item{tolerance}{Numeric: maximum relative change in convergence criterion from successive iterations to achieve convergence} \item{maxIterations}{Integer: maximum iterations of Cyclops to attempt before returning a failed-to-converge error} + +\item{priorCoefs}{Use coefficients from a previous model as starting points for model fit (transfer learning)} } \description{ Create setting for lasso logistic regression diff --git a/man/setLightGBM.Rd b/man/setLightGBM.Rd new file mode 100644 index 000000000..6380df304 --- /dev/null +++ b/man/setLightGBM.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do 
not edit by hand +% Please edit documentation in R/LightGBM.R +\name{setLightGBM} +\alias{setLightGBM} +\title{Create setting for gradient boosting machine model using lightGBM (https://github.com/microsoft/LightGBM/tree/master/R-package).} +\usage{ +setLightGBM( + nthread = 20, + earlyStopRound = 25, + numIterations = c(100), + numLeaves = c(31), + maxDepth = c(5, 10), + minDataInLeaf = c(20), + learningRate = c(0.05, 0.1, 0.3), + lambdaL1 = c(0), + lambdaL2 = c(0), + scalePosWeight = 1, + isUnbalance = FALSE, + seed = sample(1e+07, 1) +) +} +\arguments{ +\item{nthread}{The number of computer threads to use (how many cores do you have?)} + +\item{earlyStopRound}{If the performance does not increase over earlyStopRound number of trees then training stops (this prevents overfitting)} + +\item{numIterations}{Number of boosting iterations.} + +\item{numLeaves}{This hyperparameter sets the maximum number of leaves. Increasing this parameter can lead to higher model complexity and potential overfitting.} + +\item{maxDepth}{This hyperparameter sets the maximum depth . Increasing this parameter can also lead to higher model complexity and potential overfitting.} + +\item{minDataInLeaf}{This hyperparameter sets the minimum number of data points that must be present in a leaf node. Increasing this parameter can help to reduce overfitting} + +\item{learningRate}{This hyperparameter controls the step size at each iteration of the gradient descent algorithm. 
Lower values can lead to slower convergence but may result in better performance.} + +\item{lambdaL1}{This hyperparameter controls L1 regularization, which can help to reduce overfitting by encouraging sparse models.} + +\item{lambdaL2}{This hyperparameter controls L2 regularization, which can also help to reduce overfitting by discouraging large weights in the model.} + +\item{scalePosWeight}{Controls weight of positive class in loss - useful for imbalanced classes} + +\item{isUnbalance}{This parameter cannot be used at the same time with scalePosWeight, choose only one of them. While enabling this should increase the overall performance metric of your model, it will also result in poor estimates of the individual class probabilities.} + +\item{seed}{An option to add a seed when training the final model} +} +\description{ +Create setting for gradient boosting machine model using lightGBM (https://github.com/microsoft/LightGBM/tree/master/R-package). +} +\examples{ +model.lightgbm <- setLightGBM( + numLeaves = c(20, 31, 50), maxDepth = c(-1, 5, 10), + minDataInLeaf = c(10, 20, 30), learningRate = c(0.05, 0.1, 0.3) +) + +} diff --git a/man/setMLP.Rd b/man/setMLP.Rd index 0a70acd25..d6dea6254 100644 --- a/man/setMLP.Rd +++ b/man/setMLP.Rd @@ -5,7 +5,7 @@ \title{Create setting for neural network model with python} \usage{ setMLP( - hiddenLayerSizes = list(c(100), c(20, 4)), + hiddenLayerSizes = list(c(100), c(20)), activation = list("relu"), solver = list("adam"), alpha = list(0.3, 0.01, 1e-04, 1e-06), @@ -23,7 +23,7 @@ setMLP( validationFraction = list(0.1), beta1 = list(0.9), beta2 = list(0.999), - epsilon = list(1, 0.1, 1e-08), + epsilon = list(1e-08), nIterNoChange = list(10), seed = sample(1e+05, 1) ) @@ -33,10 +33,10 @@ setMLP( \item{activation}{(list) Activation function for the hidden layer. 
\itemize{ - \item{"identity": no-op activation, useful to implement linear bottleneck, returns f(x) = x} - \item{"logistic": the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)).} - \item{"tanh": the hyperbolic tan function, returns f(x) = tanh(x).} - \item{"relu": the rectified linear unit function, returns f(x) = max(0, x)} + \item "identity": no-op activation, useful to implement linear bottleneck, returns f(x) = x + \item "logistic": the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)). + \item "tanh": the hyperbolic tan function, returns f(x) = tanh(x). + \item "relu": the rectified linear unit function, returns f(x) = max(0, x) }} \item{solver}{(list) The solver for weight optimization. (‘lbfgs’, ‘sgd’, ‘adam’)} @@ -45,7 +45,7 @@ setMLP( \item{batchSize}{(list) Size of minibatches for stochastic optimizers. If the solver is ‘lbfgs’, the classifier will not use minibatch. When set to “auto”, batchSize=min(200, n_samples).} -\item{learningRate}{(list) Only used when solver='sgd' Learning rate schedule for weight updates.{‘constant’, ‘invscaling’, ‘adaptive’}, default=’constant’} +\item{learningRate}{(list) Only used when solver='sgd' Learning rate schedule for weight updates. ‘constant’, ‘invscaling’, ‘adaptive’, default=’constant’} \item{learningRateInit}{(list) Only used when solver=’sgd’ or ‘adam’. The initial learning rate used. 
It controls the step-size in updating the weights.} diff --git a/man/setRandomForest.Rd b/man/setRandomForest.Rd index 2fb1b7b37..360b532ae 100644 --- a/man/setRandomForest.Rd +++ b/man/setRandomForest.Rd @@ -11,14 +11,14 @@ setRandomForest( minSamplesSplit = list(2, 5), minSamplesLeaf = list(1, 10), minWeightFractionLeaf = list(0), - mtries = list("auto", "log2"), + mtries = list("sqrt", "log2"), maxLeafNodes = list(NULL), minImpurityDecrease = list(0), bootstrap = list(TRUE), maxSamples = list(NULL, 0.9), oobScore = list(FALSE), nJobs = list(NULL), - classWeight = list("balanced_subsample", NULL), + classWeight = list(NULL), seed = sample(1e+05, 1) ) } @@ -35,14 +35,13 @@ setRandomForest( \item{minWeightFractionLeaf}{(list) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sampleWeight is not provided.} -\item{mtries}{(list) The number of features to consider when looking for the best split: +\item{mtries}{(list) The number of features to consider when looking for the best split: \itemize{ -\item{int}{then consider max_features features at each split.} -\item{float}{then max_features is a fraction and round(max_features * n_features) features are considered at each split} -\item{'auto'}{then max_features=sqrt(n_features)} -\item{'sqrt'}{then max_features=sqrt(n_features) (same as “auto”)} -\item{'log2'}{then max_features=log2(n_features).} -\item{NULL}{then max_features=n_features} +\item int then consider max_features features at each split. +\item float then max_features is a fraction and round(max_features * n_features) features are considered at each split +\item 'sqrt' then max_features=sqrt(n_features) +\item 'log2' then max_features=log2(n_features) +\item NULL then max_features=n_features }} \item{maxLeafNodes}{(list) Grow trees with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. 
If None then unlimited number of leaf nodes.} @@ -57,7 +56,7 @@ setRandomForest( \item{nJobs}{The number of jobs to run in parallel.} -\item{classWeight}{(list) Weights associated with classes. If not given, all classes are supposed to have weight one. {NULL, “balanced”, “balanced_subsample”}} +\item{classWeight}{(list) Weights associated with classes. If not given, all classes are supposed to have weight one. NULL, “balanced”, “balanced_subsample”} \item{seed}{A seed when training the final model} } @@ -66,7 +65,7 @@ Create setting for random forest model with python (very fast) } \examples{ \dontrun{ -model.rf <- setRandomForest(mtries=list('auto',5,20), ntrees=c(10,100), +model.rf <- setRandomForest(mtries=list('auto',5,20), ntrees=c(10,100), maxDepth=c(5,20)) -} +} } diff --git a/man/setSVM.Rd b/man/setSVM.Rd index 4edccc620..2def0720b 100644 --- a/man/setSVM.Rd +++ b/man/setSVM.Rd @@ -12,7 +12,7 @@ setSVM( coef0 = list(0), shrinking = list(TRUE), tol = list(0.001), - classWeight = list("balanced", NULL), + classWeight = list(NULL), cacheSize = 500, seed = sample(1e+05, 1) ) @@ -24,7 +24,7 @@ setSVM( \item{degree}{(list) degree of kernel function is significant only in poly, rbf, sigmoid} -\item{gamma}{(list) kernel coefficient for rbf and poly, by default 1/n_features will be taken. {‘scale’, ‘auto’} or float, default=’scale’} +\item{gamma}{(list) kernel coefficient for rbf and poly, by default 1/n_features will be taken. ‘scale’, ‘auto’ or float, default=’scale’} \item{coef0}{(list) independent term in kernel function. 
It is only significant in poly/sigmoid.} diff --git a/man/sklearnFromJson.Rd b/man/sklearnFromJson.Rd new file mode 100644 index 000000000..402bf1b01 --- /dev/null +++ b/man/sklearnFromJson.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SklearnToJson.R +\name{sklearnFromJson} +\alias{sklearnFromJson} +\title{Loads sklearn python model from json} +\usage{ +sklearnFromJson(path) +} +\arguments{ +\item{path}{path to the model json file} +} +\description{ +Loads sklearn python model from json +} diff --git a/man/sklearnToJson.Rd b/man/sklearnToJson.Rd new file mode 100644 index 000000000..ce5d0b2d0 --- /dev/null +++ b/man/sklearnToJson.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SklearnToJson.R +\name{sklearnToJson} +\alias{sklearnToJson} +\title{Saves sklearn python model object to json in path} +\usage{ +sklearnToJson(model, path) +} +\arguments{ +\item{model}{a fitted sklearn python model object} + +\item{path}{path to the saved model file} +} +\description{ +Saves sklearn python model object to json in path +} diff --git a/man/splitData.Rd b/man/splitData.Rd index 065099117..4c03f61b8 100644 --- a/man/splitData.Rd +++ b/man/splitData.Rd @@ -26,14 +26,14 @@ Split the plpData into test/train sets using a splitting settings of class \code } \details{ Returns a list containing the training data (Train) and optionally the test data (Test). Train is an Andromeda object containing -\itemize{\item{covariates}{ a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the train data } - \item{covariateRef}{ a table with the covariate information} - \item{labels)}{ a table (rowId, outcomeCount, ...) 
for each data point in the train data (outcomeCount is the class label) } - \item{folds}{ a table (rowId, index) specifying which training fold each data point is in.} +\itemize{\item covariates: a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the train data + \item covariateRef: a table with the covariate information + \item labels: a table (rowId, outcomeCount, ...) for each data point in the train data (outcomeCount is the class label) + \item folds: a table (rowId, index) specifying which training fold each data point is in. } Test is an Andromeda object containing -\itemize{\item{covariates}{ a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the test data } - \item{covariateRef}{ a table with the covariate information} - \item{labels)}{ a table (rowId, outcomeCount, ...) for each data point in the test data (outcomeCount is the class label) } +\itemize{\item covariates: a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the test data + \item covariateRef: a table with the covariate information + \item labels: a table (rowId, outcomeCount, ...) 
for each data point in the test data (outcomeCount is the class label) } } diff --git a/man/validateExternal.Rd b/man/validateExternal.Rd new file mode 100644 index 000000000..248dd8459 --- /dev/null +++ b/man/validateExternal.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ExternalValidatePlp.R +\name{validateExternal} +\alias{validateExternal} +\title{externalValidatePlp - Validate model performance on new data} +\usage{ +validateExternal( + validationDesignList, + databaseDetails, + logSettings, + outputFolder +) +} +\arguments{ +\item{validationDesignList}{A list of objects created with \code{createValidationDesign}} + +\item{databaseDetails}{A list of objects of class +\code{databaseDetails} created using \code{createDatabaseDetails}} + +\item{logSettings}{An object of \code{logSettings} created +using \code{createLogSettings}} + +\item{outputFolder}{The directory to save the validation results to +(subfolders are created per database in validationDatabaseDetails)} +} +\description{ +externalValidatePlp - Validate model performance on new data +} diff --git a/man/validateMultiplePlp.Rd b/man/validateMultiplePlp.Rd index ed14fc5c8..2ab5bc903 100644 --- a/man/validateMultiplePlp.Rd +++ b/man/validateMultiplePlp.Rd @@ -9,18 +9,21 @@ validateMultiplePlp( validationDatabaseDetails, validationRestrictPlpDataSettings = createRestrictPlpDataSettings(), recalibrate = NULL, + cohortDefinitions = NULL, saveDirectory = NULL ) } \arguments{ \item{analysesLocation}{The location where the multiple plp analyses are} -\item{validationDatabaseDetails}{The validation database settings created using \code{createDatabaseDetails()}} +\item{validationDatabaseDetails}{A single or list of validation database settings created using \code{createDatabaseDetails()}} \item{validationRestrictPlpDataSettings}{The settings specifying the extra restriction settings when extracting the data created using 
\code{createRestrictPlpDataSettings()}.} \item{recalibrate}{A vector of recalibration methods (currently supports 'RecalibrationintheLarge' and/or 'weakRecalibration')} +\item{cohortDefinitions}{A list of cohortDefinitions} + \item{saveDirectory}{The location to save to validation results} } \description{ diff --git a/man/viewPlp.Rd b/man/viewPlp.Rd index 5872efaa7..38ca332eb 100644 --- a/man/viewPlp.Rd +++ b/man/viewPlp.Rd @@ -4,12 +4,14 @@ \alias{viewPlp} \title{viewPlp - Interactively view the performance and model settings} \usage{ -viewPlp(runPlp, validatePlp = NULL) +viewPlp(runPlp, validatePlp = NULL, diagnosePlp = NULL) } \arguments{ \item{runPlp}{The output of runPlp() (an object of class 'runPlp')} \item{validatePlp}{The output of externalValidatePlp (on object of class 'validatePlp')} + +\item{diagnosePlp}{The output of diagnosePlp()} } \value{ Opens a shiny app for interactively viewing the results diff --git a/tests/testthat/helper-expectations.R b/tests/testthat/helper-expectations.R new file mode 100644 index 000000000..25210fdf6 --- /dev/null +++ b/tests/testthat/helper-expectations.R @@ -0,0 +1,29 @@ +# common tests that can be grouped together, such as testing the output from fitplp + +expect_correct_fitPlp <- function(plpModel, trainData) { + + # predictions are same amount as labels + multiplicativeFactor <- dplyr::n_distinct(plpModel$prediction %>% dplyr::pull(evaluationType)) + expect_equal(NROW(trainData$labels)*multiplicativeFactor, NROW(plpModel$prediction)) + + # predictions are all between 0 and 1 + expect_true(all((plpModel$prediction$value >= 0) & + (plpModel$prediction$value <= 1))) + + # model directory exists + expect_true(dir.exists(plpModel$model)) + + expect_equal(plpModel$modelDesign$outcomeId, outcomeId) + expect_equal(plpModel$modelDesign$targetId, 1) + + # structure of plpModel is correct + expect_equal(names(plpModel), c("model", "preprocessing", "prediction", + "modelDesign", "trainDetails", "covariateImportance")) +} + 
+expect_correct_predictions <- function(predictions, testData) { + + # predictions are all between 0 and 1 + expect_true(all((predictions$value >= 0) & (predictions$value <= 1))) + expect_equal(NROW(testData$labels), NROW(predictions)) +} diff --git a/tests/testthat/helper-functions.R b/tests/testthat/helper-functions.R new file mode 100644 index 000000000..3a44091e8 --- /dev/null +++ b/tests/testthat/helper-functions.R @@ -0,0 +1,34 @@ +# helper functions for tests + +# copies trainData and makes sure andromeda object is copied correctly +copyTrainData <- function(trainData) { + newTrainData <- trainData + + # force andromeda to copy + newTrainData$covariateData <- Andromeda::copyAndromeda(trainData$covariateData) + + class(newTrainData$covariateData) <- class(trainData$covariateData) + return(newTrainData) +} + +# create tiny dataset with subset of covariates based on lasso fit +createTinyPlpData <- function(plpData, plpResult, n= 20) { + + covariates <- plpResult$model$covariateImportance %>% + dplyr::slice_max(order_by = abs(.data$covariateValue), + n = n, with_ties = F) %>% + dplyr::pull(.data$covariateId) + tinyPlpData <- plpData + tinyPlpData$covariateData <- Andromeda::copyAndromeda(plpData$covariateData) + + tinyPlpData$covariateData$covariates <- plpData$covariateData$covariates %>% + dplyr::filter(covariateId %in% covariates) + tinyPlpData$covariateData$covariateRef <- plpData$covariateData$covariateRef %>% + dplyr::filter(covariateId %in% covariates) + + attributes(tinyPlpData$covariateData)$metaData <- attributes(plpData$covariateData)$metaData + class(tinyPlpData$covariateData) <- class(plpData$covariateData) + attributes(tinyPlpData)$metaData <- attributes(plpData)$metaData + class(tinyPlpData) <- class(plpData) + return(tinyPlpData) +} \ No newline at end of file diff --git a/tests/testthat/helper-objects.R b/tests/testthat/helper-objects.R index 51fa98fdf..fccf9bf1d 100644 --- a/tests/testthat/helper-objects.R +++ b/tests/testthat/helper-objects.R 
@@ -1,50 +1,69 @@ -# Download the PostreSQL driver --------------------------- -# If DATABASECONNECTOR_JAR_FOLDER exists, assume driver has been downloaded -jarFolder <- Sys.getenv("DATABASECONNECTOR_JAR_FOLDER", unset = "") -if (jarFolder == "") { - tempJarFolder <- tempfile("jdbcDrivers") - dir.create(tempJarFolder) - Sys.setenv("DATABASECONNECTOR_JAR_FOLDER" = tempJarFolder) - downloadJdbcDrivers("postgresql") - - withr::defer({ - unlink(tempJarFolder, recursive = TRUE, force = TRUE) - Sys.unsetenv("DATABASECONNECTOR_JAR_FOLDER") - }, testthat::teardown_env()) -} - - # this files contains the objects used in the tests: -travis <- T - -saveLoc <- tempfile("saveLoc") -dir.create(saveLoc) - - -if(ifelse(is.null(Sys.info()), T, Sys.info()['sysname'] != 'Windows')){ +if (Sys.getenv('GITHUB_ACTIONS') == 'true') { + # Download the PostreSQL driver --------------------------- + # If DATABASECONNECTOR_JAR_FOLDER exists, assume driver has been downloaded + jarFolder <- Sys.getenv("DATABASECONNECTOR_JAR_FOLDER", unset = "") + if (jarFolder == "") { + tempJarFolder <- tempfile("jdbcDrivers") + dir.create(tempJarFolder) + Sys.setenv("DATABASECONNECTOR_JAR_FOLDER" = tempJarFolder) + DatabaseConnector::downloadJdbcDrivers("postgresql") + + withr::defer({ + unlink(tempJarFolder, recursive = TRUE, force = TRUE) + Sys.unsetenv("DATABASECONNECTOR_JAR_FOLDER") + }, testthat::teardown_env()) + } + # configure and activate python PatientLevelPrediction::configurePython(envname = 'r-reticulate', envtype = "conda") PatientLevelPrediction::setPythonEnvironment(envname = 'r-reticulate', envtype = "conda") # if mac install nomkl -- trying to fix github actions - if(ifelse(is.null(Sys.info()), F, Sys.info()['sysname'] == 'Darwin')){ + if (ifelse(is.null(Sys.info()), F, Sys.info()['sysname'] == 'Darwin')){ reticulate::conda_install(envname = 'r-reticulate', packages = c('nomkl'), forge = TRUE, pip = FALSE, pip_ignore_installed = TRUE, conda = "auto") } } + +saveLoc <- tempfile("saveLoc") 
+dir.create(saveLoc) + + #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ # simulated data Tests #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - -data(plpDataSimulationProfile, envir = environment()) +data("plpDataSimulationProfile") # PLPDATA -sampleSize <- 2500+sample(1000,1) -plpData <- simulatePlpData(plpDataSimulationProfile, n = sampleSize) -#plpData$metaData$cohortId <- plpData$metaData$cohortIds +connectionDetails <- Eunomia::getEunomiaConnectionDetails() +Eunomia::createCohorts(connectionDetails) +outcomeId <- 3 # GIbleed + +databaseDetails <- createDatabaseDetails( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + cdmDatabaseName = "main", + cohortDatabaseSchema = "main", + cohortTable = "cohort", + outcomeDatabaseSchema = "main", + outcomeTable = "cohort", + targetId = 1, + outcomeIds = outcomeId, + cdmVersion = 5) + +covariateSettings <- FeatureExtraction::createCovariateSettings( + useDemographicsAge = TRUE, + useDemographicsGender = TRUE, + useConditionOccurrenceAnyTimePrior = TRUE +) + +plpData <- getPlpData(databaseDetails = databaseDetails, + covariateSettings = covariateSettings, + restrictPlpDataSettings = createRestrictPlpDataSettings()) # POPULATION populationSettings <- createStudyPopulationSettings( @@ -53,7 +72,7 @@ populationSettings <- createStudyPopulationSettings( removeSubjectsWithPriorOutcome = FALSE, priorOutcomeLookback = 99999, requireTimeAtRisk = T, - minTimeAtRisk=10, + minTimeAtRisk = 10, riskWindowStart = 0, startAnchor = 'cohort start', riskWindowEnd = 365, @@ -62,66 +81,111 @@ populationSettings <- createStudyPopulationSettings( # MODEL SETTINGS -lrSet <- setLassoLogisticRegression() +lrSet <- setLassoLogisticRegression(seed = 42) # RUNPLP - LASSO LR plpResult <- runPlp( plpData = plpData, - outcomeId = 2, + outcomeId = outcomeId, analysisId = 'Test', analysisName = 'Testing analysis', populationSettings = populationSettings, - splitSettings = createDefaultSplitSetting(), + splitSettings = 
createDefaultSplitSetting(splitSeed = 12), preprocessSettings = createPreprocessSettings(), modelSettings = lrSet, - logSettings = createLogSettings(verbosity = 'TRACE'), + logSettings = createLogSettings(verbosity = 'ERROR'), executeSettings = createDefaultExecuteSettings(), saveDirectory = saveLoc ) + +# now diagnose +diagnoseResult <- diagnosePlp( + plpData = plpData, + outcomeId = outcomeId, + analysisId = 'Test', + populationSettings = populationSettings, + splitSettings = createDefaultSplitSetting(splitSeed = 12), + saveDirectory = saveLoc, + modelSettings = lrSet, + logSettings = createLogSettings( + verbosity = 'DEBUG', + timeStamp = T, + logName = 'diagnosePlp Log' + ), + preprocessSettings = createPreprocessSettings(), + sampleSettings = NULL, + featureEngineeringSettings = NULL +) + # population <- createStudyPopulation( plpData = plpData, - outcomeId = 2, + outcomeId = outcomeId, populationSettings = populationSettings ) createTrainData <- function(plpData, population){ - trainData <- list() - trainData$covariateData <- Andromeda::copyAndromeda(plpData$covariateData) - attr(trainData$covariateData, "metaData") <- attr(plpData$covariateData, "metaData") - trainData$labels <- population - trainData$folds <- data.frame( - rowId = population$rowId, - index = sample(3, nrow(population), replace = T) - ) - - attr(trainData, "metaData")$outcomeId <- 2 - attr(trainData, "metaData")$cohortId <- 1 - - class(trainData$covariateData) <- 'CovariateData' - + data <- PatientLevelPrediction::splitData(plpData = plpData, + population = population, + splitSettings = PatientLevelPrediction::createDefaultSplitSetting(splitSeed = 12)) + trainData <- data$Train return(trainData) } +trainData <- createTrainData(plpData, population) -sampleSize2 <- 1000+sample(1000,1) -plpData2 <- simulatePlpData(plpDataSimulationProfile, n = sampleSize2) - -population2 <- createStudyPopulation( - plpData = plpData2, - outcomeId = 2, - populationSettings = populationSettings -) +createTestData 
<- function(plpData, population){ + data <- PatientLevelPrediction::splitData(plpData = plpData, + population = population, + splitSettings = PatientLevelPrediction::createDefaultSplitSetting(splitSeed = 12)) + testData <- data$Test + return(testData) +} -sampleSizeBig <- 10000 -plpDataBig <- simulatePlpData(plpDataSimulationProfile, n = sampleSizeBig) +testData <- createTestData(plpData, population) -populationBig <- createStudyPopulation( - plpData = plpDataBig, - outcomeId = 2, - populationSettings = populationSettings -) +# reduced trainData to only use n most important features (as decided by LR) +reduceTrainData <- function(trainData, n=20) { + covariates <- plpResult$model$covariateImportance %>% + dplyr::slice_max(order_by = abs(.data$covariateValue),n = n, with_ties = F) %>% + dplyr::pull(.data$covariateId) + + reducedTrainData <- list(labels = trainData$labels, + folds = trainData$folds, + covariateData = Andromeda::andromeda( + analysisRef = trainData$covariateData$analysisRef + )) + + + reducedTrainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::filter(covariateId %in% covariates) + reducedTrainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% + dplyr::filter(covariateId %in% covariates) + + attributes(reducedTrainData$covariateData)$metaData <- attributes(trainData$covariateData)$metaData + class(reducedTrainData$covariateData) <- class(trainData$covariateData) + attributes(reducedTrainData)$metaData <- attributes(trainData)$metaData + return(reducedTrainData) +} +tinyTrainData <- reduceTrainData(trainData) + +tinyPlpData <- createTinyPlpData(plpData, plpResult) + +nanoData <- createTinyPlpData(plpData, plpResult, n = 2) +tinyResults <- runPlp(plpData = nanoData, + populationSettings = populationSettings, + outcomeId = outcomeId, + analysisId = 'tinyFit', + executeSettings = createExecuteSettings( + runSplitData = T, + runSampleData = F, + runfeatureEngineering = F, + runPreprocessData = T, + 
runModelDevelopment = T, + runCovariateSummary = F + ), + saveDirectory = file.path(saveLoc, 'tinyResults')) diff --git a/tests/testthat/test-KNN.R b/tests/testthat/test-KNN.R new file mode 100644 index 000000000..23cc8486b --- /dev/null +++ b/tests/testthat/test-KNN.R @@ -0,0 +1,30 @@ + + +test_that('KNN fit works', { + skip_on_ci() + modelSettings = setKNN(k = 2) + nanoTrainData <- reduceTrainData(tinyTrainData, n = 2) + subjectToKeep <- nanoTrainData$labels[sample.int(nrow(nanoTrainData$labels), 50),"rowId"] + nanoTrainData$labels <- nanoTrainData$labels[nanoTrainData$labels$rowId %in% subjectToKeep,] + nanoTrainData$folds <- nanoTrainData$folds[nanoTrainData$folds$rowId %in% subjectToKeep,] + nanoTrainData$covariateData$covariates <- nanoTrainData$covariateData$covariates %>% dplyr::filter(.data$rowId %in% subjectToKeep) + plpModel <- fitPlp( + trainData = nanoTrainData, + modelSettings = modelSettings, + analysisId = 'KNN', + analysisPath = tempdir() + ) + + expect_correct_fitPlp(plpModel, nanoTrainData) + +}) + + +test_that("KNN settings", { +skip_on_ci() +model_set <- setKNN(k=5) +testthat::expect_is(model_set, "modelSettings") +testthat::expect_length(model_set,2) +testthat::expect_error(setKNN(k = 0)) +testthat::expect_error(setKNN(indexFolder = 2372)) +}) diff --git a/tests/testthat/test-KNN_updated.R b/tests/testthat/test-KNN_updated.R deleted file mode 100644 index 20bc9bda3..000000000 --- a/tests/testthat/test-KNN_updated.R +++ /dev/null @@ -1,44 +0,0 @@ - -resultNames <- c('executionSummary','model','prediction', 'performanceEvaluation', 'covariateSummary', 'analysisRef') - -plpResultKNN <- runPlp( - plpData = plpData, - outcomeId = 2, - analysisId = 'knnTest', - analysisName = 'Testing knn', - populationSettings = populationSettings, - splitSettings = createDefaultSplitSetting(), - preprocessSettings = createPreprocessSettings(), - modelSettings = setKNN(k=10), - logSettings = createLogSettings(verbosity = 'TRACE'), - executeSettings = 
createDefaultExecuteSettings(), - saveDirectory = file.path(saveLoc, 'knn') - ) - -test_that("covRef is correct size", { - - testthat::expect_true(nrow(as.data.frame(plpData$covariateData$covariateRef)) >= - nrow(plpResultKNN$model$covariateImportance)) - -}) - - -test_that("KNN results have correct structure", { - - - # same output names for LR, KNN and GBM - testthat::expect_equal( - names(plpResultKNN), - resultNames - ) - -}) - -test_that("KNN settings", { - -model_set <- setKNN(k=5) -testthat::expect_is(model_set, "modelSettings") -testthat::expect_length(model_set,2) -testthat::expect_error(setKNN(k = 0)) -testthat::expect_error(setKNN(indexFolder = 2372)) -}) diff --git a/tests/testthat/test-LightGBM.R b/tests/testthat/test-LightGBM.R new file mode 100644 index 000000000..35e742cfb --- /dev/null +++ b/tests/testthat/test-LightGBM.R @@ -0,0 +1,121 @@ +# Copyright 2023 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +library("testthat") + +context("LightGBM") + + +test_that("LightGBM settings work", { + + seed <- sample(10000000,1) + #===================================== + # checking Light GBM + #===================================== + lgbmSet <- setLightGBM( + nthread = 5, + earlyStopRound = 25, + numIterations = 10, + numLeaves = c(31, 20), + maxDepth = 5, + minDataInLeaf = 10, + learningRate = 0.1, + lambdaL1 = 0, + lambdaL2 =0, + scalePosWeight = 1, + isUnbalance = F, + seed = seed + ) + + expect_is(lgbmSet, 'modelSettings') + expect_equal(lgbmSet$fitFunction, 'fitRclassifier') + expect_is(lgbmSet$param, 'list') + + expect_equal(attr(lgbmSet$param, 'settings')$modelType, 'LightGBM') + expect_equal(attr(lgbmSet$param, 'settings')$seed, seed) + expect_equal(attr(lgbmSet$param, 'settings')$modelName, "LightGBM") + + expect_equal(attr(lgbmSet$param, 'settings')$threads, 5) + expect_equal(attr(lgbmSet$param, 'settings')$varImpRFunction, 'varImpLightGBM') + expect_equal(attr(lgbmSet$param, 'settings')$trainRFunction, 'fitLightGBM') + expect_equal(attr(lgbmSet$param, 'settings')$predictRFunction, 'predictLightGBM') + + expect_equal(length(lgbmSet$param),2) + + expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$numIterations)))), 1) + expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$numLeaves)))), 2) + expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$earlyStopRound)))), 1) + expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$maxDepth)))), 1) + expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$minDataInLeaf)))), 1) + expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$learningRate)))), 1) + expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$lambdaL1)))), 1) + expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$lambdaL2)))), 1) + expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$scalePosWeight)))), 1) + 
expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$isUnbalance)))), 1) + +}) + + +test_that("LightGBM settings expected errors", { + #===================================== + # checking Gradient Boosting Machine + #===================================== + + testthat::expect_error(setLightGBM(numIterations = -1)) + testthat::expect_error(setLightGBM(numLeaves = -1)) + testthat::expect_error(setLightGBM(numLeaves = 10000000)) + testthat::expect_error(setLightGBM(learningRate = -2)) + testthat::expect_error(setLightGBM(seed = 'F')) + testthat::expect_error(setLightGBM(lambdaL1 = -1)) + testthat::expect_error(setLightGBM(lambdaL2 = -1)) + testthat::expect_error(setLightGBM(scalePosWeight = -1)) + testthat::expect_error(setLightGBM(isUnbalance = TRUE, scalePosWeight = 0.5)) + +}) + + + + +test_that("LightGBM working checks", { + + modelSettings <- setLightGBM(numIterations = 10, maxDepth = 3, learningRate = 0.1, numLeaves = 31, minDataInLeaf = 10, lambdaL1 = 0, lambdaL2 = 0) + + fitModel <- fitPlp( + trainData = trainData, + modelSettings = modelSettings, + analysisId = 'lgbmTest', + analysisPath = tempdir() + ) + + expect_equal(nrow(fitModel$prediction), nrow(trainData$labels)*2) + expect_equal(length(unique(fitModel$prediction$evaluationType)),2) + + # check prediction between 0 and 1 + expect_gte(min(fitModel$prediction$value), 0) + expect_lte(max(fitModel$prediction$value), 1) + + expect_equal(class(fitModel$model), c("lgb.Booster", "R6")) + + expect_lte(nrow(fitModel$covariateImportance), trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()) + + expect_equal(fitModel$modelDesign$outcomeId, outcomeId) + expect_equal(fitModel$modelDesign$targetId, 1) + # TODO check other model design values? 
+ + # test that at least some features have importances that are not zero + expect_equal(sum(abs(fitModel$covariateImportance$covariateValue))>0, TRUE) + +}) diff --git a/tests/testthat/test-PredictionDistribution_updated.R b/tests/testthat/test-PredictionDistribution.R similarity index 100% rename from tests/testthat/test-PredictionDistribution_updated.R rename to tests/testthat/test-PredictionDistribution.R diff --git a/tests/testthat/test-ThresholdSummary_updated.R b/tests/testthat/test-ThresholdSummary.R similarity index 100% rename from tests/testthat/test-ThresholdSummary_updated.R rename to tests/testthat/test-ThresholdSummary.R diff --git a/tests/testthat/test-UploadPlpDbResults_upadted.R b/tests/testthat/test-UploadPlpDbResults_upadted.R deleted file mode 100644 index f1ed72629..000000000 --- a/tests/testthat/test-UploadPlpDbResults_upadted.R +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright 2021 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -library("testthat") - -context("UploadPlpDbResults") - -cdmDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_CDM_SCHEMA") -ohdsiDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_OHDSI_SCHEMA") -connectionRedshift <- DatabaseConnector::createConnectionDetails( - dbms = "postgresql", - user = Sys.getenv("CDM5_POSTGRESQL_USER"), - password = URLdecode(Sys.getenv("CDM5_POSTGRESQL_PASSWORD")), - server = Sys.getenv("CDM5_POSTGRESQL_SERVER") - ) -conn <- DatabaseConnector::connect(connectionRedshift) - - -randVar <- rawToChar(as.raw(sample(c(65:90,97:122), 5, replace=T))) -appendRandom <- function(x, rand = randVar){ - return(paste(rand, x, sep='')) -} - - -test_that("database creation", { - - createPlpResultTables(conn = conn, - resultSchema = ohdsiDatabaseSchema, - targetDialect = 'postgresql', - deleteExistingTables = T, - createTables = T, - stringAppendToTables = appendRandom('test')) - - tableNames <- DatabaseConnector::getTableNames(connection = conn, databaseSchema = ohdsiDatabaseSchema) - # check the results table is created - testthat::expect_true(paste0(toupper(appendRandom('test')),'_RESULTS') %in% tableNames) - -}) - - -test_that("results uploaded to database", { - - resultsLoc <- file.path(saveLoc,'dbUp') - - savePlpResult(plpResult, file.path(resultsLoc, 'Analysis_1','plpResult')) - # save validation - if(!dir.exists(file.path(resultsLoc,'Validation','test', 'Analysis_1'))){ - dir.create(file.path(resultsLoc,'Validation','test', 'Analysis_1'), recursive = T) - } - plpResult$model$validationDetails <- list( - cohortId = 1, - outcomeId = 3, - populationSettings = plpResult$model$settings$populationSettings, - plpDataSettings = plpResult$model$settings$plpDataSettings - ) - savePlpResult(plpResult, file.path(resultsLoc,'Validation','test', 'Analysis_1', 'validationResult')) - - # add results: - populatePlpResultTables(conn = conn, - resultSchema = ohdsiDatabaseSchema, - stringAppendToTables = appendRandom('test'), - targetDialect = 'postgresql', - studyJsonList = 
list(list(cohort_name = 'blank1', cohort_id = 1, cohort_json = 'bla'), - list(cohort_name = 'blank2', cohort_id = 2, cohort_json = 'bla'), - list(cohort_name = 'blank3', cohort_id = 3, cohort_json = 'bla')), - studyName = 'test', - studyDescription = 'testing', - researcherName = 'jane doe', - researcherEmail = 'none', - researcherOrg = 'none', - databaseName = 'test', - databaseAcronym = 'test', - databaseVersion = 1, - databaseDescription = 'test', - databaseType = 'claims', - valDatabases = list(test = list(name = 'test', - description = 'test', - version = 1, - type = 'claims')), - resultLocation = resultsLoc, - resultPattern = 'Analysis', - validationLocation = file.path(resultsLoc,'Validation'), - addInternalValidation = T, - addExternalValidation = T, - gsubVal = NULL, - removePattern = NULL - ) - - - # check the results table is populated - sql <- 'select count(*) as N from @resultSchema.@appendresults;' - sql <- SqlRender::render(sql, resultSchema = ohdsiDatabaseSchema, append = appendRandom('test_')) - res <- DatabaseConnector::querySql(conn, sql) - testthat::expect_true(res$N[1]>0) - - -}) - - -test_that("database deletion", { - - createPlpResultTables(conn = conn, - resultSchema = ohdsiDatabaseSchema, - targetDialect = 'postgresql', - deleteExistingTables = T, - createTables = F, - stringAppendToTables = appendRandom('test')) - - tableNames <- DatabaseConnector::getTableNames(connection = conn, databaseSchema = ohdsiDatabaseSchema) - # check the results table is then deleted - testthat::expect_false(paste0(toupper(appendRandom('test')),'_RESULTS') %in% tableNames) - - -}) - -# disconnect -DatabaseConnector::disconnect(conn) diff --git a/tests/testthat/test-UploadToDatabase.R b/tests/testthat/test-UploadToDatabase.R new file mode 100644 index 000000000..908ab44d3 --- /dev/null +++ b/tests/testthat/test-UploadToDatabase.R @@ -0,0 +1,458 @@ +# Copyright 2021 Observational Health Data Sciences and Informatics +# +# This file is part of 
PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +library("testthat") + +context("UploadToDatabase") + +# only run this during CI +if (Sys.getenv('CI') == 'true') { +cdmDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_CDM_SCHEMA") +ohdsiDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_OHDSI_SCHEMA") +connectionRedshift <- DatabaseConnector::createConnectionDetails( + dbms = "postgresql", + user = Sys.getenv("CDM5_POSTGRESQL_USER"), + password = URLdecode(Sys.getenv("CDM5_POSTGRESQL_PASSWORD")), + server = Sys.getenv("CDM5_POSTGRESQL_SERVER"), + ) +conn <- DatabaseConnector::connect(connectionRedshift) +targetDialect <- 'postgresql' + +set.seed(NULL) +randVar <- rawToChar(as.raw(sample(c(65:90,97:122), 5, replace=T))) + +appendRandom <- function(x, rand = randVar){ + return(paste("plp", rand, x, sep='')) +} + +} +test_that("test createDatabaseSchemaSettings works", { + skip_if(Sys.getenv('CI') != 'true', 'not run locally') + databaseSchemaSettings <- createDatabaseSchemaSettings( + resultSchema = ohdsiDatabaseSchema, + tablePrefix = '', + targetDialect = targetDialect + ) + + # check inputs as expected + testthat::expect_true(databaseSchemaSettings$resultSchema == ohdsiDatabaseSchema) + testthat::expect_true(databaseSchemaSettings$tablePrefix == '') + testthat::expect_true(databaseSchemaSettings$targetDialect == targetDialect) + testthat::expect_true(databaseSchemaSettings$cohortDefinitionSchema == ohdsiDatabaseSchema) + 
testthat::expect_true(databaseSchemaSettings$databaseDefinitionSchema == ohdsiDatabaseSchema) + testthat::expect_true(databaseSchemaSettings$tablePrefixCohortDefinitionTables == '') + testthat::expect_true(databaseSchemaSettings$tablePrefixDatabaseDefinitionTables == '') + + databaseSchemaSettings <- createDatabaseSchemaSettings( + resultSchema = ohdsiDatabaseSchema, + tablePrefix = '', + targetDialect = targetDialect, + cohortDefinitionSchema = 'test 123', + tablePrefixCohortDefinitionTables = 'a', + databaseDefinitionSchema = 'test234', + tablePrefixDatabaseDefinitionTables = 'b' + ) + + testthat::expect_true(databaseSchemaSettings$cohortDefinitionSchema == 'test 123') + testthat::expect_true(databaseSchemaSettings$databaseDefinitionSchema == 'test234') + testthat::expect_true(databaseSchemaSettings$tablePrefixCohortDefinitionTables == 'A_') + testthat::expect_true(databaseSchemaSettings$tablePrefixDatabaseDefinitionTables == 'B_') + + + testthat::expect_true(class(databaseSchemaSettings) == 'plpDatabaseResultSchema') + +} +) + + +test_that("test createDatabaseDetails works", { + + databaseList <- createDatabaseList( + cdmDatabaseSchemas = paste0('database', 1:5) + ) + + testthat::expect_true(length(databaseList) == length(paste0('database', 1:5))) + testthat::expect_true(class(databaseList) == 'list') + testthat::expect_true(!is.null(databaseList$database1$databaseDetails)) + testthat::expect_true(!is.null(databaseList$database1$databaseMetaData)) + + testthat::expect_equal( + databaseList$database1$databaseDetails$databaseMetaDataId, + databaseList$database1$databaseMetaData$databaseId + ) + +} +) + + +test_that("database creation", { + skip_if(Sys.getenv('CI') != 'true', 'not run locally') + createPlpResultTables( + connectionDetails = connectionRedshift, + resultSchema = ohdsiDatabaseSchema, + targetDialect = targetDialect, + deleteTables = T, + createTables = T, + tablePrefix = appendRandom('test') + ) + + # check the results table is created + 
testthat::expect_true(DatabaseConnector::existsTable( + connection = conn, + databaseSchema = ohdsiDatabaseSchema, + tableName = paste0(appendRandom('test'),'_PERFORMANCES') + )) + +}) + + +test_that("results uploaded to database", { + skip_if(Sys.getenv('CI') != 'true', 'not run locally') + resultsLoc <- file.path(saveLoc,'dbUp') + + plpResult$model$trainDetails$developmentDatabase <- 'test' + savePlpResult(plpResult, file.path(resultsLoc, 'Analysis_1','plpResult')) + # save validation + if(!dir.exists(file.path(resultsLoc,'Validation','test', 'Analysis_1'))){ + dir.create(file.path(resultsLoc,'Validation','test', 'Analysis_1'), recursive = T) + } + plpResult$model$validationDetails <- list( + targetId = 1, + outcomeId = outcomeId, + developmentDatabase = 'test', + validationDatabase = 'test', + populationSettings = plpResult$model$modelDesign$populationSettings, + restrictPlpDataSettings = plpResult$model$modelDesign$restrictPlpDataSettings + ) + savePlpResult(plpResult, file.path(resultsLoc,'Validation','test', 'Analysis_1', 'validationResult')) + + # add results: + addMultipleRunPlpToDatabase( + connectionDetails = connectionRedshift, + databaseSchemaSettings = createDatabaseSchemaSettings( + resultSchema = ohdsiDatabaseSchema, + tablePrefix = appendRandom('test'), + targetDialect = targetDialect + ), + cohortDefinitions = data.frame( + cohortName = c('blank1','blank2','blank3'), + cohortId = c(1,2,3), + json = rep('bla',3) + ), + databaseList = createDatabaseList( + cdmDatabaseSchemas = c('test') + ), + resultLocation = resultsLoc, + modelSaveLocation = file.path(saveLoc,'modelLocation') # new + ) + + # check the results table is populated + sql <- 'select count(*) as N from @resultSchema.@appendperformances;' + sql <- SqlRender::render(sql, resultSchema = ohdsiDatabaseSchema, append = appendRandom('test_')) + res <- DatabaseConnector::querySql(conn, sql) + testthat::expect_true(res$N[1]>0) + + # add test: check model location has result? 
+ +}) + +test_that("database deletion", { + skip_if(Sys.getenv('CI') != 'true', 'not run locally') + createPlpResultTables( + connectionDetails = connectionRedshift, + resultSchema = ohdsiDatabaseSchema, + targetDialect = targetDialect, + deleteTables = T, + createTables = F, + tablePrefix = appendRandom('test') + ) + + # check the results table is then deleted + testthat::expect_false(DatabaseConnector::existsTable( + connection = conn, + databaseSchema = ohdsiDatabaseSchema, + tableName = paste0(appendRandom('test'),'_PERFORMANCES') + )) + +}) + +# disconnect +if (Sys.getenv('CI') == 'true') { + DatabaseConnector::disconnect(conn) +} + +# code to test sqlite creation, result and diagnostic upload all in one +test_that("temporary sqlite with results works", { + + resultsLoc <- file.path(saveLoc,'sqliteTest') + + savePlpResult(plpResult, file.path(resultsLoc, 'Analysis_1','plpResult')) + # save diagnostic + saveRDS(diagnoseResult, file.path(resultsLoc,'Analysis_1','diagnosePlp.rds')) + + sqliteLocation <- insertResultsToSqlite( + resultLocation = resultsLoc, + cohortDefinitions = data.frame( + cohortName = c('blank1','blank2','blank3'), + cohortId = c(1,2,3), + json = rep('bla',3) + ), + databaseList = createDatabaseList( + cdmDatabaseSchemas = c('test') + ), + sqliteLocation = file.path(resultsLoc, 'sqlite') + ) + + # expect the database to exist + testthat::expect_true(file.exists(sqliteLocation)) + + cdmDatabaseSchema <- 'main' + ohdsiDatabaseSchema <- 'main' + connectionDetails <- DatabaseConnector::createConnectionDetails( + dbms = 'sqlite', + server = sqliteLocation + ) + conn <- DatabaseConnector::connect(connectionDetails = connectionDetails) + targetDialect <- 'sqlite' + + # check the results table is populated + sql <- 'select count(*) as N from main.performances;' + res <- DatabaseConnector::querySql(conn, sql) + testthat::expect_true(res$N[1]>0) + + # check the diagnostic table is populated + sql <- 'select count(*) as N from main.diagnostics;' + res <- 
DatabaseConnector::querySql(conn, sql) + testthat::expect_true(res$N[1]>0) + + # disconnect + DatabaseConnector::disconnect(conn) + +}) + +# SQL lite test +test_that("temporary sqlite with results works", { + + externalVal <- plpResult + externalVal$model$model <- 'none' + externalVal$model$trainDetails <- NULL + externalVal$model$validationDetails <- list( + targetId = 1, + outcomeId = 3, + developmentDatabase = 'test', + validationDatabase = 'test', + populationSettings = plpResult$model$modelDesign$populationSettings, + restrictPlpDataSettings = plpResult$model$modelDesign$restrictPlpDataSettings + ) + +sqliteLocation <- insertRunPlpToSqlite( + runPlp = plpResult, + externalValidatePlp = NULL + ) + +# expect the database to exist +testthat::expect_true(file.exists(sqliteLocation)) + +cdmDatabaseSchema <- 'main' +ohdsiDatabaseSchema <- 'main' +connectionDetails <- DatabaseConnector::createConnectionDetails( + dbms = 'sqlite', + server = sqliteLocation +) +conn <- DatabaseConnector::connect(connectionDetails = connectionDetails) +targetDialect <- 'sqlite' + +# check the results table is populated +sql <- 'select count(*) as N from main.performances;' +res <- DatabaseConnector::querySql(conn, sql) +testthat::expect_true(res$N[1]>0) + + +# check export to csv +extractDatabaseToCsv( + connectionDetails = connectionDetails, + databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'), + csvFolder = file.path(saveLoc, 'csvFolder') +) + +testthat::expect_true(dir.exists(file.path(saveLoc, 'csvFolder'))) +testthat::expect_true(length(dir(file.path(saveLoc, 'csvFolder'))) > 0 ) +testthat::expect_true(dir.exists(file.path(saveLoc, 'csvFolder', 'models'))) # new +testthat::expect_true(length(dir(file.path(saveLoc, 'csvFolder', 'models'))) > 0 ) # new +# disconnect +DatabaseConnector::disconnect(conn) + + +}) + +# importFromCsv test here as can use previous csv saving +test_that("import from csv", { + + cohortDef <- extractCohortDefinitionsCSV( + csvFolder 
= file.path(saveLoc, 'csvFolder') + ) + testthat::expect_true(inherits(cohortDef, 'data.frame')) + testthat::expect_true(ncol(cohortDef) == 4) + + databaseList <- extractDatabaseListCSV( + csvFolder = file.path(saveLoc, 'csvFolder') + ) + testthat::expect_true(inherits(databaseList, 'list')) + testthat::expect_true(!is.null(databaseList[[1]]$databaseDetails)) + testthat::expect_true(!is.null(databaseList[[1]]$databaseMetaData)) + + # model designs work + modeldesignsRow <- data.frame( + target_id = 1, outcome_id = 2, population_setting_id = 1, + plp_data_setting_id = 1, model_setting_id = 1, + covariate_setting_id = 1, sample_setting_id = 1, + split_setting_id = 1, feature_engineering_setting_id =1 , + tidy_covariates_setting_id = 1 + ) + res <- getModelDesignSettingTable(modeldesignsRow) + # expect res to be a data.frame, check values? + testthat::expect_true(inherits(res, 'data.frame')) + + modelDesign <- getModelDesignCsv( + modelDesignSettingTable = res, + csvFolder = file.path(saveLoc, 'csvFolder') + ) + testthat::expect_true(inherits(modelDesign, 'modelDesign')) + + # performance works + res <- getPerformanceEvaluationCsv( + performanceId = 1, + csvFolder = file.path(saveLoc, 'csvFolder') + ) + testthat::expect_true(inherits(res, 'list')) + testthat::expect_true( + sum(names(res) %in% + c('evaluationStatistics', 'thresholdSummary', + 'calibrationSummary', 'demographicSummary', + 'predictionDistribution' + ) + ) == 5 + ) + + + # test object extracts + obj <- extractObjectFromCsv( + performanceId = 1, + csvFolder = file.path(saveLoc, 'csvFolder') + ) + testthat::expect_true(inherits(obj, 'externalValidatePlp') | inherits(obj, 'runPlp')) + + # test diagnostic extracted + diag <- extractDiagnosticFromCsv( + diagnosticId = 1, + csvFolder = file.path(saveLoc, 'csvFolder') + ) + testthat::expect_true(inherits(diag, 'diagnosePlp') | is.null(diag)) + + + + # Testing everything together + csvServerLoc <- file.path(tempdir(), 'newCsvDatabase') + 
if(!dir.exists(file.path(tempdir(), 'newCsvDatabase'))){ + dir.create(file.path(tempdir(), 'newCsvDatabase'), recursive = T) + } + newResultConnDetails <- DatabaseConnector::createConnectionDetails( + dbms = 'sqlite', + server = file.path(csvServerLoc,'newCsv.sqlite') + ) + newResultConn <- DatabaseConnector::connect(newResultConnDetails) + csvDatabaseSchemaSettings <- PatientLevelPrediction::createDatabaseSchemaSettings( + resultSchema = 'main', + tablePrefix = '', + targetDialect = 'sqlite', + tempEmulationSchema = NULL + ) + + # create empty tables to insert csv into + PatientLevelPrediction::createPlpResultTables( + connectionDetails = newResultConnDetails, + targetDialect = 'sqlite', + resultSchema = 'main', + createTables = T, + deleteTables = T, + tablePrefix = '', + tempEmulationSchema = NULL + ) + + res <- insertCsvToDatabase( + csvFolder = file.path(saveLoc, 'csvFolder'), + connectionDetails = newResultConnDetails, + databaseSchemaSettings = csvDatabaseSchemaSettings, + modelSaveLocation = file.path(csvServerLoc,'models'), + csvTableAppend = '' + ) + testthat::expect_true(res) + + # check some of the tables + + +}) + + +# new - check null model just reports message +test_that("message if model is null", { + + model2 <- list(noModel = T) + attr(model2, "predictionFunction") <- 'noModel' + attr(model2, "saveType") <- 'RtoJson' + class(model2) <- 'plpModel' + + plpResult2 <- plpResult + plpResult2$model <- model2 + + savePlpResult(plpResult2, file.path(tempdir(), 'null_model', 'Analysis_1', 'plpResult')) + + nullModelServerLoc <- file.path(tempdir(), 'nullModelDatabase') + if(!dir.exists(file.path(tempdir(), 'nullModelDatabase'))){ + dir.create(file.path(tempdir(), 'nullModelDatabase'), recursive = T) + } + nullModelResultConnDetails <- DatabaseConnector::createConnectionDetails( + dbms = 'sqlite', + server = file.path(nullModelServerLoc,'sqlite.sqlite') + ) + nullModelDatabaseSchemaSettings <- createDatabaseSchemaSettings( + resultSchema = 'main', + 
tablePrefix = '', + targetDialect = 'sqlite', + tempEmulationSchema = NULL + ) + + createPlpResultTables( + connectionDetails = nullModelResultConnDetails, + targetDialect = 'sqlite', + resultSchema = 'main', + deleteTables = T, + createTables = T, + tablePrefix = '' + ) + + testthat::expect_message( + addMultipleRunPlpToDatabase( + connectionDetails = nullModelResultConnDetails, + databaseSchemaSettings = nullModelDatabaseSchemaSettings, + resultLocation = file.path(tempdir(), 'null_model'), + modelSaveLocation = file.path(tempdir(), 'null_model', 'models') + ) + ) + +}) + + diff --git a/tests/testthat/test-andromedahelperfunctions_updated.R b/tests/testthat/test-andromedahelperfunctions.R similarity index 90% rename from tests/testthat/test-andromedahelperfunctions_updated.R rename to tests/testthat/test-andromedahelperfunctions.R index c11072f75..d63bb397d 100644 --- a/tests/testthat/test-andromedahelperfunctions_updated.R +++ b/tests/testthat/test-andromedahelperfunctions.R @@ -23,8 +23,8 @@ context("AndromedaHelperFunctions") # batcheRestrict test test_that("batchRestrict", { - metaData <- attr(plpDataBig$covariateData, 'metaData') - covariateData <- PatientLevelPrediction:::batchRestrict(plpDataBig$covariateData, populationBig, sizeN = 10000000) + metaData <- attr(plpData$covariateData, 'metaData') + covariateData <- PatientLevelPrediction:::batchRestrict(plpData$covariateData, population, sizeN = 1000000) testthat::expect_is(covariateData, 'CovariateData') expect_equal(names(metaData), names(attr(covariateData, 'metaData'))) diff --git a/tests/testthat/test-covariateExtras.R b/tests/testthat/test-covariateExtras.R index fa2f960a5..17c9e04ec 100644 --- a/tests/testthat/test-covariateExtras.R +++ b/tests/testthat/test-covariateExtras.R @@ -18,10 +18,6 @@ library("testthat") context("CovariateExtras") -connectionDetails <- Eunomia::getEunomiaConnectionDetails() -Eunomia::createCohorts(connectionDetails) - - test_that("settings creation", { covSet <- 
createCohortCovariateSettings( @@ -113,14 +109,14 @@ covs <- FeatureExtraction::getDbCovariateData( cohortTable = "cohort", cohortDatabaseSchema = "main", cohortTableIsTemp = F, - cohortId = 1, + cohortIds = c(1), rowIdField = 'rowId', covariateSettings = covSet, aggregated = F ) expect_equal(1, covs$covariateRef %>% dplyr::tally() %>% dplyr::pull()) -expect_equal(as.double(covs$covariateRef %>% dplyr::select(.data$covariateId) %>% dplyr::collect()), covSet$covariateId) +expect_equal(as.double(covs$covariateRef %>% dplyr::select("covariateId") %>% dplyr::collect()), covSet$covariateId) expect_true(covs$covariates %>% dplyr::tally() %>% dplyr::pull() > 0) }) diff --git a/tests/testthat/test-cyclopsModels_updated.R b/tests/testthat/test-cyclopsModels.R similarity index 97% rename from tests/testthat/test-cyclopsModels_updated.R rename to tests/testthat/test-cyclopsModels.R index f452f7ce4..0bbf6d779 100644 --- a/tests/testthat/test-cyclopsModels_updated.R +++ b/tests/testthat/test-cyclopsModels.R @@ -239,18 +239,20 @@ testthat::expect_error(setIterativeHardThresholding(seed = 'F')) test_that("test logistic regression runs", { modelSettings <- setLassoLogisticRegression() -trainData <- createTrainData(plpData, population) fitModel <- fitPlp( trainData = trainData, modelSettings = modelSettings, search = "grid", - analysisId = 'lrTest' + analysisId = 'lrTest', + analysisPath = tempdir() ) expect_equal(length(unique(fitModel$prediction$evaluationType)),2) expect_equal(nrow(fitModel$prediction), nrow(trainData$labels)*2) expect_true(length(fitModel$model$coefficients) < trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()+1) + + expect_true(!is.null(fitModel$trainDetails$trainingTime)) expect_equal(fitModel$trainDetails$trainingDate,Sys.Date()) @@ -261,7 +263,7 @@ expect_equal( expect_true('covariateValue' %in% colnames(fitModel$covariateImportance)) -expect_equal(fitModel$trainDetails$outcomeId, attr(trainData, 'metaData')$outcomeId) 
-expect_equal(fitModel$trainDetails$cohortId, attr(trainData, 'metaData')$cohortId) +expect_equal(fitModel$modelDesign$outcomeId, attr(trainData, 'metaData')$outcomeId) +expect_equal(fitModel$modelDesign$targetId, attr(trainData, 'metaData')$targetId) }) diff --git a/tests/testthat/test-dataSplitting_updated.R b/tests/testthat/test-dataSplitting.R similarity index 97% rename from tests/testthat/test-dataSplitting_updated.R rename to tests/testthat/test-dataSplitting.R index 158c6ca25..b8ce628bb 100644 --- a/tests/testthat/test-dataSplitting_updated.R +++ b/tests/testthat/test-dataSplitting.R @@ -22,9 +22,9 @@ context("Data splitting") populationT <- plpData$cohorts populationT$outcomeCount <- sample(c(0,1), nrow(populationT), replace = T) -attr(populationT, "metaData")$outcomeId <- 2 +attr(populationT, "metaData")$outcomeId <- outcomeId attr(populationT, "metaData")$populationSettings <- list(madeup = T) -attr(populationT, "metaData")$plpDataSettings <- list(madeup = T) +attr(populationT, "metaData")$restrictPlpDataSettings <- list(madeup = T) attr(populationT, "metaData")$attrition <- c(1,2,3) # check correct inputs @@ -161,13 +161,13 @@ test_that("Main split function: splitData", { # check attributes for Train expect_equal(attr(splitData$Train, "metaData")$outcomeId, attr(populationT, "metaData")$outcomeId) - expect_equal(attr(splitData$Train, "metaData")$cohortId, plpData$metaData$databaseDetails$cohortId) + expect_equal(attr(splitData$Train, "metaData")$targetId, plpData$metaData$databaseDetails$targetId) expect_equal( attr(splitData$Train, "metaData")$cdmDatabaseSchema, plpData$metaData$databaseDetails$cdmDatabaseSchema ) - expect_is(attr(splitData$Train, "metaData")$plpDataSettings, 'list') + expect_is(attr(splitData$Train, "metaData")$restrictPlpDataSettings, 'list') expect_equal( attr(splitData$Train, "metaData")$covariateSettings, plpData$metaData$covariateSettings diff --git a/tests/testthat/test-demographicSummary_updated.R 
b/tests/testthat/test-demographicSummary.R similarity index 100% rename from tests/testthat/test-demographicSummary_updated.R rename to tests/testthat/test-demographicSummary.R diff --git a/tests/testthat/test-diagnostic.R b/tests/testthat/test-diagnostic.R new file mode 100644 index 000000000..2c261dd16 --- /dev/null +++ b/tests/testthat/test-diagnostic.R @@ -0,0 +1,166 @@ +# Copyright 2021 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +context("Diagnostic") + + +test_that("getMaxEndDaysFromCovariates works", { + + covariateSettings <- FeatureExtraction::createCovariateSettings( + useDemographicsGender = T, + endDays = -1 + ) + testthat::expect_equal(getMaxEndDaysFromCovariates(covariateSettings), -1) + + covariateSettings <- list( + FeatureExtraction::createCovariateSettings( + useDemographicsGender = T, + endDays = -1 + ), + FeatureExtraction::createCovariateSettings( + useDemographicsGender = T, + endDays = 2 + ) + ) + testthat::expect_equal(getMaxEndDaysFromCovariates(covariateSettings), 2) + + covariateSettings <- list( + FeatureExtraction::createCovariateSettings( + useDemographicsGender = T, + endDays = -1, + ), + PatientLevelPrediction::createCohortCovariateSettings( + endDay = 5, + settingId = 1, + cohortName = 'test', + cohortId = 1, + analysisId = 111, + cohortDatabaseSchema = '', cohortTable = '') + ) + testthat::expect_equal(getMaxEndDaysFromCovariates(covariateSettings), 5) + + # if no covariate setting has endDays return 0 + testthat::expect_equal( + getMaxEndDaysFromCovariates(list(empty = list(gfg=2), empty2 = list(ff=1))), + 0 + ) + +}) + +test_that("test diagnosePlp works", { + test <- diagnosePlp( + plpData = tinyPlpData, + outcomeId = outcomeId, + analysisId = 'diagnoseTest', + populationSettings = createStudyPopulationSettings( + riskWindowStart = 1, + startAnchor = 'cohort start', + riskWindowEnd = 365, + endAnchor = 'cohort start' + ), + splitSettings = createDefaultSplitSetting(), + sampleSettings = createSampleSettings(), # default none + saveDirectory = file.path(saveLoc, 'diagnostics'), + featureEngineeringSettings = createFeatureEngineeringSettings(), # default none + modelSettings = setLassoLogisticRegression(), # default to logistic regression + preprocessSettings = createPreprocessSettings() + ) + #check results are a list + testthat::expect_is(test, 'diagnosePlp') + + # check list names + testthat::expect_equal( + sum( names(test) %in% + 
c('summary','participants','predictors', + 'outcomes', 'designs', 'modelDesign', + 'databaseSchema') + ) + , 7) + + # check the results are saved into the databaseName directory + testthat::expect_equal(T, dir.exists(file.path(saveLoc, 'diagnostics'))) + testthat::expect_equal(T, file.exists(file.path(saveLoc, 'diagnostics','diagnoseTest','diagnosePlp.rds'))) + + testthat::expect_is(test$summary, 'data.frame') + testthat::expect_is(test$participants, 'data.frame') + testthat::expect_is(test$predictors, 'data.frame') # rename this outcome survival? + testthat::expect_is(test$outcomes, 'data.frame') + testthat::expect_is(test$databaseSchema, 'character') + + testthat::expect_true(!is.null(test$modelDesign$targetId)) + testthat::expect_true(!is.null(test$modelDesign$outcomeId)) + testthat::expect_true(!is.null(test$modelDesign$restrictPlpDataSettings)) + testthat::expect_true(!is.null(test$modelDesign$covariateSettings)) + testthat::expect_true(!is.null(test$modelDesign$populationSettings)) + + +}) + + +test_that("test diagnoseMultiplePlp works", { + + analysis1 <- createModelDesign( + targetId = 1, + outcomeId = outcomeId, + restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, + washoutPeriod = 0, + sampleSize = 100), + populationSettings = createStudyPopulationSettings(), + covariateSettings = covariateSettings, + featureEngineeringSettings = NULL, + sampleSettings = NULL, + splitSettings = createDefaultSplitSetting(), + preprocessSettings = createPreprocessSettings(), + modelSettings = setLassoLogisticRegression(seed = 12) + ) + + analysis2 <- createModelDesign( + targetId = 1, + outcomeId = outcomeId, + restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, + washoutPeriod = 0, + sampleSize = 100), + populationSettings = createStudyPopulationSettings(washoutPeriod = 400), + covariateSettings = covariateSettings, + featureEngineeringSettings = NULL, + sampleSettings = NULL, + splitSettings = 
createDefaultSplitSetting(), + preprocessSettings = createPreprocessSettings(), + modelSettings = setLassoLogisticRegression(seed = 12) + ) + + diagnoseMultiplePlp( + databaseDetails = databaseDetails, + modelDesignList = list( + analysis1, + analysis2 + ), + cohortDefinitions = data.frame( + cohortId = c(1, outcomeId), + cohortName = c('target', 'outcome') + ), + saveDirectory = file.path(saveLoc, 'diagnosticsMultiple') + ) + + # file.path(saveDirectory,'settings.csv') exits + testthat::expect_true(file.exists(file.path(saveLoc, 'diagnosticsMultiple', 'settings.csv'))) + + # file.path(saveDirectory, settings$analysisId, 'diagnosePlp.rds') exists + testthat::expect_true(length(dir(file.path(saveLoc, 'diagnosticsMultiple'), pattern = 'Analysis_')) == 2) + + testthat::expect_true(file.exists(file.path(saveLoc, 'diagnosticsMultiple', 'Analysis_1', 'diagnosePlp.rds'))) + +}) diff --git a/tests/testthat/test-diagnostic_updated.R b/tests/testthat/test-diagnostic_updated.R deleted file mode 100644 index 3a12a7b7a..000000000 --- a/tests/testthat/test-diagnostic_updated.R +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2021 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -context("Diagnostic") - -test_that("test code works when using plpData", { - test <- diagnostic( - plpData = plpData, - cdmDatabaseName = 'madeup', - cohortName = 'made up target', - outcomeNames = paste0('made up outcome', 1:2), - databaseDetails, - restrictPlpDataSettings, - populationSettings = createStudyPopulationSettings( - riskWindowStart = 1, - startAnchor = 'cohort start', - riskWindowEnd = 365, - endAnchor = 'cohort start' - ), - minCellCount = 5, - outputFolder = file.path(saveLoc, 'diagnostics') - ) - #check results are a list - testthat::expect_equal(class(test), 'list') - - # check list names - testthat::expect_equal(sum(names(test)%in%c('distribution','proportion','characterization')), 3) - - # check the results are saved into the databaseName directory - testthat::expect_equal(T, dir.exists(file.path(saveLoc, 'diagnostics'))) - - #check tar - - testthat::expect_equal(unique(test$proportion$TAR)[1], paste0('cohort start', ' + ', 1, ' days - ', - 'cohort start', ' + ', 365, ' days')) -}) diff --git a/tests/testthat/test-ensemble.R b/tests/testthat/test-ensemble.R deleted file mode 100644 index 5a092f332..000000000 --- a/tests/testthat/test-ensemble.R +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright 2021 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -context("Ensemble") - -if(F){ -ensemble <- runEnsembleModel(population = population, - dataList = list(plpData, plpData), - modelList = list(lrSet, gbmSet), # change to get rid of warning? - testSplit = "subject", - testFraction = 0.2, - stackerUseCV = T, - splitSeed = 1, - nfold = 3, - saveDirectory= saveLoc, - saveEnsemble = F, - savePlpData=F, - savePlpResult=F, - savePlpPlots = F, - saveEvaluation = F, - analysisId = 'ensemble', - verbosity = "INFO", - ensembleStrategy = "stacked") - -test_that("run ensemble model", { - - testthat::expect_s3_class(ensemble, 'ensemblePlp') - }) - -test_that("combine mean ensemble model works", { - comEn <- createEnsemble(runPlpList = list(plpResult,plpDataReal)) - testthat::expect_s3_class(comEn, 'ensemblePlp') -}) - -test_that("combine AUC ensemble model works", { - comEn <- createEnsemble(runPlpList = list(plpResult,plpDataReal), weighted = T) - testthat::expect_s3_class(comEn, 'ensemblePlp') -}) - -test_that("combine manual weights ensemble model works", { - comEn <- createEnsemble(runPlpList = list(plpResult,plpDataReal), weighted = T, weights = runif(2)) - testthat::expect_s3_class(comEn, 'ensemblePlp') -}) - -test_that("combine ensemble model fails when weights too long", { - testthat::expect_error(createEnsemble(runPlpList = list(plpResult,plpDataReal), weighted = T, weights = runif(3))) -}) - - -test_that("apply ensemble model", { - ensemblePerf <- applyEnsembleModel(population = population, - dataList = list(plpData,plpData), - ensembleModel = ensemble, - calculatePerformance = T) - - testthat::expect_equal(class(ensemblePerf), 'list') - testthat::expect_equal(sum(names(ensemblePerf)%in%c('prediction','performanceEvaluation')), 2) - testthat::expect_s3_class(ensemblePerf$prediction, 'data.frame') - testthat::expect_equal(class(ensemblePerf$performanceEvaluation), 'plpEvaluation') - -}) - - -test_that("save/load ensemble model", { - saveEnsemblePlpModel(ensembleModel = ensemble$model, dirPath = file.path(saveLoc, 
'ensembleSave')) - - testthat::expect_equal(dir.exists(file.path(saveLoc,'ensembleSave/level1')), T) - testthat::expect_equal(dir.exists(file.path(saveLoc,'ensembleSave/level2')), T) - - ensembleModelLoad <- loadEnsemblePlpModel(file.path(saveLoc,'ensembleSave')) - testthat::expect_equal(names(ensemble$model), names(ensembleModelLoad)) - -}) - - -test_that("save/load ensemble result", { - saveEnsemblePlpResult(ensembleResult = ensemble, dirPath = file.path(saveLoc,'ensembleResult')) - - testthat::expect_equal(file.exists(file.path(saveLoc,'ensembleResult/performanceEvaluation.rds')), T) - testthat::expect_equal(file.exists(file.path(saveLoc,'ensembleResult/covariateSummary.rds')), T) - - ensembleLoad <- loadEnsemblePlpResult(file.path(saveLoc,'ensembleResult')) - testthat::expect_equal(names(ensemble), names(ensembleLoad)) - -}) - -} diff --git a/tests/testthat/test-evaluation_updated.R b/tests/testthat/test-evaluation.R similarity index 89% rename from tests/testthat/test-evaluation_updated.R rename to tests/testthat/test-evaluation.R index d7effdd51..6669aa488 100644 --- a/tests/testthat/test-evaluation_updated.R +++ b/tests/testthat/test-evaluation.R @@ -38,15 +38,15 @@ test_that("modelBasedConcordance", { }) test_that("evaluatePlp_survival", { - + N <- 100 plpResultSurvivalPred <- data.frame( - rowId = 1:300, - ageYear = sample(100, 300, replace = T), - gender = sample(c('8507','8532'), 300, replace = T), - outcomeCount = c(rep(1,40), rep(0,260)), - value = runif(300), - evaluationType = rep('Train', 300), - survivalTime = sample(2000, 300, replace = T) + rowId = 1:N, + ageYear = sample(100, N, replace = T), + gender = sample(c('8507','8532'), N, replace = T), + outcomeCount = c(rep(1,N*0.1), rep(0,N*0.9)), + value = runif(N, max=0.1), + evaluationType = rep('Train', N), + survivalTime = sample(2000, N, replace = T) ) attr(plpResultSurvivalPred, "metaData")$modelType <- 'survival' attr(plpResultSurvivalPred, 'metaData')$timepoint <- 365 @@ -139,6 +139,18 @@ 
test_that("Calibration metrics", { }) +test_that("E statistics binary", { + prediction <- data.frame( + value = c(seq(.1, .5, length.out = 5), NA, .2), + outcomeCount = c(0, 0, 0, 1, 1, 0, NA) + ) + EStatsBinary <- PatientLevelPrediction:::calculateEStatisticsBinary(prediction) + expect_equal( + EStatsBinary, + c(Eavg = .34, E90 = .56, Emax = .6) + ) +}) + # TODO: test pref scores # test computePreferenceScore(prediction) diff --git a/tests/testthat/test-extractData_updated.R b/tests/testthat/test-extractData.R similarity index 61% rename from tests/testthat/test-extractData_updated.R rename to tests/testthat/test-extractData.R index 13470af44..0e15eb37a 100644 --- a/tests/testthat/test-extractData_updated.R +++ b/tests/testthat/test-extractData.R @@ -17,15 +17,27 @@ context("extractPlp") test_that("summary.plpData", { - attr(plpData$outcomes, "metaData")$outcomeIds <- c(2,3) + attr(plpData$outcomes, "metaData")$outcomeIds <- c(outcomeId) sum <- summary.plpData(plpData) testthat::expect_equal(class(sum),'summary.plpData') }) test_that("getPlpData errors", { - testthat::expect_error(getPlpData(cohortId = NULL)) - testthat::expect_error(getPlpData(cohortId = c(1,2))) - testthat::expect_error(getPlpData(cohortId = 1, outcomeIds = NULL)) + testthat::expect_error( + getPlpData( + databaseDetails = list(targetId = NULL) + ) + ) + testthat::expect_error( + getPlpData( + databaseDetails = list(targetId = c(1,2)) + ) + ) + testthat::expect_error( + getPlpData( + databaseDetails = list(targetId = 1, outcomeIds = NULL) + ) + ) }) @@ -37,3 +49,13 @@ test_that("getCovariateData", { testthat::expect_error(getCovariateData()) }) +test_that("createDatabaseDetails with NULL cdmDatabaseId errors", { + testthat::expect_error(createDatabaseDetails( + connectionDetails = list(), + cdmDatabaseSchema = 'main', + cdmDatabaseId = NULL, + targetId = 1, + outcomeIds = outcomeId + )) +}) + diff --git a/tests/testthat/test-featureEngineering.R b/tests/testthat/test-featureEngineering.R new 
file mode 100644 index 000000000..8ae88f5ce --- /dev/null +++ b/tests/testthat/test-featureEngineering.R @@ -0,0 +1,292 @@ +# Copyright 2021 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +library("testthat") +context("FeatureEngineering") + + +testFEFun <- function(type = 'none'){ + + result <- createFeatureEngineeringSettings(type = type) + + return(result) +} + + +test_that("createFeatureEngineeringSettings correct class", { + + featureEngineeringSettings <- testFEFun() + + expect_is(featureEngineeringSettings, 'featureEngineeringSettings') + + checkFun <- 'sameData' # this is the only option at the moment, edit this when more are added + expect_equal(attr(featureEngineeringSettings, "fun"), checkFun) + +}) + + +testUniFun <- function(k = 100){ + + result <- createUnivariateFeatureSelection(k = k) + + return(result) +} + + + +test_that("createUnivariateFeatureSelection correct class", { + k <- sample(1000,1) + featureEngineeringSettings <- testUniFun(k = k) + + expect_is(featureEngineeringSettings, 'featureEngineeringSettings') + expect_equal(featureEngineeringSettings$k, k) + expect_equal(attr(featureEngineeringSettings, "fun"), 'univariateFeatureSelection') + + expect_error(testUniFun(k = 'ffdff')) + expect_error(testUniFun(k = NULL)) + expect_error(testUniFun(k = -1)) +}) + + +test_that("univariateFeatureSelection", { + + k <- 20+sample(10,1) + 
featureEngineeringSettings <- testUniFun(k = k) + newTrainData <- copyTrainData(trainData) + + trainDataCovariateSize <- newTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() + + reducedTrainData <- univariateFeatureSelection( + trainData = newTrainData, + featureEngineeringSettings = featureEngineeringSettings, + covariateIdsInclude = NULL + ) + + newDataCovariateSize <- reducedTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() + expect_true(newDataCovariateSize <= trainDataCovariateSize) + + # expect k many covariates left + expect_equal(k,reducedTrainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()) + +}) + + +test_that("createRandomForestFeatureSelection correct class", { + ntreesTest <- sample(1000,1) + maxDepthTest <- sample(20,1) + featureEngineeringSettings <- createRandomForestFeatureSelection( + ntrees = ntreesTest, + maxDepth = maxDepthTest + ) + + expect_is(featureEngineeringSettings, 'featureEngineeringSettings') + expect_equal(featureEngineeringSettings$ntrees, ntreesTest) + expect_equal(featureEngineeringSettings$max_depth, maxDepthTest) + expect_equal(attr(featureEngineeringSettings, "fun"), 'randomForestFeatureSelection') + + # error due to params + expect_error( + createRandomForestFeatureSelection( + ntrees = -1, + maxDepth = maxDepthTest + ) + ) + + expect_error( + createRandomForestFeatureSelection( + ntrees = 'dfdfd', + maxDepth = maxDepthTest + ) + ) + + expect_error( + createRandomForestFeatureSelection( + ntrees = 50, + maxDepth = 'maxDepthTest' + ) + ) + + expect_error( + createRandomForestFeatureSelection( + ntrees = 50, + maxDepth = -1 + ) + ) + +}) + + +test_that("randomForestFeatureSelection", { + + ntreesTest <- sample(1000,1) + maxDepthTest <- sample(20,1) + featureEngineeringSettings <- createRandomForestFeatureSelection( + ntrees = ntreesTest, + maxDepth = maxDepthTest + ) + + newTrainData <- copyTrainData(trainData) + trainDataCovariateSize <- 
newTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() + + reducedTrainData <- randomForestFeatureSelection( + trainData = newTrainData, + featureEngineeringSettings = featureEngineeringSettings, + covariateIdsInclude = NULL + ) + + newDataCovariateSize <- reducedTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() + expect_true(newDataCovariateSize < trainDataCovariateSize) + +}) + +test_that("featureSelection is applied on test_data", { + k <- 20 + featureEngineeringSettings <- testUniFun(k = k) + newTrainData <- copyTrainData(trainData) + newTrainData <- univariateFeatureSelection( + trainData = newTrainData, + featureEngineeringSettings = featureEngineeringSettings, + covariateIdsInclude = NULL + ) + + modelSettings <- setLassoLogisticRegression() + + # added try catch due to model sometimes not fitting + plpModel <- tryCatch( + {fitPlp(newTrainData, modelSettings, analysisId='FE')}, + error = function(e){return(NULL)} + ) + + if(!is.null(plpModel)){ # if the model fit then check this + prediction <- predictPlp(plpModel, testData, population) + expect_true(attr(prediction, 'metaData')$featureEngineering) + } +}) + +test_that("createSplineSettings correct class", { + + featureEngineeringSettings <- createSplineSettings( + continousCovariateId = 12, + knots = 4 + ) + + expect_is(featureEngineeringSettings, 'featureEngineeringSettings') + expect_equal(featureEngineeringSettings$knots, 4) + expect_equal(featureEngineeringSettings$continousCovariateId, 12) + expect_equal(attr(featureEngineeringSettings, "fun"), 'splineCovariates') + + expect_error(createSplineSettings(knots = 'ffdff')) + expect_error(createSplineSettings(knots = NULL)) +}) + +test_that("createSplineSettings correct class", { + + knots <- 4 + featureEngineeringSettings <- createSplineSettings( + continousCovariateId = 12101, + knots = knots + ) + + trainData <- simulatePlpData(plpDataSimulationProfile, n = 200) + + N <- 50 + trainData$covariateData$covariates <- 
data.frame( + rowId = sample(trainData$cohorts$rowId, N), + covariateId = rep(12101, N), + covariateValue = sample(10, N, replace = T) + ) + + trainData$covariateData$analysisRef <- data.frame( + analysisId = 101, + analysisName = 'cond', + domainId = 'madeup', + startDay = 0, + endDay = 0, + isBinary = 'N', + missingMeansZero = 'N' + ) + + trainData$covariateData$covariateRef <- data.frame( + covariateId = 12101, + covariateName = 'test', + analysisId = 101, + conceptId = 1 + ) + +newData <- splineCovariates( + trainData = trainData, + featureEngineeringSettings = featureEngineeringSettings +) + +testthat::expect_true(1 < nrow(as.data.frame(newData$covariateData$analysisRef))) +testthat::expect_true((knots+1) == nrow(as.data.frame(newData$covariateData$covariateRef))) +testthat::expect_true((knots+1) == length(table(as.data.frame(newData$covariateData$covariates)$covariateId))) + +}) + + +test_that("createStratifiedImputationSettings correct class", { + + featureEngineeringSettings <- createStratifiedImputationSettings( + covariateId = 12101, + ageSplits = c(20,50,70) + ) + + trainData <- simulatePlpData(plpDataSimulationProfile, n = 200) + + N <- 50 + trainData$covariateData$covariates <- data.frame( + rowId = sample(trainData$cohorts$rowId, N), + covariateId = rep(12101, N), + covariateValue = sample(10, N, replace = T) + ) + + trainData$covariateData$analysisRef <- data.frame( + analysisId = 101, + analysisName = 'cond', + domainId = 'madeup', + startDay = 0, + endDay = 0, + isBinary = 'N', + missingMeansZero = 'N' + ) + + trainData$covariateData$covariateRef <- data.frame( + covariateId = 12101, + covariateName = 'test', + analysisId = 101, + conceptId = 1 + ) + + stratifiedMeans <- calculateStratifiedMeans( + trainData = trainData, + featureEngineeringSettings = featureEngineeringSettings + ) + + testthat::expect_true(nrow(stratifiedMeans) == 8) + +imputedData <- imputeMissingMeans( + trainData = trainData, + covariateId = 12101, + ageSplits = c(20,50,70), + 
stratifiedMeans = stratifiedMeans +) + +testthat::expect_true( + nrow(as.data.frame(imputedData$covariateData$covariates)) == 200 +) + +}) \ No newline at end of file diff --git a/tests/testthat/test-featureEngineering_updated.R b/tests/testthat/test-featureEngineering_updated.R deleted file mode 100644 index 8d05d1650..000000000 --- a/tests/testthat/test-featureEngineering_updated.R +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright 2021 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -library("testthat") -context("FeatureEngineering") - - -trainData <- createTrainData(plpData, population) - -testFEFun <- function(type = 'none'){ - - result <- createFeatureEngineeringSettings(type = type) - - return(result) -} - - -test_that("createFeatureEngineeringSettings correct class", { - - featureEngineeringSettings <- testFEFun() - - expect_is(featureEngineeringSettings, 'featureEngineeringSettings') - - checkFun <- 'sameData' # this is the only option at the moment, edit this when more are added - expect_equal(attr(featureEngineeringSettings, "fun"), checkFun) - -}) - - -testUniFun <- function(k = 100){ - - result <- createUnivariateFeatureSelection(k = k) - - return(result) -} - - - -test_that("createUnivariateFeatureSelection correct class", { - k <- sample(1000,1) - featureEngineeringSettings <- testUniFun(k = k) - - expect_is(featureEngineeringSettings, 'featureEngineeringSettings') - expect_equal(featureEngineeringSettings$k, k) - expect_equal(attr(featureEngineeringSettings, "fun"), 'univariateFeatureSelection') - - expect_error(testUniFun(k = 'ffdff')) - expect_error(testUniFun(k = NULL)) - expect_error(testUniFun(k = -1)) -}) - - -test_that("univariateFeatureSelection", { - - k <- 20+sample(100,1) - featureEngineeringSettings <- testUniFun(k = k) - - trainDataCovariateSize <- trainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() - - reducedTrainData <- univariateFeatureSelection( - trainData = trainData, - featureEngineeringSettings = featureEngineeringSettings, - covariateIdsInclude = NULL - ) - - newDataCovariateSize <- reducedTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() - expect_true(newDataCovariateSize <= trainDataCovariateSize) - - # expect k many covariates left - REMOVED AS TIES MAKES THIS FAIL OCCASIONALLY - ##expect_true(abs(k - reducedTrainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()) <= 5) - -}) - -# refresh the training data -trainData <- createTrainData(plpData, 
population) - -test_that("createRandomForestFeatureSelection correct class", { - ntreesTest <- sample(1000,1) - maxDepthTest <- sample(20,1) - featureEngineeringSettings <- createRandomForestFeatureSelection( - ntrees = ntreesTest, - maxDepth = maxDepthTest - ) - - expect_is(featureEngineeringSettings, 'featureEngineeringSettings') - expect_equal(featureEngineeringSettings$ntrees, ntreesTest) - expect_equal(featureEngineeringSettings$max_depth, maxDepthTest) - expect_equal(attr(featureEngineeringSettings, "fun"), 'randomForestFeatureSelection') - - # error due to params - expect_error( - createRandomForestFeatureSelection( - ntrees = -1, - maxDepth = maxDepthTest - ) - ) - - expect_error( - createRandomForestFeatureSelection( - ntrees = 'dfdfd', - maxDepth = maxDepthTest - ) - ) - - expect_error( - createRandomForestFeatureSelection( - ntrees = 50, - maxDepth = 'maxDepthTest' - ) - ) - - expect_error( - createRandomForestFeatureSelection( - ntrees = 50, - maxDepth = -1 - ) - ) - -}) - - -test_that("randomForestFeatureSelection", { - - ntreesTest <- sample(1000,1) - maxDepthTest <- sample(20,1) - featureEngineeringSettings <- createRandomForestFeatureSelection( - ntrees = ntreesTest, - maxDepth = maxDepthTest - ) - - trainDataCovariateSize <- trainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() - - reducedTrainData <- randomForestFeatureSelection( - trainData = trainData, - featureEngineeringSettings = featureEngineeringSettings, - covariateIdsInclude = NULL - ) - - newDataCovariateSize <- reducedTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() - expect_true(newDataCovariateSize < trainDataCovariateSize) - -}) diff --git a/tests/testthat/test-featureImportance.R b/tests/testthat/test-featureImportance.R index 6c964e2c3..d0140adda 100644 --- a/tests/testthat/test-featureImportance.R +++ b/tests/testthat/test-featureImportance.R @@ -21,12 +21,38 @@ context("FeatureImportance") test_that("pfi feature importance returns 
data.frame", { - pfiTest <- pfi(plpResult, population, plpData, repeats = 1, - covariates = NULL, cores = NULL, log = NULL, - logthreshold = "INFO") + # limit to a sample of 2 covariates for faster test + covariates <- plpResult$model$covariateImportance %>% + dplyr::filter("covariateValue" != 0) %>% + dplyr::select("covariateId") %>% + dplyr::arrange(desc("covariateValue")) %>% + dplyr::pull() - testthat::expect_equal(class(pfiTest), 'data.frame') - testthat::expect_equal(sum(names(pfiTest)%in%c("covariateId", "pfi")), 2) + # if the model had non-zero covariates + if(length(covariates) > 0){ + if (length(covariates) > 2) { + covariates <- covariates[1:2] + } + pfiTest <- pfi(plpResult, population, plpData, repeats = 1, + covariates = covariates, cores = 1, log = NULL, + logthreshold = "INFO") + + expect_equal(class(pfiTest), 'data.frame') + expect_equal(sum(names(pfiTest) %in% c("covariateId", "pfi")), 2) + expect_true(all(!is.nan(pfiTest$pfi))) + + } }) +test_that('pfi feature importance works with logger or without covariates', { + + pfiTest <- pfi(tinyResults, population, nanoData, cores = 1, + covariates = NULL, log = file.path(tempdir(), 'pfiLog')) + + expect_equal(class(pfiTest), 'data.frame') + expect_equal(sum(names(pfiTest) %in% c("covariateId", "pfi")), 2) + expect_true(all(!is.nan(pfiTest$pfi))) + +}) + diff --git a/tests/testthat/test-fitting_updated.R b/tests/testthat/test-fitting.R similarity index 78% rename from tests/testthat/test-fitting_updated.R rename to tests/testthat/test-fitting.R index 8f1fb5639..ad22e92bf 100644 --- a/tests/testthat/test-fitting_updated.R +++ b/tests/testthat/test-fitting.R @@ -18,7 +18,6 @@ library("testthat") context("Fitting") -trainData <- createTrainData(plpData, population) modelSettings <- setLassoLogisticRegression() test_that("fitPlp", { @@ -27,7 +26,8 @@ plpModel <- fitPlp( trainData = trainData, modelSettings = modelSettings, search = "grid", - analysisId = 'fitting' + analysisId = 'fitting', + analysisPath = 
tempdir() ) expect_is(plpModel, 'plpModel') @@ -39,7 +39,8 @@ test_that("fitPlp input errors", { expect_error( fitPlp( trainData = trainData, - modelSettings = modelSettings + modelSettings = modelSettings, + analysisPath = tempDir() ) ) @@ -47,7 +48,8 @@ test_that("fitPlp input errors", { fitPlp( trainData = list(covariateData = NULL), modelSettings = modelSettings, - analysisId = 'fitting' + analysisId = 'fitting', + analysisPath = tempDir() ) ) @@ -55,6 +57,15 @@ test_that("fitPlp input errors", { fitPlp( trainData = trainData, modelSettings = NULL, + analysisId = 'fitting', + analysisPath = tempDir() + ) + ) + + expect_error( + fitPlp( + trainData = trainData, + modelSettings = modelSettings, analysisId = 'fitting' ) ) diff --git a/tests/testthat/test-formatting_updated.R b/tests/testthat/test-formatting.R similarity index 93% rename from tests/testthat/test-formatting_updated.R rename to tests/testthat/test-formatting.R index 62c843b95..0e3ac0121 100644 --- a/tests/testthat/test-formatting_updated.R +++ b/tests/testthat/test-formatting.R @@ -58,7 +58,7 @@ test_that("MapIds with no cohort", { # some covariates not in data 5,6,7 so should be removed from covRef expect_equal(mappings$covariateRef %>% dplyr::tally() %>% dplyr::pull(), 7) - correctCov <- mappings$covariateRef %>% dplyr::select(.data$covariateId) %>% dplyr::pull() %in% c(123,2002,10,3,4,9,8) + correctCov <- mappings$covariateRef %>% dplyr::select("covariateId") %>% dplyr::pull() %in% c(123,2002,10,3,4,9,8) expect_equal(sum(correctCov), length(correctCov)) }) @@ -92,7 +92,7 @@ test_that("MapIds with a cohort", { # some covariates not in data 5,6,7 so should be removed from covRef expect_equal(mappings$covariateRef %>% dplyr::tally() %>% dplyr::pull(), 3) - correctCov <- mappings$covariateRef %>% dplyr::select(.data$covariateId) %>% dplyr::pull() %in% c(123,9,8) + correctCov <- mappings$covariateRef %>% dplyr::select("covariateId") %>% dplyr::pull() %in% c(123,9,8) expect_equal(sum(correctCov), 
length(correctCov)) }) @@ -102,7 +102,7 @@ test_that("toSparseM", { cohorts <- data.frame(rowId=1:6, subjectId=1:6, - cohortId=rep(1,6), + targetId=rep(1,6), cohortStartDate= rep('2007-12-28 00:00:00.0',6), daysFromObsStart= c(500,50,500,500,500,500), daysToCohortEnd= rep(200,6), @@ -115,7 +115,7 @@ test_that("toSparseM", { outcomes=2)) outcomes <- data.frame(rowId=c(1,2), - outcomeId=rep(2,2), + outcomeId=rep(outcomeId,2), daysToEvent=c(150,40)) FplpData <- list(cohorts=cohorts, @@ -157,7 +157,7 @@ test_that("toSparseM", { compTest <- as.matrix(test$dataMatrix) testthat::expect_equal(test$labels %>% dplyr::tally() %>% dplyr::pull(), length(population$rowId)) testthat::expect_equal(nrow(compTest), length(population$rowId)) - testthat::expect_true(ncol(compTest) <= nrow(plpData$covariateData$covariateRef)) + testthat::expect_true(ncol(compTest) <= plpData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()) testthat::expect_equal(ncol(compTest), test$covariateRef %>% dplyr::tally() %>% dplyr::pull()) testthat::expect_equal(ncol(compTest), test$covariateMap %>% dplyr::tally() %>% dplyr::pull()) @@ -166,7 +166,7 @@ test_that("toSparseM", { test_that("checkRam", { - ramCheck <- checkRam(createCovariateData(), 0.8) + ramCheck <- checkRam(createCovariateData()) expect_true(ramCheck) }) diff --git a/tests/testthat/test-getCalibration_updated.R b/tests/testthat/test-getCalibration.R similarity index 97% rename from tests/testthat/test-getCalibration_updated.R rename to tests/testthat/test-getCalibration.R index 831e2ebb9..522fb6b98 100644 --- a/tests/testthat/test-getCalibration_updated.R +++ b/tests/testthat/test-getCalibration.R @@ -37,7 +37,7 @@ test_that("getCalibration binary", { expect_true('evaluation' %in% colnames(calib)) - calibBinary <-getCalibrationSummary_binary( + calibBinary <- getCalibrationSummary_binary( prediction = Eprediction, evalColumn = 'evaluation', numberOfStrata = 100, diff --git a/tests/testthat/test-helperfunctions_updated.R 
b/tests/testthat/test-helperfunctions.R similarity index 88% rename from tests/testthat/test-helperfunctions_updated.R rename to tests/testthat/test-helperfunctions.R index 413b82a8d..526d6466b 100644 --- a/tests/testthat/test-helperfunctions_updated.R +++ b/tests/testthat/test-helperfunctions.R @@ -46,5 +46,12 @@ test_that("setPythonEnvironment", { testthat::expect_equal(class(setPythonEnvironment(envname='madeup34343', envtype = 'conda')), "character") }) +test_that("Borrowed cut2", { + x <- c(1, rep(2, 2), rep(4, 4), rep(5, 5), rep(6, 6)) + groups <- PatientLevelPrediction:::cut2(x, g = 3) + expect_true( + all(levels(groups) == c("[1,5)", "5", "6")) + ) +}) # getOs test? diff --git a/tests/testthat/test-learningCurves_updated.R b/tests/testthat/test-learningCurves.R similarity index 64% rename from tests/testthat/test-learningCurves_updated.R rename to tests/testthat/test-learningCurves.R index e8b553274..15da60bcf 100644 --- a/tests/testthat/test-learningCurves_updated.R +++ b/tests/testthat/test-learningCurves.R @@ -19,11 +19,11 @@ context("LearningCurves") # learningCurve learningCurve <- PatientLevelPrediction::createLearningCurve( plpData = plpData, - outcomeId = 2, parallel = T, cores = 3, + outcomeId = outcomeId, parallel = F, cores = -1, modelSettings = setLassoLogisticRegression(), saveDirectory = file.path(saveLoc, 'lcc'), - splitSettings = createDefaultSplitSetting(testFraction = 0.2), - trainFractions = c(0.6,0.7,0.8), + splitSettings = createDefaultSplitSetting(testFraction = 0.2, nfold=2), + trainFractions = c(0.6,0.7), trainEvents = NULL, preprocessSettings = createPreprocessSettings( minFraction = 0.001, @@ -42,7 +42,7 @@ test_that("learningCurve output correct", { "Train_populationSize", "Train_outcomeCount") ),5) - testthat::expect_equal(learningCurve$trainFraction, c(0.6,0.7,0.8)*100) + testthat::expect_equal(learningCurve$trainFraction, c(0.6,0.7)*100) }) @@ -64,4 +64,28 @@ test_that("plotLearningCurve", { }) +test_that("getTrainFractions 
works", { + + learningCurve <- PatientLevelPrediction::createLearningCurve( + plpData = tinyPlpData, + outcomeId = outcomeId, parallel = F, cores = -1, + modelSettings = setLassoLogisticRegression(seed = 42), + saveDirectory = file.path(saveLoc, 'lcc'), + splitSettings = createDefaultSplitSetting(testFraction = 0.33, nfold = 2, + splitSeed = 42), + trainEvents = c(150,200), + preprocessSettings = createPreprocessSettings( + minFraction = 0.001, + normalize = T + ) + ) + testthat::expect_true(is.data.frame(learningCurve)) + testthat::expect_equal(sum(colnames(learningCurve) %in% c( + "trainFraction", + "Train_AUROC", + "nPredictors", + "Train_populationSize", + "Train_outcomeCount") ),5) + +}) diff --git a/tests/testthat/test-multiplePlp_updated.R b/tests/testthat/test-multiplePlp.R similarity index 59% rename from tests/testthat/test-multiplePlp_updated.R rename to tests/testthat/test-multiplePlp.R index 09aa74adc..14cf651dc 100644 --- a/tests/testthat/test-multiplePlp_updated.R +++ b/tests/testthat/test-multiplePlp.R @@ -17,43 +17,29 @@ library("testthat") context("MultiplePlp") -connectionDetails <- Eunomia::getEunomiaConnectionDetails() -Eunomia::createCohorts(connectionDetails) - -databaseDetails <- createDatabaseDetails( - connectionDetails = connectionDetails, - cdmDatabaseSchema = "main", - cdmDatabaseName = "main", - cohortDatabaseSchema = "main", - cohortTable = "cohort", - outcomeDatabaseSchema = "main", - outcomeTable = "cohort", - cohortId = 1, - outcomeIds = 3, #make this ids - cdmVersion = 5) - - analysis1 <- createModelDesign( targetId = 1, - outcomeId = 3, + outcomeId = outcomeId, restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 0), populationSettings = createStudyPopulationSettings(), - covariateSettings = FeatureExtraction::createDefaultCovariateSettings(), + covariateSettings = covariateSettings, featureEngineeringSettings = NULL, sampleSettings = NULL, + splitSettings = 
createDefaultSplitSetting(splitSeed = 1), preprocessSettings = createPreprocessSettings(), modelSettings = setLassoLogisticRegression(seed = 12) ) test_that("createModelDesign - test working", { - expect_equal(analysis1$targetId, 1) - expect_equal(analysis1$outcomeId, 3) + expect_equal(analysis1$outcomeId, outcomeId) expect_equal(analysis1$restrictPlpDataSettings, createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 0)) - expect_equal(analysis1$covariateSettings, FeatureExtraction::createDefaultCovariateSettings()) - expect_equal(analysis1$featureEngineeringSettings, createFeatureEngineeringSettings()) + expect_equal(analysis1$covariateSettings, covariateSettings) + expect_equal(analysis1$featureEngineeringSettings, list(createFeatureEngineeringSettings(type= "none"))) + expect_equal(analysis1$sampleSettings, list(createSampleSettings(type = 'none'))) expect_equal(analysis1$preprocessSettings, createPreprocessSettings()) + expect_equal(analysis1$splitSettings, createDefaultSplitSetting(splitSeed = 1)) expect_equal(analysis1$modelSettings, setLassoLogisticRegression(seed = 12)) expect_equal( analysis1$executeSettings, @@ -88,133 +74,47 @@ test_that("loading analyses settings", { expect_equal(analysis1$targetId, analysisSetting$analyses[[1]]$targetId) expect_equal(analysis1$outcomeId, analysisSetting$analyses[[1]]$outcomeId) expect_equal(analysis1$restrictPlpDataSettings, analysisSetting$analyses[[1]]$restrictPlpDataSettings) - expect_equal(attr(analysis1$covariateSettings, 'fun'), attr(analysisSetting$analyses[[1]]$covariateSettings[[1]],'fun') ) + expect_equal(attr(analysis1$covariateSettings, 'fun'), attr(analysisSetting$analyses[[1]]$covariateSettings,'fun') ) expect_equal(analysis1$populationSettings, analysisSetting$analyses[[1]]$populationSettings) - expect_equal(analysis1$sampleSettings, analysisSetting$analyses[[1]]$sampleSettings[[1]]) - expect_equal(attr(analysis1$featureEngineeringSettings,'class'), 
attr(analysisSetting$analyses[[1]]$featureEngineeringSettings[[1]],'class')) - expect_equal(attr(analysis1$featureEngineeringSettings,'fun'), attr(analysisSetting$analyses[[1]]$featureEngineeringSettings[[1]],'fun')) + expect_equal(analysis1$sampleSettings, analysisSetting$analyses[[1]]$sampleSettings) + expect_equal(attr(analysis1$featureEngineeringSettings,'class'), attr(analysisSetting$analyses[[1]]$featureEngineeringSettings,'class')) + expect_equal(attr(analysis1$featureEngineeringSettings,'fun'), attr(analysisSetting$analyses[[1]]$featureEngineeringSettings,'fun')) expect_equal(analysis1$preprocessSettings, analysisSetting$analyses[[1]]$preprocessSettings) expect_equal(analysis1$modelSettings, analysisSetting$analyses[[1]]$modelSettings) + expect_equal(analysis1$splitSettings, analysisSetting$analyses[[1]]$splitSettings) expect_equal(analysis1$executeSettings, analysisSetting$analyses[[1]]$executeSettings) - } ) -analysis2 <- createModelDesign( - targetId = 10, - outcomeId = 2, - restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 9999), - populationSettings = createStudyPopulationSettings(), - covariateSettings = FeatureExtraction::createCovariateSettings(useDemographicsAge = T), - featureEngineeringSettings = NULL, - sampleSettings = NULL, - preprocessSettings = createPreprocessSettings(), - modelSettings = setLassoLogisticRegression(seed = 12) -) - -test_that("getSettingValues works", { - - # works for single setting: - result <- getSettingValues( - modelDesignList = list(analysis1), - type = 'targetId' - ) - - expect_equal(nrow(result), 1) - expect_equal(result$value, 1) - - - # works for multiple setting: - result <- getSettingValues( - modelDesignList = list(analysis1, analysis2), - type = 'targetId' - ) - - expect_equal(nrow(result), 2) - expect_equal(result$value, c(1,10)) - - -} - ) - - -test_that("getSettingValues works", { - - result <- getidList(modelDesignList = list(analysis1)) - expect_is(result, 
'list') - result <- getidList(modelDesignList = list(analysis1, analysis2)) - expect_is(result, 'list') - expect_equal(nrow(result$targetId), 2) - expect_equal(nrow(result$covariateSettings), 2) -} -) - -test_that("getSettingFromId works", { - - result <- getidList(modelDesignList = list(analysis1, analysis2)) - - cov <- getSettingFromId(idList = result, type = 'covariateSettings', id = 1) - expect_equal(names(cov), names(FeatureExtraction::createDefaultCovariateSettings())) - - id <- getSettingFromId(idList = result, type = 'targetId', id = 10) - expect_equal(id, 10) - -}) - - -test_that("getSettingsTable", { - - result <- getidList(modelDesignList = list(analysis1, analysis2)) - - settingsTable <- getSettingsTable( - modelDesignList = list(analysis1, analysis2), - idList = result - ) - expect_is(settingsTable, 'data.frame') - -}) - -test_that("getDataSettings", { - - result <- getidList(modelDesignList = list(analysis1, analysis2)) - - settingsTable <- getSettingsTable( - modelDesignList = list(analysis1, analysis2), - idList = result - ) - dataSettings <- getDataSettings(settingsTable) - expect_is(dataSettings, 'list') - expect_equal(length(dataSettings), 2) - -}) - test_that("test run multiple", { analysis3 <- createModelDesign( targetId = 1, - outcomeId = 3, + outcomeId = outcomeId, restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 0), populationSettings = createStudyPopulationSettings(), - covariateSettings = FeatureExtraction::createDefaultCovariateSettings(), + covariateSettings = covariateSettings, featureEngineeringSettings = createFeatureEngineeringSettings(), sampleSettings = createSampleSettings(), preprocessSettings = createPreprocessSettings(), - modelSettings = setLassoLogisticRegression(seed = 12) + modelSettings = setLassoLogisticRegression(seed = 12), + splitSettings = createDefaultSplitSetting( + type = "stratified", + testFraction = 0.25, + trainFraction = 0.75, + splitSeed = 123, + nfold = 3 + ), + 
runCovariateSummary = FALSE ) runMultiplePlp( databaseDetails = databaseDetails, modelDesignList = list( + # add this twice to make sure no issue with overlapping ids? analysis3 ), onlyFetchData = F, - splitSettings = createDefaultSplitSetting( - type = "stratified", - testFraction = 0.25, - trainFraction = 0.75, - splitSeed = 123, - nfold = 3 - ), logSettings = createLogSettings( verbosity = "DEBUG", timeStamp = T, @@ -223,6 +123,7 @@ test_that("test run multiple", { saveDirectory = file.path(saveLoc, 'multiple') ) + expect_true(file.exists(file.path(saveLoc, 'multiple', 'settings.csv'))) expect_true(dir.exists(file.path(saveLoc, 'multiple', 'Analysis_1'))) expect_true(file.exists(file.path(saveLoc, 'multiple', 'Analysis_1','plpResult', 'runPlp.rds'))) diff --git a/tests/testthat/test-paramchecks_updated.R b/tests/testthat/test-paramchecks.R similarity index 100% rename from tests/testthat/test-paramchecks_updated.R rename to tests/testthat/test-paramchecks.R diff --git a/tests/testthat/test-plotting_updated.R b/tests/testthat/test-plotting.R similarity index 94% rename from tests/testthat/test-plotting_updated.R rename to tests/testthat/test-plotting.R index a3989c9b6..17039166e 100644 --- a/tests/testthat/test-plotting_updated.R +++ b/tests/testthat/test-plotting.R @@ -38,7 +38,7 @@ test_that("plots", { test <- plotF1Measure(plpResult, typeColumn = 'evaluation') testthat::expect_s3_class(test, 'arrangelist') - if(!is.null(plpResult$performanceEvaluation$demographicSummary)){ + if (!is.null(plpResult$performanceEvaluation$demographicSummary)) { test <- plotDemographicSummary(plpResult, typeColumn = 'evaluation') testthat::expect_s3_class(test, 'arrangelist') } @@ -52,7 +52,7 @@ test_that("plots", { test <- plotVariableScatterplot(plpResult$covariateSummary) testthat::expect_s3_class(test, 'ggplot') - test <- plotGeneralizability(plpResult$covariateSummary, fileName=NULL) + test <- plotGeneralizability(plpResult$covariateSummary, fileName = NULL) 
testthat::expect_s3_class(test, 'grob') }) @@ -61,7 +61,7 @@ test_that("plots", { test_that("outcomeSurvivalPlot", { # test the plot works - test <- outcomeSurvivalPlot(plpData = plpData, outcomeId = 2) + test <- outcomeSurvivalPlot(plpData = plpData, outcomeId = outcomeId) testthat::expect_s3_class(test, 'ggsurvplot') testthat::expect_error(outcomeSurvivalPlot()) @@ -82,7 +82,7 @@ test_that("plotPlp", { testthat::expect_equal(dir.exists(file.path(saveLoc,'plots')), T) # expect plots to be there - expect_true(length(dir(file.path(saveLoc,'plots')))>0) + expect_true(length(dir(file.path(saveLoc,'plots'))) > 0) }) @@ -134,10 +134,4 @@ test_that("plotSmoothCalibration", { ) ) -}) - - - - - - +}) \ No newline at end of file diff --git a/tests/testthat/test-population_updated.R b/tests/testthat/test-population.R similarity index 90% rename from tests/testthat/test-population_updated.R rename to tests/testthat/test-population.R index 124b2d17e..6add83fee 100644 --- a/tests/testthat/test-population_updated.R +++ b/tests/testthat/test-population.R @@ -293,7 +293,7 @@ test_that("population creation parameters", { studyPopulation <- createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings() ) @@ -306,7 +306,7 @@ test_that("population creation parameters", { #firstExposureOnly test (should have no effect on simulated data) studyPopulation <- createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings(firstExposureOnly = T) ) @@ -317,7 +317,7 @@ test_that("population creation parameters", { #requireTimeAtRisk studyPopulation <- createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings(requireTimeAtRisk = T) ) @@ -330,7 +330,7 @@ test_that("population creation parameters", { expect_warning( createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = 
defaultSettings(requireTimeAtRisk = T, minTimeAtRisk = 99999) ) ) @@ -338,7 +338,7 @@ test_that("population creation parameters", { #washoutPeriod = 365, studyPopulation <- createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings(washoutPeriod = 365) ) nrOutcomes4 <- sum(studyPopulation$outcomeCount) @@ -349,7 +349,7 @@ test_that("population creation parameters", { expect_error( createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings(washoutPeriod = -1) ) ) @@ -358,7 +358,7 @@ test_that("population creation parameters", { expect_error( createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings( priorOutcomeLookback = -1, removeSubjectsWithPriorOutcome = T @@ -370,7 +370,7 @@ test_that("population creation parameters", { expect_error( createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings( minTimeAtRisk = -1, requireTimeAtRisk = T @@ -382,7 +382,7 @@ test_that("population creation parameters", { expect_error( createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings( startAnchor = 'cohort stard' ) @@ -394,7 +394,7 @@ test_that("population creation parameters", { expect_error( createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings( endAnchor = 'cohort ent' ) @@ -411,7 +411,7 @@ test_that("population creation parameters", { ) cohorts <- data.frame(rowId=1:20, subjectId=1:20, - cohortId=rep(2,20), + targetId=rep(2,20), time=rep(365,20), ageYear = rep(18,20), gender = rep(8507,20), @@ -495,4 +495,31 @@ test_that("population creation parameters", { }) +testthat::test_that("Providing an existing population and skipping population creation works", { + popSize <- 400 + newPopulation <- 
population[sample.int(nrow.default(population), popSize), ] + + tinyPlpData$population <- newPopulation + + plpResults <- runPlp( + plpData = tinyPlpData, + outcomeId = 2, + analysisId = "1", + analysisName = "existing population", + populationSettings = createStudyPopulationSettings(), + splitSettings = createDefaultSplitSetting(), + modelSettings = setLassoLogisticRegression(), + executeSettings = createExecuteSettings( + runSplitData = TRUE, + runPreprocessData = FALSE, + runModelDevelopment = TRUE + ) + ) + + trainPredictions <- plpResults$prediction %>% + dplyr::filter(.data$evaluationType == "Train") %>% nrow.default() + testPredictions <- plpResults$prediction %>% + dplyr::filter(.data$evaluationType == "Test") %>% nrow.default() + expect_equal(popSize, trainPredictions + testPredictions) +}) diff --git a/tests/testthat/test-prediction_updated.R b/tests/testthat/test-prediction.R similarity index 94% rename from tests/testthat/test-prediction_updated.R rename to tests/testthat/test-prediction.R index dbf5f65d6..85aecf052 100644 --- a/tests/testthat/test-prediction_updated.R +++ b/tests/testthat/test-prediction.R @@ -57,7 +57,7 @@ test_that("prediction works", { ) # check metaData - expect_equal(length(names(attr(pred, "metaData"))), 4) # 6 if survivial + expect_equal(length(names(attr(pred, "metaData"))), 6) # 8 if survivial # add single person pred and compare with manual cal @@ -70,7 +70,7 @@ test_that("prediction works", { test_that("applyTidyCovariateData", { - covariateIds <- plpData$covariateData$covariateRef %>% dplyr::select(.data$covariateId) %>% dplyr::pull() + covariateIds <- plpData$covariateData$covariateRef %>% dplyr::select("covariateId") %>% dplyr::pull() remove <- sample(covariateIds, 10) deletedRedundantCovariateIds = remove[1:5] deletedInfrequentCovariateIds = remove[6:10] @@ -95,7 +95,7 @@ test_that("applyTidyCovariateData", { # some covariates removed expect_true(newCovariateData$covariates %>% dplyr::tally() %>% dplyr::pull() < 
covariateCount) - newCovs <- newCovariateData$covariateRef %>% dplyr::select(.data$covariateId) %>% dplyr::pull() + newCovs <- newCovariateData$covariateRef %>% dplyr::select("covariateId") %>% dplyr::pull() expect_equal(sum(covariateIds[!covariateIds %in% newCovs] %in% remove),10) diff --git a/tests/testthat/test-preprocessingData_updated.R b/tests/testthat/test-preprocessingData.R similarity index 86% rename from tests/testthat/test-preprocessingData_updated.R rename to tests/testthat/test-preprocessingData.R index ca8259ecc..9b3755b72 100644 --- a/tests/testthat/test-preprocessingData_updated.R +++ b/tests/testthat/test-preprocessingData.R @@ -58,11 +58,10 @@ test_that("createPreprocessSettings", { }) test_that("createPreprocessSettings", { - trainData <- createTrainData(plpData, population) - + attr(trainData$covariateData, "metaData")$preprocessSettings <- NULL # removing for test metaData <- attr(trainData$covariateData, "metaData") - covSize <- trainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() metaLength <- length(metaData) + covSize <- trainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() oldFeatureCount <- trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull() preprocessSettings <- createDefaultSettings( @@ -73,11 +72,10 @@ test_that("createPreprocessSettings", { newData <- preprocessData(trainData$covariateData, preprocessSettings) expect_is(newData, 'CovariateData') - expect_equal(length(attr(newData, "metaData")), 1+length(metaData)) expect_true(newData$covariates %>% dplyr::tally() %>% dplyr::pull() < covSize) - # metaData should have tidyCovariateDataSettings (so 1 bigger) - expect_equal(length(attr(newData, "metaData")), metaLength+1) + # metaData should have tidyCovariateDataSettings + preprocessSettings (so 2 bigger) + expect_equal(length(attr(newData, "metaData")), metaLength+2) expect_true(length(attr(newData, "metaData")$tidyCovariateDataSettings$deletedInfrequentCovariateIds)>=0) 
expect_equal(attr(newData, "metaData")$tidyCovariateDataSettings$deletedRedundantCovariateIds, NULL) @@ -87,7 +85,8 @@ test_that("createPreprocessSettings", { expect_equal(newFeatureCount, oldFeatureCount) - trainData <- createTrainData(plpData, population) + oldFeatureCount <- trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull() + metaData <- attr(trainData$covariateData, "metaData") preprocessSettings <- createDefaultSettings( minCovariateFraction = 0, @@ -102,6 +101,13 @@ test_that("createPreprocessSettings", { newFeatureCount <- newData$covariateRef %>% dplyr::tally() %>% dplyr::pull() + length(attr(newData, "metaData")$tidyCovariateDataSettings$deletedRedundantCovariateIds) - expect_equal(newFeatureCount, oldFeatureCount) + expect_equal(newFeatureCount, oldFeatureCount) # sometimes differ? + + # check settings are saved + expect_equal(attr(newData, "metaData")$preprocessSettings, preprocessSettings) }) + +test_that('Did tidy on test', { + expect_true(attr(plpResult$prediction, 'metaData')$tidyCovariates) +}) diff --git a/tests/testthat/test-rclassifier_updated.R b/tests/testthat/test-rclassifier.R similarity index 93% rename from tests/testthat/test-rclassifier_updated.R rename to tests/testthat/test-rclassifier.R index 0781a8301..5a0dadc31 100644 --- a/tests/testthat/test-rclassifier_updated.R +++ b/tests/testthat/test-rclassifier.R @@ -41,7 +41,7 @@ test_that("GBM settings work", { expect_equal(gbmSet$fitFunction, 'fitRclassifier') expect_is(gbmSet$param, 'list') - expect_equal(attr(gbmSet$param, 'settings')$modeType, 'Xgboost') + expect_equal(attr(gbmSet$param, 'settings')$modelType, 'Xgboost') expect_equal(attr(gbmSet$param, 'settings')$seed, seed) expect_equal(attr(gbmSet$param, 'settings')$modelName, "Gradient Boosting Machine") @@ -88,13 +88,13 @@ testthat::expect_error(setGradientBoostingMachine(scalePosWeight = -1)) test_that("GBM working checks", { - trainData <- createTrainData(plpData = plpData, population = population) 
modelSettings <- setGradientBoostingMachine(ntrees = 10, maxDepth = 3, learnRate = 0.1) fitModel <- fitPlp( trainData = trainData, modelSettings = modelSettings, - analysisId = 'gbmTest' + analysisId = 'gbmTest', + analysisPath = tempdir() ) expect_equal(nrow(fitModel$prediction), nrow(trainData$labels)*2) @@ -108,8 +108,9 @@ test_that("GBM working checks", { expect_lte(nrow(fitModel$covariateImportance), trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()) - expect_equal(fitModel$trainDetails$outcomeId, 2) - expect_equal(fitModel$trainDetails$cohortId, 1) + expect_equal(fitModel$modelDesign$outcomeId, outcomeId) + expect_equal(fitModel$modelDesign$targetId, 1) + # TODO check other model design values? # test that at least some features have importances that are not zero expect_equal(sum(abs(fitModel$covariateImportance$covariateValue))>0, TRUE) diff --git a/tests/testthat/test-recalibration_updated.R b/tests/testthat/test-recalibration.R similarity index 76% rename from tests/testthat/test-recalibration_updated.R rename to tests/testthat/test-recalibration.R index ba992bd76..33e717197 100644 --- a/tests/testthat/test-recalibration_updated.R +++ b/tests/testthat/test-recalibration.R @@ -31,8 +31,8 @@ prediction <- data.frame( metaData <- list( modelType = "binary", - cohortId = 1, - outcomeId = 2, + targetId = 1, + outcomeId = outcomeId, timepoint = 365 ) @@ -62,17 +62,28 @@ testthat::expect_true(sum(test$evaluationType == 'weakRecalibration') == 100) test_that("recalibratePlpRefit", { + + newPop <- plpResult$prediction %>% dplyr::select(-"value") %>% dplyr::filter(.data$evaluationType %in% c('Test','Train')) + attr(newPop, 'metaData') <- list( + targetId = 1, + outcomeId = outcomeId, + restrictPlpDataSettings = PatientLevelPrediction::createRestrictPlpDataSettings(), + populationSettings = PatientLevelPrediction::createStudyPopulationSettings() + ) + testRecal <- recalibratePlpRefit( plpModel = plpResult$model, - newPopulation = 
plpResult$prediction %>% dplyr::select(-.data$value) %>% dplyr::filter(.data$evaluationType %in% c('Test','Train')), + newPopulation = newPop, newData = plpData ) - testthat::expect_true( - sum(testRecal$evaluationType == 'recalibrationRefit')>0 - ) - - testthat::expect_s3_class(testRecal, 'data.frame') + if(!is.null(testRecal)){ + testthat::expect_true( + sum(testRecal$evaluationType == 'recalibrationRefit')>0 + ) + + testthat::expect_s3_class(testRecal, 'data.frame') + } # add more test... }) @@ -83,8 +94,8 @@ test_that("survival", { # survival metaData <- list( modelType = "survival", - cohortId = 1, - outcomeId = 2, + targetId = 1, + outcomeId = outcomeId, timepoint = 365 ) diff --git a/tests/testthat/test-runPlpHelpers_updated.R b/tests/testthat/test-runPlpHelpers.R similarity index 97% rename from tests/testthat/test-runPlpHelpers_updated.R rename to tests/testthat/test-runPlpHelpers.R index 38e36b423..4d361410d 100644 --- a/tests/testthat/test-runPlpHelpers_updated.R +++ b/tests/testthat/test-runPlpHelpers.R @@ -21,8 +21,8 @@ test_that("check printHeader runs", { header <- printHeader( plpData = plpData, - cohortId = 1, - outcomeId = 2, + targetId = 1, + outcomeId = outcomeId, analysisId = 123, analysisName = 'test', ExecutionDateTime = Sys.time() @@ -36,7 +36,7 @@ test_that("checkInputs", { check <- checkInputs( list( plpData = plpData, - outcomeId = 2, + outcomeId = outcomeId, populationSettings = populationSettings ) ) @@ -49,7 +49,7 @@ test_that("checkInputs", { checkInputs( list( plpData = NULL, - outcomeId = 2, + outcomeId = outcomeId, populationSettings = populationSettings ) ) @@ -71,7 +71,7 @@ test_that("checkInputs", { checkInputs( list( plpData = plpData, - outcomeId = 2, + outcomeId = outcomeId, populationSettings = 'populationSettings' ) ) diff --git a/tests/testthat/test-sampling_updated.R b/tests/testthat/test-sampling.R similarity index 63% rename from tests/testthat/test-sampling_updated.R rename to tests/testthat/test-sampling.R index 
b6603a633..967dd6dc6 100644 --- a/tests/testthat/test-sampling_updated.R +++ b/tests/testthat/test-sampling.R @@ -62,8 +62,12 @@ test_that("createSampleSettings works", { testNumberOutcomestoNonOutcomes ) + # the seed is ignored if sameData + if(testType == 'none'){ + testSampleSeed <- 1 + } expect_equal( - sampleSettings$sampleSeed , + sampleSettings$sampleSeed, testSampleSeed ) @@ -95,37 +99,38 @@ test_that("createSampleSettings expected errors", { test_that("sampleData outputs are correct", { - trainData <- createTrainData(plpData, population) + newTrainData <- trainData + attr(newTrainData, "metaData")$sampleSettings <- NULL # remove for test sampleSettings <- sampleSettingFunc(type = 'none') - sampleData <- sampleData(trainData, sampleSettings) + sampleData <- sampleData(newTrainData, sampleSettings) # make sure metaData captures expect_equal( length(attr(sampleData, "metaData")), - length(attr(trainData, "metaData"))+1 + length(attr(newTrainData, "metaData"))+1 ) expect_equal( - attr(sampleData, "metaData")$sampleSettings, + attr(sampleData, "metaData")$sampleSettings[[1]], sampleSettings ) # check the data is the same: expect_equal( nrow(sampleData$labels), - nrow(trainData$labels) + nrow(newTrainData$labels) ) expect_equal( nrow(sampleData$folds), - nrow(trainData$folds) + nrow(newTrainData$folds) ) expect_equal( sampleData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull(), - trainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() + newTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() ) @@ -136,7 +141,7 @@ test_that("sampleData outputs are correct", { test_that("underSampleData works", { - trainData <- createTrainData(plpData, population) + newTrainData <- trainData sampleSettings <- list( sampleSeed = 1, @@ -145,13 +150,18 @@ test_that("underSampleData works", { underSampleData <- underSampleData(trainData, sampleSettings) + expect_true(inherits(underSampleData, 'plpData')) # add test based on github issue 
+ # the sampled data should be smaller... - expect_true(nrow(underSampleData$labels) <= nrow(trainData$labels)) + expect_true(nrow(underSampleData$labels) <= nrow(newTrainData$labels)) - expect_true(nrow(underSampleData$folds) <= nrow(trainData$folds)) + expect_true(nrow(underSampleData$folds) <= nrow(newTrainData$folds)) expect_true( - underSampleData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() <= trainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() + underSampleData$covariateData$covariates %>% + dplyr::tally() %>% + dplyr::pull() <= newTrainData$covariateData$covariates %>% + dplyr::tally() %>% dplyr::pull() ) # perhaps add manual data test @@ -159,3 +169,32 @@ test_that("underSampleData works", { }) + +test_that("overSampleData works", { + + newTrainData <- trainData + + sampleSettings <- list( + sampleSeed = 1, + numberOutcomestoNonOutcomes = 0.5 + ) + + overSampleData <- overSampleData(newTrainData, sampleSettings) + + expect_true(inherits(overSampleData, 'plpData')) # add test based on github issue + + # the sampled data should be larger 
+ expect_true(nrow(overSampleData$labels) >= nrow(newTrainData$labels)) + + expect_true(nrow(overSampleData$folds) >= nrow(newTrainData$folds)) + + expect_true( + overSampleData$covariateData$covariates %>% dplyr::tally() %>% + dplyr::pull() >= newTrainData$covariateData$covariates %>% + dplyr::tally() %>% dplyr::pull() + ) + + # perhaps add manual data test + + +}) diff --git a/tests/testthat/test-saveloadplp_updated.R b/tests/testthat/test-saveloadplp.R similarity index 75% rename from tests/testthat/test-saveloadplp_updated.R rename to tests/testthat/test-saveloadplp.R index 5f4fdc794..ed71009ce 100644 --- a/tests/testthat/test-saveloadplp_updated.R +++ b/tests/testthat/test-saveloadplp.R @@ -41,12 +41,12 @@ test_that("loadPlpDataError", { }) test_that("loadPlpData", { - plpData <- loadPlpData(file = file.path(saveLoc,"saveDataTest")) - expect_identical(plpData$cohorts, oldCohorts) - expect_identical(plpData$outcomes, oldOutcomes) - expect_equal(as.data.frame(plpData$covariateData$covariates), + loadedData <- loadPlpData(file = file.path(saveLoc,"saveDataTest")) + expect_identical(loadedData$cohorts, oldCohorts) + expect_identical(loadedData$outcomes, oldOutcomes) + expect_equal(as.data.frame(loadedData$covariateData$covariates), oldCovariates) - expect_equal(as.data.frame(plpData$covariateData$covariateRef), + expect_equal(as.data.frame(loadedData$covariateData$covariateRef), oldCovariateRef) }) @@ -186,3 +186,53 @@ test_that("savePlpShareable works", { ) }) + +# Note: saving from database to csv is in the database upload test file + + + +test_that("applyMinCellCount works", { + + result <- data.frame( + performance_id = 1:2, + covariate_id = 1:2, + covariate_name = paste0('name', 1:2), + concept_id = 1:2, + covariate_value = runif(2), + covariate_count = c(100,50), + covariate_mean = runif(2), + covariate_st_dev = runif(2), + with_no_outcome_covariate_count = c(10,5), + with_no_outcome_covariate_mean = runif(2), + with_no_outcome_covariate_st_dev = runif(2), + 
with_outcome_covariate_count = c(90,45), + with_outcome_covariate_mean = runif(2), + with_outcome_covariate_st_dev = runif(2), + standardized_mean_diff = runif(2) + ) + + minCellResult <- applyMinCellCount( + tableName = "covariate_summary", + sensitiveColumns = getPlpSensitiveColumns(), + result = result, + minCellCount = 5 + ) + # check nothing removed + testthat::expect_equal(2,sum(minCellResult$covariate_count != -1)) + testthat::expect_equal(2,sum(minCellResult$with_no_outcome_covariate_count != -1)) + testthat::expect_equal(2,sum(minCellResult$with_outcome_covariate_count != -1)) + + # now check values are removed + minCellResult <- applyMinCellCount( + tableName = "covariate_summary", + sensitiveColumns = getPlpSensitiveColumns(), + result = result, + minCellCount = 10 + ) + testthat::expect_equal(0,sum(minCellResult$covariate_count == -1)) + testthat::expect_equal(minCellResult$with_no_outcome_covariate_count[2],-1) + testthat::expect_equal(1,sum(minCellResult$with_no_outcome_covariate_count == -1)) + testthat::expect_equal(1,sum(minCellResult$with_outcome_covariate_count == -1)) + + +}) diff --git a/tests/testthat/test-sklearnClassifier.R b/tests/testthat/test-sklearnClassifier.R new file mode 100644 index 000000000..c87a1937b --- /dev/null +++ b/tests/testthat/test-sklearnClassifier.R @@ -0,0 +1,205 @@ + + +test_that("DecisionTree settings work checks", { + +dtset <- setDecisionTree( + criterion = list('gini'), + splitter = list('best'), + maxDepth = list(4, 10, NULL), + minSamplesSplit = list(2, 10), + minSamplesLeaf = list(10, 50), + minWeightFractionLeaf = list(0), + maxFeatures = list(100,'sqrt', NULL), + maxLeafNodes = list(NULL), + minImpurityDecrease = list(10^-7), + classWeight = list(NULL), + seed = sample(1000000,1) +) + +expect_equal(dtset$fitFunction, "fitSklearn") + +expect_equal(length(dtset$param), 3*2*2*3*1) + +expect_equal(unique(unlist(lapply(dtset$param, function(x) x[[1]]))), 'gini') +expect_equal(unique(unlist(lapply(dtset$param, 
function(x) x[[2]]))), 'best') +expect_equal(length(unique(lapply(dtset$param, function(x) x[[3]]))), 3) + +expect_false(attr(dtset$param, 'settings')$requiresDenseMatrix) +expect_equal(attr(dtset$param, 'settings')$name, 'Decision Tree') +expect_equal(attr(dtset$param, 'settings')$pythonModule, 'sklearn.tree') +expect_equal(attr(dtset$param, 'settings')$pythonClass, "DecisionTreeClassifier") + + +}) + + +test_that("DecisionTree errors as expected", { + + expect_error(setDecisionTree(criterion = list('madeup'))) + + expect_error(setDecisionTree(maxDepth = list(-1))) + expect_error(setDecisionTree(minSamplesSplit = list(-1))) + expect_error(setDecisionTree(minSamplesLeaf = list(-1))) + +}) + + +test_that("check fit of DecisionTree", { + + modelSettings <- setDecisionTree( + criterion = list('gini'), + splitter = list('best'), + maxDepth = list(as.integer(4)), + minSamplesSplit = list(2), + minSamplesLeaf = list(10), + minWeightFractionLeaf = list(0), + maxFeatures = list('sqrt'), + maxLeafNodes = list(NULL), + minImpurityDecrease = list(10^-7), + classWeight = list(NULL), + seed = sample(1000000,1) + ) + + plpModel <- fitPlp( + trainData = tinyTrainData, + modelSettings = modelSettings, + analysisId = 'DecisionTree', + analysisPath = tempdir() + ) + + expect_correct_fitPlp(plpModel, trainData) + # add check for other model design settings + +}) + +test_that('fitSklearn errors with wrong covariateData', { + + newTrainData <- copyTrainData(trainData) + class(newTrainData$covariateData) <- 'notCovariateData' + modelSettings <- setAdaBoost() + analysisId <- 42 + + expect_error(fitSklearn(newTrainData, + modelSettings, + search='grid', + analysisId)) +}) + + +test_that('AdaBoost fit works', { + + modelSettings <- setAdaBoost(nEstimators = list(10), + learningRate = list(0.1), + ) + + plpModel <- fitPlp( + trainData = tinyTrainData, + modelSettings = modelSettings, + analysisId = 'Adaboost', + analysisPath = tempdir() + ) + + expect_correct_fitPlp(plpModel, trainData) + 
expect_equal(dir(plpModel$model),"model.json") + +}) + + +test_that('RandomForest fit works', { + + modelSettings <- setRandomForest(ntrees=list(10), + maxDepth=list(4), + minSamplesSplit = list(2), + minSamplesLeaf = list(10), + mtries = list("sqrt"), + maxSamples = list(0.9), + classWeight = list(NULL)) + + plpModel <- fitPlp( + trainData = tinyTrainData, + modelSettings = modelSettings, + analysisId = 'RandomForest', + analysisPath = tempdir() + ) + + expect_correct_fitPlp(plpModel, trainData) + expect_equal(dir(plpModel$model),"model.json") + +}) + + +test_that('MLP fit works', { + modelSettings <- setMLP( + hiddenLayerSizes = list(c(20)), + alpha = list(1e-6), + maxIter = list(50), + epsilon = list(1e-08), + learningRateInit = list(0.01), + tol = list(1e-2) # reduce tol so I don't get convergence warnings + ) + + plpModel <- fitPlp( + trainData = tinyTrainData, + modelSettings = modelSettings, + analysisId = 'MLP', + analysisPath = tempdir() + ) + + expect_correct_fitPlp(plpModel, trainData) + expect_equal(dir(plpModel$model),"model.json") + +}) + + +test_that('Naive bayes fit works', { + modelSettings <- setNaiveBayes() + + plpModel <- fitPlp( + trainData = tinyTrainData, + modelSettings = modelSettings, + analysisId = 'Naive bayes', + analysisPath = tempdir() + ) + + expect_correct_fitPlp(plpModel, trainData) + expect_equal(dir(plpModel$model),"model.json") + +}) + + +test_that('Support vector machine fit works', { + modelSettings <- setSVM(C = list(1), + degree = list(1), + gamma = list('scale'), + classWeight = list(NULL)) + + plpModel <- fitPlp( + trainData = tinyTrainData, + modelSettings = modelSettings, + analysisId = 'SVM', + analysisPath = tempdir() + ) + + expect_correct_fitPlp(plpModel, trainData) + expect_equal(dir(plpModel$model),"model.json") + +}) + +test_that('Sklearn predict works', { + + modelSettings <- setAdaBoost(nEstimators = list(10), + learningRate = list(0.1), + ) + + plpModel <- fitPlp( + trainData = tinyTrainData, + modelSettings = 
modelSettings, + analysisId = 'Adaboost', + analysisPath = tempdir() + ) + + predictions <- predictPythonSklearn(plpModel, + testData, + population) + expect_correct_predictions(predictions, testData) +}) diff --git a/tests/testthat/test-sklearnClassifierHelpers_updated.R b/tests/testthat/test-sklearnClassifierHelpers.R similarity index 100% rename from tests/testthat/test-sklearnClassifierHelpers_updated.R rename to tests/testthat/test-sklearnClassifierHelpers.R diff --git a/tests/testthat/test-sklearnClassifierSettings.R b/tests/testthat/test-sklearnClassifierSettings.R index 9b380f595..29c3f7f3d 100644 --- a/tests/testthat/test-sklearnClassifierSettings.R +++ b/tests/testthat/test-sklearnClassifierSettings.R @@ -11,22 +11,20 @@ test_that("setAdaBoost settings work checks", { expect_equal(length(adset$param), 3*3*1) - expect_equal(unique(unlist(lapply(adset$param, function(x) x[[1]]))), NULL) - expect_equal(unique(unlist(lapply(adset$param, function(x) x[[2]]))), c(10,50, 200)) - expect_equal(unique(unlist(lapply(adset$param, function(x) x[[3]]))), c(1, 0.5, 0.1)) - expect_equal(unique(lapply(adset$param, function(x) x[[4]])), list('SAMME.R')) + expect_equal(unique(unlist(lapply(adset$param, function(x) x[[1]]))), c(10,50, 200)) + expect_equal(unique(unlist(lapply(adset$param, function(x) x[[2]]))), c(1, 0.5, 0.1)) + expect_equal(unique(lapply(adset$param, function(x) x[[3]])), list('SAMME.R')) expect_false(attr(adset$param, 'settings')$requiresDenseMatrix) expect_equal(attr(adset$param, 'settings')$name, 'AdaBoost') - expect_equal(attr(adset$param, 'settings')$pythonImport, 'sklearn') - expect_equal(attr(adset$param, 'settings')$pythonImportSecond, 'ensemble') - expect_equal(attr(adset$param, 'settings')$pythonClassifier, "AdaBoostClassifier") + expect_equal(attr(adset$param, 'settings')$pythonModule, 'sklearn.ensemble') + expect_equal(attr(adset$param, 'settings')$pythonClass, "AdaBoostClassifier") inputs <- AdaBoostClassifierInputs(list, adset$param[[1]]) 
expect_equal( names(inputs), - c("base_estimator","n_estimators","learning_rate","algorithm","random_state" ) + c("n_estimators","learning_rate","algorithm","random_state" ) ) }) @@ -42,10 +40,6 @@ test_that("setAdaBoost errors as expected", { }) - - - - test_that("setMLP settings work checks", { mlpset <- setMLP( @@ -83,9 +77,8 @@ test_that("setMLP settings work checks", { expect_false(attr(mlpset$param, 'settings')$requiresDenseMatrix) expect_equal(attr(mlpset$param, 'settings')$name, 'Neural Network') - expect_equal(attr(mlpset$param, 'settings')$pythonImport, 'sklearn') - expect_equal(attr(mlpset$param, 'settings')$pythonImportSecond, 'neural_network') - expect_equal(attr(mlpset$param, 'settings')$pythonClassifier, "MLPClassifier") + expect_equal(attr(mlpset$param, 'settings')$pythonModule, 'sklearn.neural_network') + expect_equal(attr(mlpset$param, 'settings')$pythonClass, "MLPClassifier") inputs <- MLPClassifierInputs(list, mlpset$param[[1]]) expect_equal( @@ -99,12 +92,6 @@ test_that("setMLP settings work checks", { }) - - - - - - test_that("setNaiveBayes settings work checks", { nbset <- setNaiveBayes( @@ -116,9 +103,8 @@ test_that("setNaiveBayes settings work checks", { expect_true(attr(nbset$param, 'settings')$requiresDenseMatrix) expect_equal(attr(nbset$param, 'settings')$name, 'Naive Bayes') - expect_equal(attr(nbset$param, 'settings')$pythonImport, 'sklearn') - expect_equal(attr(nbset$param, 'settings')$pythonImportSecond, 'naive_bayes') - expect_equal(attr(nbset$param, 'settings')$pythonClassifier, "GaussianNB") + expect_equal(attr(nbset$param, 'settings')$pythonModule, 'sklearn.naive_bayes') + expect_equal(attr(nbset$param, 'settings')$pythonClass, "GaussianNB") inputs <- GaussianNBInputs(list, nbset$param[[1]]) expect_equal(names(inputs),NULL) @@ -126,11 +112,6 @@ test_that("setNaiveBayes settings work checks", { }) - - - - - test_that("setRandomForest settings work checks", { rfset <- setRandomForest( @@ -140,29 +121,28 @@ 
test_that("setRandomForest settings work checks", { minSamplesSplit = list(2,5), minSamplesLeaf = list(1,10), minWeightFractionLeaf = list(0), - mtries = list('auto', 'log2'), + mtries = list('sqrt', 'log2'), maxLeafNodes = list(NULL), minImpurityDecrease = list(0), bootstrap = list(TRUE), maxSamples = list(NULL, 0.9), oobScore = list(FALSE), nJobs = list(NULL), - classWeight = list('balanced_subsample', NULL), + classWeight = list(NULL), seed = sample(100000,1) ) expect_equal(rfset$fitFunction, "fitSklearn") - expect_equal(length(rfset$param), 2*3*2*2*2*2*2) + expect_equal(length(rfset$param), 2*3*2*2*2*2*1) expect_equal(unique(lapply(rfset$param, function(x) x[[1]])), list(100,500)) expect_equal(unique(unlist(lapply(rfset$param, function(x) x[[3]]))), c(4,10,17)) expect_false(attr(rfset$param, 'settings')$requiresDenseMatrix) expect_equal(attr(rfset$param, 'settings')$name, 'Random forest') - expect_equal(attr(rfset$param, 'settings')$pythonImport, 'sklearn') - expect_equal(attr(rfset$param, 'settings')$pythonImportSecond, 'ensemble') - expect_equal(attr(rfset$param, 'settings')$pythonClassifier, "RandomForestClassifier") + expect_equal(attr(rfset$param, 'settings')$pythonModule, 'sklearn.ensemble') + expect_equal(attr(rfset$param, 'settings')$pythonClass, "RandomForestClassifier") inputs <- RandomForestClassifierInputs(list, rfset$param[[1]]) expect_equal( @@ -175,8 +155,6 @@ test_that("setRandomForest settings work checks", { }) - - test_that("setSVM settings work checks", { svmset <- setSVM ( @@ -187,23 +165,22 @@ test_that("setSVM settings work checks", { coef0 = list(0.0), shrinking = list(TRUE), tol = list(0.001), - classWeight = list('balanced', NULL), + classWeight = list(NULL), cacheSize = 500, seed = sample(100000,1) ) expect_equal(svmset$fitFunction, "fitSklearn") - expect_equal(length(svmset$param), 4*3*6*2) + expect_equal(length(svmset$param), 4*3*6*1) expect_equal(unique(lapply(svmset$param, function(x) x[[4]])), list('scale', 1e-04, 3e-05, 0.001, 
0.01, 0.25)) expect_equal(unique(unlist(lapply(svmset$param, function(x) x[[1]]))), c(1,0.9,2,0.1)) expect_false(attr(svmset$param, 'settings')$requiresDenseMatrix) expect_equal(attr(svmset$param, 'settings')$name, 'Support Vector Machine') - expect_equal(attr(svmset$param, 'settings')$pythonImport, 'sklearn') - expect_equal(attr(svmset$param, 'settings')$pythonImportSecond, 'svm') - expect_equal(attr(svmset$param, 'settings')$pythonClassifier, "SVC") + expect_equal(attr(svmset$param, 'settings')$pythonModule, 'sklearn.svm') + expect_equal(attr(svmset$param, 'settings')$pythonClass, "SVC") inputs <- SVCInputs(list, svmset$param[[1]]) expect_equal( diff --git a/tests/testthat/test-sklearnClassifier_updated.R b/tests/testthat/test-sklearnClassifier_updated.R deleted file mode 100644 index 761a9850e..000000000 --- a/tests/testthat/test-sklearnClassifier_updated.R +++ /dev/null @@ -1,89 +0,0 @@ - - -test_that("DecisionTree settings work checks", { - -dtset <- setDecisionTree( - criterion = list('gini'), - splitter = list('best'), - maxDepth = list(4, 10, NULL), - minSamplesSplit = list(2, 10), - minSamplesLeaf = list(10, 50), - minWeightFractionLeaf = list(0), - maxFeatures = list(100,'auto', NULL), - maxLeafNodes = list(NULL), - minImpurityDecrease = list(10^-7), - classWeight = list(NULL, 'balanced'), - seed = sample(1000000,1) -) - -expect_equal(dtset$fitFunction, "fitSklearn") - -expect_equal(length(dtset$param), 3*2*2*3*2) - -expect_equal(unique(unlist(lapply(dtset$param, function(x) x[[1]]))), 'gini') -expect_equal(unique(unlist(lapply(dtset$param, function(x) x[[2]]))), 'best') -expect_equal(length(unique(lapply(dtset$param, function(x) x[[3]]))), 3) - -expect_false(attr(dtset$param, 'settings')$requiresDenseMatrix) -expect_equal(attr(dtset$param, 'settings')$name, 'Decision Tree') -expect_equal(attr(dtset$param, 'settings')$pythonImport, 'sklearn') -expect_equal(attr(dtset$param, 'settings')$pythonImportSecond, 'tree') -expect_equal(attr(dtset$param, 
'settings')$pythonClassifier, "DecisionTreeClassifier") - - -}) - - -test_that("DecisionTree errors as expected", { - - expect_error(setDecisionTree(criterion = list('madeup'))) - - expect_error(setDecisionTree(maxDepth = list(-1))) - expect_error(setDecisionTree(minSamplesSplit = list(-1))) - expect_error(setDecisionTree(minSamplesLeaf = list(-1))) - -}) - - -test_that("check fit of DecisionTree", { - - - modelSettings <- setDecisionTree( - criterion = list('gini'), - splitter = list('best'), - maxDepth = list(as.integer(4)), - minSamplesSplit = list(2), - minSamplesLeaf = list(10), - minWeightFractionLeaf = list(0), - maxFeatures = list(as.integer(100),'auto'), - maxLeafNodes = list(NULL), - minImpurityDecrease = list(10^-7), - classWeight = list(NULL, 'balanced'), - seed = sample(1000000,1) - ) - trainData <- createTrainData( - plpData = plpData, - population = population - ) - - plpModel <- fitPlp( - trainData = trainData, - modelSettings = modelSettings, - analysisId = 'DecisionTree' - ) - - expect_equal(nrow(trainData$labels)*2, nrow(plpModel$prediction)) - expect_equal(length(unique(plpModel$prediction$evaluationType)), 2) - - expect_true(nrow(plpModel$covariateImportance) < trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()) - - expect_true(dir.exists(plpModel$model)) - expect_equal(dir(plpModel$model),"model.json") - - expect_equal(plpModel$trainDetails$outcomeId,2) - expect_equal(plpModel$trainDetails$cohortId,1) - -}) - -# add tests for other classifiers - diff --git a/tests/testthat/test-sklearnJson.R b/tests/testthat/test-sklearnJson.R new file mode 100644 index 000000000..6d36b5f07 --- /dev/null +++ b/tests/testthat/test-sklearnJson.R @@ -0,0 +1,131 @@ + + +sklearn <- reticulate::import('sklearn', convert=FALSE) +np <- reticulate::import('numpy', convert=FALSE) + +data <- sklearn$datasets$make_classification(n_samples=500L, n_features=3L, + n_classes=2L, n_informative=3L, + n_redundant=0L, random_state=0L, + shuffle=FALSE) + 
+X_unseen <- sklearn$datasets$make_classification(n_samples=100L, n_features=3L, + n_classes=2L, n_informative=3L, + n_redundant=0L, random_state=42L, + shuffle=FALSE)[[0]] +X <- data[[0]] +y <- data[[1]] + +test_that("Decision tree to json is correct", { + classifier <- sklearn$tree$DecisionTreeClassifier(max_depth=3L) + + model <- classifier$fit(X,y) + predictions <- reticulate::py_to_r(model$predict_proba(X_unseen)) + path <- file.path(tempdir(),"model.json") + + sklearnToJson(model, path) + + loadedModel <- sklearnFromJson(path) + + loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen)) + + expect_true(all.equal(predictions, loadedPredictions)) +}) + +test_that("Random forest to json is correct", { + classifier <- sklearn$ensemble$RandomForestClassifier(n_estimators=10L) + + model <- classifier$fit(X,y) + predictions <- reticulate::py_to_r(model$predict_proba(X_unseen)) + path <- file.path(tempdir(),"model.json") + + sklearnToJson(model, path) + + loadedModel <- sklearnFromJson(path) + + loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen)) + + expect_true(all.equal(predictions, loadedPredictions)) +}) + +test_that("Adaboost to json is correct", { + classifier <- sklearn$ensemble$AdaBoostClassifier(n_estimators=10L) + + model <- classifier$fit(X,y) + predictions <- reticulate::py_to_r(model$predict_proba(X_unseen)) + path <- file.path(tempdir(), "model.json") + + sklearnToJson(model, path) + + loadedModel <- sklearnFromJson(path) + + loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen)) + + expect_true(all.equal(predictions, loadedPredictions)) +}) + +test_that("Naive Bayes to json is correct", { + classifier <- sklearn$naive_bayes$GaussianNB() + + model <- classifier$fit(X,y) + predictions <- reticulate::py_to_r(model$predict_proba(X_unseen)) + path <- file.path(tempdir(), "model.json") + + sklearnToJson(model, path) + + loadedModel <- sklearnFromJson(path) + + loadedPredictions <- 
reticulate::py_to_r(loadedModel$predict_proba(X_unseen)) + + expect_true(all.equal(predictions, loadedPredictions)) +}) + +test_that("MLP to json is correct", { + # lower tolerance to not get convergence warning + classifier <- sklearn$neural_network$MLPClassifier(tol=1e-2) + + model <- classifier$fit(X,y) + predictions <- reticulate::py_to_r(model$predict_proba(X_unseen)) + path <- file.path(tempdir(), "model.json") + + sklearnToJson(model, path) + + loadedModel <- sklearnFromJson(path) + + loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen)) + + expect_true(all.equal(predictions, loadedPredictions)) +}) + +test_that("SVM to json is correct", { + classifier <- sklearn$svm$SVC(probability=TRUE) + + # create sparse data because then some of the internal fields in the + # SVM will be sparse + feature_hasher <- sklearn$feature_extraction$FeatureHasher(n_features=3L) + random <- reticulate::import("random", convert=FALSE) + features <- list() + y_sparse <- np$empty(100L) + for (i in 1:100) { + row <- reticulate::dict(a=random$randint(0,2), + b=random$randint(3,5), + c=random$randint(6,8)) + features <- c(features, row) + reticulate::py_set_item(y_sparse, i - 1L, random$randint(0, 2)) + } + X_sparse <- feature_hasher$transform(features) + + model <- classifier$fit(X_sparse,y_sparse) + predictions <- reticulate::py_to_r(model$predict_proba(X_unseen)) + path <- file.path(tempdir(), "model.json") + + sklearnToJson(model, path) + + loadedModel <- sklearnFromJson(path) + + loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen)) + + expect_true(all.equal(predictions, loadedPredictions)) +}) + + + diff --git a/tests/testthat/test-validation_updated.R b/tests/testthat/test-validation.R similarity index 82% rename from tests/testthat/test-validation_updated.R rename to tests/testthat/test-validation.R index 74a49bd91..84ee78516 100644 --- a/tests/testthat/test-validation_updated.R +++ b/tests/testthat/test-validation.R @@ -17,23 
+17,7 @@ context("Validation") # Test unit for the creation of the study externalValidatePlp - -connectionDetails <- Eunomia::getEunomiaConnectionDetails() -Eunomia::createCohorts(connectionDetails) - -databaseDetails <- createDatabaseDetails( - connectionDetails = connectionDetails, - cdmDatabaseSchema = "main", - cdmDatabaseName = "main", - cohortDatabaseSchema = "main", - cohortTable = "cohort", - outcomeDatabaseSchema = "main", - outcomeTable = "cohort", - cohortId = 1, - outcomeIds = 3, #make this ids - cdmVersion = 5) - -modelVal <- plpResult$model +modelVal <- loadPlpModel(file.path(saveLoc, 'Test', 'plpResult', 'model')) validationDatabaseDetailsVal <- databaseDetails # from run multiple tests validationRestrictPlpDataSettingsVal <- createRestrictPlpDataSettings(washoutPeriod = 0, sampleSize = NULL) recalSet <- createValidationSettings(recalibrate = 'weakRecalibration') diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 000000000..097b24163 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/AddingCustomFeatureEngineering.Rmd b/vignettes/AddingCustomFeatureEngineering.Rmd index ebef0e25b..fab612c7f 100644 --- a/vignettes/AddingCustomFeatureEngineering.Rmd +++ b/vignettes/AddingCustomFeatureEngineering.Rmd @@ -1,6 +1,6 @@ --- title: "Adding Custom Feature Engineering Functions" -author: "Jenna Reps" +author: "Jenna Reps, Egill Fridgeirsson" date: "`r Sys.Date()`" header-includes: - \usepackage{fancyhdr} @@ -19,61 +19,63 @@ output: number_sections: yes toc: yes --- + +```{=html} - +``` ```{r, echo = FALSE, message = FALSE, warning = FALSE} library(PatientLevelPrediction) ``` + # Introduction -This vignette describes how you can add your own custom function for feature engineering in the Observational Health Data Sciencs and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. 
This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf). +This vignette describes how you can add your own custom function for feature engineering in the Observational Health Data Sciences and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingPredictiveModels.pdf). **We invite you to share your new feature engineering functions with the OHDSI community through our [GitHub repository](http://github.com/OHDSI/PatientLevelPrediction).** -# Feature Engineering Function Code Structure +# Feature Engineering Function Code Structure -To make a custom feature engineering function that can be used within PatientLevelPrediction you need to write two different functions. The 'create' function and the 'implement' function. +To make a custom feature engineering function that can be used within PatientLevelPrediction you need to write two different functions. The 'create' function and the 'implement' function. The 'create' function, e.g., create\, takes the parameters of the feature engineering 'implement' function as input, checks these are valid and outputs these as a list of class 'featureEngineeringSettings' with the 'fun' attribute specifying the 'implement' function to call. 
-The 'implement' function, e.g., implement\, must take as input: - * trainData - a list containing: - - covariateData: the plpData$covariateData restricted to the training patients - - labels: a data frame that contain rowId (patient identifier) and outcomeCount (the class labels) - - folds: a data.frame that contains rowId (patient identifier) and index (the cross validation fold) - * featureEngineeringSettings - the output of your create\ - -The 'implement' function can then do any manipulation of the trainData (adding new features or removing features) but must output a trainData object containing the new covariateData, labels and folds for the training data patients. +The 'implement' function, e.g., implement\, must take as input: + +- `trainData` - a list containing: + + - `covariateData`: the `plpData$covariateData`restricted to the training patients + + - `labels`: a data frame that contain `rowId`(patient identifier) and `outcomeCount` (the class labels) + + - `folds`: a data.frame that contains `rowId` (patient identifier) and `index` (the cross validation fold) + +- `featureEngineeringSettings` - the output of your create\ + +The 'implement' function can then do any manipulation of the `trainData` (adding new features or removing features) but must output a `trainData` object containing the new `covariateData`, `labels` and `folds` for the training data patients. # Example -Let's consider the situation where we wish to create an age spline feature. To make this custom feature engineering function we need to write the 'create' and 'implement' R functions. +Let's consider the situation where we wish to create an age spline feature. To make this custom feature engineering function we need to write the 'create' and 'implement' R functions. ## Create function -Our age spline feature function will create a new feature using the plpData$cohorts ageYear column. We will implement a restricted cubic spline that requires specifying the number of knots. -. 
Therefore, the inputs for this are: - * `knots` an integer/double specifying the number of knots - +Our age spline feature function will create a new feature using the `plpData$cohorts$ageYear` column. We will implement a restricted cubic spline that requires specifying the number of knots. Therefore, the inputs for this are: `knots` - an integer/double specifying the number of knots. + ```{r, echo = TRUE, eval=FALSE} -createAgeSpine <- function( +createAgeSpline <- function( knots = 5 ){ - # add input checks - checkIsClass(knots, c('numeric','integer')) - checkHigher(knots,0) - # create list of inputs to implement function featureEngineeringSettings <- list( knots = knots ) # specify the function that will implement the sampling - attr(featureEngineeringSettings, "fun") <- "implementAgeSpine" + attr(featureEngineeringSettings, "fun") <- "implementAgeSplines" # make sure the object returned is of class "sampleSettings" class(featureEngineeringSettings) <- "featureEngineeringSettings" @@ -82,64 +84,76 @@ createAgeSpine <- function( } ``` -We now need to create the 'implement' function `implementAgeSpine()` +We now need to create the 'implement' function `implementAgeSplines()` ## Implement function -All 'implement' functions must take as input the trainData and the featureEngineeringSettings (this is the output of the 'create' function). They must return a trainData object containing the new covariateData, labels and folds. +All 'implement' functions must take as input the `trainData` and the `featureEngineeringSettings` (this is the output of the 'create' function). They must return a `trainData` object containing the new `covariateData`, `labels` and `folds`. -In our example, the `createAgeSpine()` will return a list with 'knots'. The featureEngineeringSettings therefore contains this. +In our example, the `createAgeSpline()` will return a list with 'knots'. The `featureEngineeringSettings` therefore contains this. 
```{r tidy=FALSE,eval=FALSE} -implementAgeSpine <- function(trainData, featureEngineeringSettings){ - - # currently not used - knots <- featureEngineeringSettings$knots +implementAgeSplines <- function(trainData, featureEngineeringSettings, model=NULL) { + # if there is a model, it means this function is called through applyFeatureengineering, meaning it # should apply the model fitten on training data to the test data + if (is.null(model)) { + knots <- featureEngineeringSettings$knots + ageData <- trainData$labels + y <- ageData$outcomeCount + X <- ageData$ageYear + model <- mgcv::gam( + y ~ s(X, bs='cr', k=knots, m=2) + ) + newData <- data.frame( + rowId = ageData$rowId, + covariateId = 2002, + covariateValue = model$fitted.values + ) + } + else { + ageData <- trainData$labels + X <- trainData$labels$ageYear + y <- ageData$outcomeCount + newData <- data.frame(y=y, X=X) + yHat <- predict(model, newData) + newData <- data.frame( + rowId = trainData$labels$rowId, + covariateId = 2002, + covariateValue = yHat + ) + } + # remove existing age if in covariates + trainData$covariateData$covariates <- trainData$covariateData$covariates |> + dplyr::filter(!covariateId %in% c(1002)) - # age in in trainData$labels as ageYear - ageData <- trainData$labels + # update covRef + Andromeda::appendToTable(trainData$covariateData$covariateRef, + data.frame(covariateId=2002, + covariateName='Cubic restricted age splines', + analysisId=2, + conceptId=2002)) - # now implement the code to do your desired feature engineering + # update covariates + Andromeda::appendToTable(trainData$covariateData$covariates, newData) - data <- Matrix::sparseMatrix( - i = 1:length(ageData$rowId), - j = rep(1, length(ageData$rowId)), - x = ageData$ageYear, - dims=c(length(ageData$rowId),1) + featureEngineering <- list( + funct = 'implementAgeSplines', + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + model = model + ) ) - data <- as.matrix(data) - x <- data[,1] - y <- 
ageData$outcomeCount - -mRCS <- rms::ols( - y~rms::rcs(x, - stats::quantile( - x, - c(0, .05, .275, .5, .775, .95, 1), - include.lowest = TRUE - ) - ) - ) - -newData <- data.frame( - rowId = ageData$rowId, - covariateId = 2002, - covariateValue = mRCS$fitted.values + attr(trainData$covariateData, 'metaData')$featureEngineering = listAppend( + attr(trainData$covariateData, 'metaData')$featureEngineering, + featureEngineering ) - -# add new data -Andromeda::appendToTable(tbl = trainData$covariateData$covariates, - data = newData) - # return the updated trainData return(trainData) } ``` - # Acknowledgments Considerable work has been dedicated to provide the `PatientLevelPrediction` package. @@ -152,8 +166,4 @@ citation("PatientLevelPrediction") [Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032) - This work is supported in part through the National Science Foundation grant IIS 1251151. - - - diff --git a/vignettes/AddingCustomModels.Rmd b/vignettes/AddingCustomModels.Rmd index 801734dba..84b21e7b4 100644 --- a/vignettes/AddingCustomModels.Rmd +++ b/vignettes/AddingCustomModels.Rmd @@ -19,30 +19,35 @@ output: number_sections: yes toc: yes --- + +```{=html} - +``` ```{r, echo = FALSE, message = FALSE, warning = FALSE} library(PatientLevelPrediction) ``` + # Introduction -This vignette describes how you can add your own custom algorithms in the Observational Health Data Sciencs and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. This allows you to fully leverage the OHDSI PatientLevelPrediction framework for model development and validation. 
This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf). +This vignette describes how you can add your own custom algorithms in the Observational Health Data Sciencs and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. This allows you to fully leverage the OHDSI PatientLevelPrediction framework for model development and validation. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf). **We invite you to share your new algorithms with the OHDSI community through our [GitHub repository](http://github.com/OHDSI/PatientLevelPrediction).** -# Algorithm Code Structure -Each algorithm in the package should be implemented in its own \.R file, e.g. KNN.R, containing a set\ function, a fit\ function and a predict\ function. Occasionally the fit and prediction functions may be reused (if using an R classifier see RClassifier.R or if using a scikit-learn classifier see SklearnClassifier.R). We will now describe each of these functions in more detail below. +# Algorithm Code Structure + +Each algorithm in the package should be implemented in its own \.R file, e.g. KNN.R, containing a set\ function, a fit\ function and a predict\ function. Occasionally the fit and prediction functions may be reused (if using an R classifier see RClassifier.R or if using a scikit-learn classifier see SklearnClassifier.R). We will now describe each of these functions in more detail below. ## Set + The set\ is a function that takes as input the different hyper-parameter values to do a grid search when training. 
The output of the functions needs to be a list as class `modelSettings` containing: -+ param \- all the combinations of the hyper-parameter values input -+ fitFunction \- a string specifying what function to call to fit the model +- param - all the combinations of the hyper-parameter values input +- fitFunction - a string specifying what function to call to fit the model -The param object can have a setttings attribute containing any extra settings. For example to specify the model name and the seed used for reproducibility: +The param object can have a settings attribute containing any extra settings. For example to specify the model name and the seed used for reproducibility: ```{r, echo = TRUE, eval=FALSE} attr(param, 'settings') <- list( @@ -84,53 +89,60 @@ setMadeUp <- function(a=c(1,4,10), b=2, seed=NULL){ ``` ## Fit -This function should train your custom model for each parameter entry, pick the best parameters and train a final model for that setting. +This function should train your custom model for each parameter entry, pick the best parameters and train a final model for that setting. 
The fit\ should have as inputs: - * trainData \- a list containing the covariateData, labels and folds for the training population - * param \- the hyper-parameters as a list of all combinations - * search \- the type of hyper-parameter search - * analysisId \- an identifier for the analysis +- trainData - a list containing the covariateData, labels and folds for the training population +- param - the hyper-parameters as a list of all combinations +- search - the type of hyper-parameter search +- analysisId - an identifier for the analysis The fit function should return a list of class `plpModel` with the following objects: - * model \- a trained model (or location of the model if it is not an R object) - * prediction \- a data.frame object with the trainData$labels plus an extra column with the name 'value' corresponding to the predicted risk of having the outcome during the time-at-risk. - * settings \- a list containing: - + plpDataSettings \- the plpData settings e.g., attr(trainData, "metaData")$plpDataSettings - + covariateSettings \- the covariate settings e.g., attr(trainData, "metaData")$covariateSettings - + populationSettings \- the population settings e.g., attr(trainData, "metaData")$populationSettings, - + featureEngineering \- the feature engineering settings e.g., attr(trainData$covariateData, "metaData")$featureEngineering, - + tidyCovariates \- the preprocessing settings e.g., attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, - + requireDenseMatrix \- does the model require a dense matrix? 
e.g., attr(param, 'settings')$requiresDenseMatrix, - + modelSettings = a list containing: model (model name), param (the hyper-parameter search list), finalModelParameters (the final model hyper-parameters), extraSettings (any extra settings) - + splitSettings \- the split settings e.g., attr(trainData, "metaData")$splitSettings, - + sampleSettings \- the sample settings e.g., attr(trainData, "metaData")$sampleSettings - * trainDetails \- a list containing: - + analysisId \- the identifier for the analysis - + cdmDatabaseSchema \- the database used to develop the model - + outcomeId \- the outcome id - + cohortId \- the target population id - + attrition \- the attrition - + trainingTime \- how long it took to train the model - + trainingDate \- date of model training - + hyperParamSearch \- the hyper-parameter search used to train the model - * covariateImportance \- a data.frame containing the columns 'covariateId', 'covariateValue' (the variable importance) and 'columnId' (the column number that the variable need to be mapped to when implementing the model) - +- model - a trained model (or location of the model if it is not an R object) +- prediction - a data.frame object with the trainData\$labels plus an extra column with the name 'value' corresponding to the predicted risk of having the outcome during the time-at-risk. +- preprocessing - the settings required to preprocess the data when applying the model + - featureEngineering - the feature engineering settings e.g., attr(trainData$covariateData, "metaData")$featureEngineering, + - tidyCovariates - the preprocessing settings e.g., attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, + - requireDenseMatrix - does the model require a dense matrix? 
e.g., attr(param, 'settings')\$requiresDenseMatrix, +- modelDesign - a list containing: + - targetId - the id of the target cohort + - outcomeId - the id of the outcome cohort + - plpDataSettings - the plpData settings e.g., attr(trainData, "metaData")\$plpDataSettings + - covariateSettings - the covariate settings e.g., attr(trainData, "metaData")\$covariateSettings + - populationSettings - the population settings e.g., attr(trainData, "metaData")\$populationSettings, + - featureEngineeringSettings - the feature engineering settings e.g., attr(trainData$covariateData, "metaData")$featureEngineeringSettings, + - preprocessSettings - the preprocessing settings e.g., attr(trainData$covariateData, "metaData")$preprocessSettings, + - modelSettings = a list containing: model (model name), param (the hyper-parameter search list), finalModelParameters (the final model hyper-parameters), extraSettings (any extra settings) + - splitSettings - the split settings e.g., attr(trainData, "metaData")\$splitSettings, + - sampleSettings - the sample settings e.g., attr(trainData, "metaData")\$sampleSettings +- trainDetails - a list containing: + - analysisId - the identifier for the analysis + - developmentDatabase - the database used to develop the model + - attrition - the attrition + - trainingTime - how long it took to train the model + - trainingDate - date of model training + - hyperParamSearch - the hyper-parameter search used to train the model + - any other objects specific to training +- covariateImportance - a data.frame containing the columns 'covariateId', 'covariateValue' (the variable importance) and 'columnId' (the column number that the variable need to be mapped to when implementing the model) + In additon the plpModel requires two attributes: - * predictionFunction - the name of the function used to make predictions - * modelType - whether the model is 'binary' or 'survival' +- predictionFunction - the name of the function used to make predictions +- modelType - 
whether the model is 'binary' or 'survival' -For example `attr(result, 'predictionFunction') <- 'madeupPrediction'` means when the model is applied to new data, the 'madeupPrediction' function is called to make predictions. If this doesnt exist, then the model will fail. The other attribute is the modelType `attr(result, 'modelType') <- 'binary'` this is needed when evaluating the model to ensure the correct evaluation is applied. Currently the evaluation supports 'binary' and 'survival' modelType. +For example `attr(result, 'predictionFunction') <- 'madeupPrediction'` means when the model is applied to new data, the 'madeupPrediction' function is called to make predictions. If this doesn't exist, then the model will fail. The other attribute is the modelType `attr(result, 'modelType') <- 'binary'` this is needed when evaluating the model to ensure the correct evaluation is applied. Currently the evaluation supports 'binary' and 'survival' modelType. -Note: If a new modelType is desired, then the evalaution code within PatientLevelPrediction must be updated to specify how the new type is evaluated. This requires making edits to PatientLevelPrediction and then making a pull request to the PatientLevelPrediction github. The evaluation cannot have one off customization because the evaluation must be standardized to enable comparison across similar models. +Note: If a new modelType is desired, then the evaluation code within PatientLevelPrediction must be updated to specify how the new type is evaluated. This requires making edits to PatientLevelPrediction and then making a pull request to the PatientLevelPrediction github. The evaluation cannot have one off customization because the evaluation must be standardized to enable comparison across similar models. 
A full example of a custom 'binary' classifier fit function is: ```{r tidy=FALSE,eval=FALSE} -fitMadeUp <- function(trainData, param, search, analysisId){ +fitMadeUp <- function(trainData, modelSettings, search, analysisId){ + + param <- modelSettings$param # **************** code to train the model here # trainedModel <- this code should apply each hyper-parameter combination @@ -152,13 +164,20 @@ fitMadeUp <- function(trainData, param, search, analysisId){ # construct the standard output for a model: result <- list(model = trainedModel, prediction = prediction, # the train and maybe the cross validation predictions for the trainData - settings = list( + preprocessing = list( + featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, + tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, + requireDenseMatrix = attr(param, 'settings')$requiresDenseMatrix, + + ), + modelDesign = list( + outcomeId = attr(trainData, "metaData")$outcomeId, + targetId = attr(trainData, "metaData")$targetId, plpDataSettings = attr(trainData, "metaData")$plpDataSettings, covariateSettings = attr(trainData, "metaData")$covariateSettings, populationSettings = attr(trainData, "metaData")$populationSettings, - featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, - tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, - requireDenseMatrix = attr(param, 'settings')$requiresDenseMatrix, + featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings, + prerocessSettings = attr(trainData$covariateData, "metaData")$prerocessSettings, modelSettings = list( model = attr(param, 'settings')$modelName, # the model name param = param, @@ -171,9 +190,7 @@ fitMadeUp <- function(trainData, param, search, analysisId){ trainDetails = list( analysisId = analysisId, - cdmDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, - outcomeId = 
attr(trainData, "metaData")$outcomeId, - cohortId = attr(trainData, "metaData")$cohortId, + developmentDatabase = attr(trainData, "metaData")$cdmDatabaseSchema, attrition = attr(trainData, "metaData")$attrition, trainingTime = timeToTrain, # how long it took to train the model trainingDate = Sys.Date(), @@ -189,12 +206,14 @@ fitMadeUp <- function(trainData, param, search, analysisId){ } ``` -You could make the fitMadeUp function cleaner by adding helper function in the MadeUp.R file that are called by the fit function (for example a function to run cross validation). It is important to ensure there is a valid prediction function (the one specified by `attr(result, 'predictionFunction') <- 'madeupPrediction'` is `madeupPrediction()`) as specified below. + +You could make the fitMadeUp function cleaner by adding helper function in the MadeUp.R file that are called by the fit function (for example a function to run cross validation). It is important to ensure there is a valid prediction function (the one specified by `attr(result, 'predictionFunction') <- 'madeupPrediction'` is `madeupPrediction()`) as specified below. ## Predict -The prediction function takes as input the plpModel returned by fit, new data and a corresponding cohort. It returns a data.frame with the same columns as cohort but with an additional column: - * value \- the predicted risk from the plpModel for each patient in the cohort +The prediction function takes as input the plpModel returned by fit, new data and a corresponding cohort. 
It returns a data.frame with the same columns as cohort but with an additional column: + +- value - the predicted risk from the plpModel for each patient in the cohort For example: @@ -215,7 +234,7 @@ madeupPrediction <- function(plpModel, data, cohort){ # Algorithm Example -Below a fully functional algorithm example is given, however we highly recommend you to have a look at the available algorithms in the package (see GradientBoostingMachine.R for the set function, RClassifier.R for the fit and prediction function for R classifiers). +Below a fully functional algorithm example is given, however we highly recommend you to have a look at the available algorithms in the package (see GradientBoostingMachine.R for the set function, RClassifier.R for the fit and prediction function for R classifiers). ## Set @@ -253,12 +272,13 @@ setMadeUp <- function(a=c(1,4,6), b=2, seed=NULL){ ``` +## Fit -## Fit ```{r tidy=FALSE,eval=FALSE} -fitMadeUp <- function(trainData, param, search, analysisId){ +fitMadeUp <- function(trainData, modelSettings, search, analysisId){ # set the seed for reproducibility + param <- modelSettings$param set.seed(attr(param, 'settings')$seed) # add folds to labels: @@ -324,13 +344,20 @@ fitMadeUp <- function(trainData, param, search, analysisId){ # construct the standard output for a model: result <- list(model = trainedModel, prediction = prediction, - settings = list( + preprocessing = list( + featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, + tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, + requireDenseMatrix = attr(param, 'settings')$requiresDenseMatrix, + + ), + modelDesign = list( + outcomeId = attr(trainData, "metaData")$outcomeId, + targetId = attr(trainData, "metaData")$targetId, plpDataSettings = attr(trainData, "metaData")$plpDataSettings, covariateSettings = attr(trainData, "metaData")$covariateSettings, populationSettings = attr(trainData, 
"metaData")$populationSettings, - featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, - tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, - requireDenseMatrix = attr(param, 'settings')$requiresDenseMatrix, + featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings, + prerocessSettings = attr(trainData$covariateData, "metaData")$prerocessSettings, modelSettings = list( model = attr(param, 'settings')$modelName, # the model name param = param, @@ -343,14 +370,14 @@ fitMadeUp <- function(trainData, param, search, analysisId){ trainDetails = list( analysisId = analysisId, - cdmDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, - outcomeId = attr(trainData, "metaData")$outcomeId, - cohortId = attr(trainData, "metaData")$cohortId, + developmentDatabase = attr(trainData, "metaData")$cdmDatabaseSchema, attrition = attr(trainData, "metaData")$attrition, trainingTime = timeToTrain, # how long it took to train the model trainingDate = Sys.Date(), hyperParamSearch = hyperSummary # the hyper-parameters and performance data.frame ), + covariateImportance = merge(trainData$covariateData$covariateRef, varImp, by='covariateId') # add variable importance to covariateRef if possible + ), covariateImportance = varImp ) class(result) <- 'plpModel' @@ -363,6 +390,7 @@ fitMadeUp <- function(trainData, param, search, analysisId){ ``` ## Helpers + In the fit model a helper function `made_up_model` is called, this is the function that trains a model given the data, labels and hyper-parameters. ```{r tidy=FALSE,eval=FALSE} @@ -419,7 +447,8 @@ made_up_model <- function(param, data, final=F, labels){ ``` ## Predict -The final step is to create a predict function for the model. In the example above the predeiction function `attr(result, 'predictionFunction') <- 'madeupPrediction'` was madeupPrediction, so a `madeupPrediction` function is required when applying the model. 
The predict function needs to take as input the plpModel returned by the fit function, new data to apply the model on and the cohort specifying the patients of interest to make the prediction for. + +The final step is to create a predict function for the model. In the example above the prediction function `attr(result, 'predictionFunction') <- 'madeupPrediction'` was madeupPrediction, so a `madeupPrediction` function is required when applying the model. The predict function needs to take as input the plpModel returned by the fit function, new data to apply the model on and the cohort specifying the patients of interest to make the prediction for. ```{r tidy=FALSE,eval=FALSE} madeupPrediction <- function(plpModel, data , cohort){ @@ -430,7 +459,7 @@ madeupPrediction <- function(plpModel, data , cohort){ plpData = data, cohort = cohort, map = plpModel$covariateImportance %>% - dplyr::select(.data$columnId, .data$covariateId) + dplyr::select("columnId", "covariateId") ) newData <- matrixObjects$dataMatrix @@ -451,14 +480,15 @@ madeupPrediction <- function(plpModel, data , cohort){ # fix the rowIds to be the old ones # now use the originalRowId and remove the matrix rowId cohort <- cohort %>% - dplyr::select(-.data$rowId) %>% - dplyr::rename(rowId = .data$originalRowId) + dplyr::select(-"rowId") %>% + dplyr::rename(rowId = "originalRowId") attr(cohort, "metaData") <- list(modelType = attr(plpModel, 'modelType')) return(cohort) } ``` + As the madeup model uses the standard R prediction, it has the same prediction function as xgboost, so we could have not added a new prediction function and instead made the predictionFunction of the result returned by fitMadeUpModel to `attr(result, 'predictionFunction') <- 'predictXgboost'`. # Acknowledgments @@ -473,8 +503,4 @@ citation("PatientLevelPrediction") [Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. 
Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032) - This work is supported in part through the National Science Foundation grant IIS 1251151. - - - diff --git a/vignettes/AddingCustomSamples.Rmd b/vignettes/AddingCustomSamples.Rmd index 4f08a8791..eb2da04f1 100644 --- a/vignettes/AddingCustomSamples.Rmd +++ b/vignettes/AddingCustomSamples.Rmd @@ -7,10 +7,10 @@ header-includes: - \pagestyle{fancy} - \fancyhead{} - \fancyhead[CO,CE]{Custom Patient-Level Prediction Algorithms} - - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - \fancyfoot[LE,RO]{\thepage} - \renewcommand{\headrulewidth}{0.4pt} - \renewcommand{\footrulewidth}{0.4pt} + - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} output: pdf_document: number_sections: yes @@ -18,46 +18,73 @@ output: html_document: number_sections: yes toc: yes +editor_options: + markdown: + wrap: 72 --- + +```{=html} - +``` ```{r, echo = FALSE, message = FALSE, warning = FALSE} library(PatientLevelPrediction) ``` -# Introduction - -This vignette describes how you can add your own custom function for sampling the target population in the Observational Health Data Sciencs and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf). 
- -**We invite you to share your new sample functions with the OHDSI community through our [GitHub repository](http://github.com/OHDSI/PatientLevelPrediction).** - -# Sample Function Code Structure -To make a sampling function that can be used within PatientLevelPrediction you need to write two different functions. The 'create' function and the 'implement' function. - -The 'create' function, e.g., create\, takes the parameters of the sample 'implement' function as input, checks these are valid and outputs these as a list of class 'sampleSettings' with the 'fun' attribute specifying the 'implement' function to call. +# Introduction -The 'implement' function, e.g., implement\, must take as input: - * trainData - a list containing: - - covariateData: the plpData$covariateData restricted to the training patients - - labels: a data frame that contain rowId (patient identifier) and outcomeCount (the class labels) - - folds: a data.frame that contains rowId (patient identifier) and index (the cross validation fold) - * sampleSettings - the output of your create\ - -The 'implement' function can then do any manipulation of the trainData (such as undersampling or oversampling) but must output a trainData object containing the covariateData, labels and folds for the new training data sample. +This vignette describes how you can add your own custom function for +sampling the target population in the Observational Health Data Sciencs +and Informatics (OHDSI) +[`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) +package. This vignette assumes you have read and are comfortable with +building single patient level prediction models as described in the +[`BuildingPredictiveModels` +vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf). 
+ +**We invite you to share your new sample functions with the OHDSI +community through our [GitHub +repository](http://github.com/OHDSI/PatientLevelPrediction).** + +# Sample Function Code Structure + +To make a sampling function that can be used within +PatientLevelPrediction you need to write two different functions. The +'create' function and the 'implement' function. + +The 'create' function, e.g., create\, takes the +parameters of the sample 'implement' function as input, checks these are +valid and outputs these as a list of class 'sampleSettings' with the +'fun' attribute specifying the 'implement' function to call. + +The 'implement' function, e.g., implement\, must +take as input: \* trainData - a list containing: - covariateData: the +plpData\$covariateData restricted to the training patients - labels: a +data frame that contain rowId (patient identifier) and outcomeCount (the +class labels) - folds: a data.frame that contains rowId (patient +identifier) and index (the cross validation fold) \* sampleSettings - +the output of your create\ + +The 'implement' function can then do any manipulation of the trainData +(such as undersampling or oversampling) but must output a trainData +object containing the covariateData, labels and folds for the new +training data sample. # Example -Let's consider the situation where we wish to take a random sample of the training data population. To make this custom sampling function we need to write the 'create' and 'implement' R functions. +Let's consider the situation where we wish to take a random sample of +the training data population. To make this custom sampling function we +need to write the 'create' and 'implement' R functions. ## Create function -Our random sampling function will randomly sample `n` patients from the trainData. 
Therefore, the inputs for this are: - * `n` an integer/double specifying the number of patients to sample - * `sampleSeed` an integer/double specifying the seed for reproducibility - +Our random sampling function will randomly sample `n` patients from the +trainData. Therefore, the inputs for this are: \* `n` an integer/double +specifying the number of patients to sample \* `sampleSeed` an +integer/double specifying the seed for reproducibility + ```{r, echo = TRUE, eval=FALSE} createRandomSampleSettings <- function( n = 10000, @@ -85,13 +112,18 @@ createRandomSampleSettings <- function( } ``` -We now need to create the 'implement' function `implementRandomSampleSettings()` +We now need to create the 'implement' function +`implementRandomSampleSettings()` ## Implement function -All 'implement' functions must take as input the trainData and the sampleSettings (this is the output of the 'create' function). They must return a trainData object containing the covariateData, labels and folds. +All 'implement' functions must take as input the trainData and the +sampleSettings (this is the output of the 'create' function). They must +return a trainData object containing the covariateData, labels and +folds. -In our example, the `createRandomSampleSettings()` will return a list with 'n' and 'sampleSeed'. The sampleSettings therefore contains these. +In our example, the `createRandomSampleSettings()` will return a list +with 'n' and 'sampleSeed'. The sampleSettings therefore contains these. ```{r tidy=FALSE,eval=FALSE} implementRandomSampleSettings <- function(trainData, sampleSettings){ @@ -138,10 +170,10 @@ implementRandomSampleSettings <- function(trainData, sampleSettings){ ``` - # Acknowledgments -Considerable work has been dedicated to provide the `PatientLevelPrediction` package. +Considerable work has been dedicated to provide the +`PatientLevelPrediction` package. 
```{r tidy=TRUE,eval=TRUE} citation("PatientLevelPrediction") @@ -149,10 +181,11 @@ citation("PatientLevelPrediction") **Please reference this paper if you use the PLP Package in your work:** -[Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032) - - -This work is supported in part through the National Science Foundation grant IIS 1251151. - - +[Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and +implementation of a standardized framework to generate and evaluate +patient-level prediction models using observational healthcare data. J +Am Med Inform Assoc. +2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032) +This work is supported in part through the National Science Foundation +grant IIS 1251151. diff --git a/vignettes/AddingCustomSplitting.Rmd b/vignettes/AddingCustomSplitting.Rmd index b8b22d60e..96cef75d3 100644 --- a/vignettes/AddingCustomSplitting.Rmd +++ b/vignettes/AddingCustomSplitting.Rmd @@ -19,31 +19,32 @@ output: number_sections: yes toc: yes --- + +```{=html} - +``` ```{r, echo = FALSE, message = FALSE, warning = FALSE} library(PatientLevelPrediction) ``` + # Introduction -This vignette describes how you can add your own custom function for splitting the labelled data into training data and validation data in the Observational Health Data Sciencs and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf). 
+This vignette describes how you can add your own custom function for splitting the labelled data into training data and validation data in the Observational Health Data Sciences and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/BuildingPredictiveModels.pdf). **We invite you to share your new data splitting functions with the OHDSI community through our [GitHub repository](http://github.com/OHDSI/PatientLevelPrediction).** -# Data Splitting Function Code Structure +# Data Splitting Function Code Structure -To make a custom data splitting function that can be used within PatientLevelPrediction you need to write two different functions. The 'create' function and the 'implement' function. +To make a custom data splitting function that can be used within PatientLevelPrediction you need to write two different functions. The 'create' function and the 'implement' function. The 'create' function, e.g., create\, takes the parameters of the data splitting 'implement' function as input, checks these are valid and outputs these as a list of class 'splitSettings' with the 'fun' attribute specifying the 'implement' function to call. -The 'implement' function, e.g., implement\, must take as input: - * population: a data frame that contain rowId (patient identifier), ageYear, gender and outcomeCount (the class labels) - * splitSettings - the output of your create\ - -The 'implement' function then needs to implement code to assign each rowId in the population to a splitId (<0 means in the train data, 0 means not used and >0 means in the training data with the value defining the cross validation fold).
+The 'implement' function, e.g., implement\, must take as input: \* population: a data frame that contains rowId (patient identifier), ageYear, gender and outcomeCount (the class labels) \* splitSettings - the output of your create\ + +The 'implement' function then needs to implement code to assign each rowId in the population to a splitId (\<0 means in the test data, 0 means not used and \>0 means in the training data with the value defining the cross validation fold). # Example @@ -52,7 +53,7 @@ Let's consider the situation where we wish to create a split where females are u ## Create function Our gender split function requires a single parameter, the number of folds used in cross validation. Therefore create a function with a single nfold input that returns a list of class 'splitSettings' with the 'fun' attribute specifying the 'implement' function we will use. - + ```{r, echo = TRUE, eval=FALSE} createGenderSplit <- function(nfold) { @@ -74,10 +75,9 @@ We now need to create the 'implement' function `implementGenderSplit()` ## Implement function -All 'implement' functions for data splitting must take as input the population and the splitSettings (this is the output of the 'create' function). They must return a data.frame containing columns: rowId and index. - -The index is used to determine whether the patient (identifed by the rowId) is in the test set (index = -1) or train set (index > 0). In in the train set, the value corresponds to the cross validation fold. For example, if rowId 2 is assigned index 5, then it means the patient with the rowId 2 is used to train the model and is in fold 5. +All 'implement' functions for data splitting must take as input the population and the splitSettings (this is the output of the 'create' function). They must return a data.frame containing columns: rowId and index. +The index is used to determine whether the patient (identified by the rowId) is in the test set (index = -1) or train set (index \> 0).
If in the train set, the value corresponds to the cross validation fold. For example, if rowId 2 is assigned index 5, then it means the patient with the rowId 2 is used to train the model and is in fold 5. ```{r tidy=FALSE,eval=FALSE} implementGenderSplit <- function(population, splitSettings){ @@ -100,7 +100,6 @@ implementGenderSplit <- function(population, splitSettings){ ``` - # Acknowledgments Considerable work has been dedicated to provide the `PatientLevelPrediction` package. @@ -113,8 +112,4 @@ citation("PatientLevelPrediction") [Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032) - This work is supported in part through the National Science Foundation grant IIS 1251151. - - - diff --git a/vignettes/BenchmarkTasks.Rmd b/vignettes/BenchmarkTasks.Rmd new file mode 100644 index 000000000..d046b5890 --- /dev/null +++ b/vignettes/BenchmarkTasks.Rmd @@ -0,0 +1,75 @@ +--- +title: "Benchmark Tasks" +author: "Jenna Reps, Ross Williams, Peter R.
Rijnbeek" +date: '`r Sys.Date()`' +header-includes: + - \usepackage{fancyhdr} + - \pagestyle{fancy} + - \fancyhead{} + - \fancyhead[CO,CE]{Installation Guide} + - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} + - \fancyfoot[LE,RO]{\thepage} + - \renewcommand{\headrulewidth}{0.4pt} + - \renewcommand{\footrulewidth}{0.4pt} +output: + pdf_document: + includes: + in_header: preamble.tex + number_sections: yes + toc: yes + word_document: + toc: yes + html_document: + number_sections: yes + toc: yes +--- + +```{=html} + +``` +## Benchmark Tasks For Large-Scale Empirical Analyses + +Here we provide a set of diverse prediction tasks that can be used when evaluating the impact of the model design choice when developing models using observational data. + ++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Target Cohort (index) | Outcome | Time-at-risk | Link | 
++===============================================================================================================================================================================================================================================================================================================================================================================================================================================================================================+=========================================================================================================================================================+=============================+======+ +| Patients with an outpatient visit in 2017 with no prior cancer (first visit in 2017) | Lung cancer | 1 day - 3 years after index | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Patients newly diagnosed with major depressive disorder (date of first record) | Bipolar | 1 day - 365 day after index | | 
++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Patients with an outpatient visit in 2019 | Dementia | 1 day - 3 years after index | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Patients with an outpatient visit and a positive COVID test | Hospitalization with pneumonia | 1 day - 30 days after index | | 
++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Patients with an outpatient visit and a positive COVID test | Hospitalization with pneumonia that required intensive services (ventilation, intubation, tracheotomy, or extracorporeal membrane oxygenation) or death | 1 day - 30 days after index | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Patients with an outpatient visit and a positive COVID test | Death | 1 day - 30 days after index | | 
++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Patients with T2DM who were treated with metformin and who became new adult users of one of sulfonylureas, thiazolidinediones, dipeptidyl peptidase-4 inhibitors, glucagon-like peptide-1 receptor agonists, or sodium-glucose co-transporter-2 inhibitors (date of secondary drug). Patients with HF or patients treated with insulin on or prior to the index date were excluded from the analysis. Patients were required to have been enrolled for at least 365 days before cohort entry. 
| Heart Failure | 1 to 365 days | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Patients newly diagnosed with atrial fibrilation (date of initial afib record) | Ischemic stroke | 1 to 365 days | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Patients undergoing elective major non-cardiac surgery (date of surgery). Patients were required to have been enrolled for at least 365 days before cohort entry. 
| Earliest of AMI cardiac arrest or death (MACE) | O to 30 days | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Patients starting intravitreal Anti-VEGF (date of administration) | Kidney Failure | 1 to 365 days | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Pregnancy women (start of pregnancy) | Preeclampsia | During pregnancy | | 
++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Pregnancy women (start of pregnancy) | Still birth | During pregnancy | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Patients with COPD (first record) | Cardiovascular event and death | 1-30 days and 1-90 days | | 
++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Patients starting menopause (first record) | Depression | 1 day - 3-years | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Patients with anemia (date of first anemia record) | Colorectal cancer | 1 day - 1-year | | 
++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Patients with quadriplegia (date of first quadriplegia record) | Death | 1 day - 1-year | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| Patient undergoing | | | | 
++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ +| | | | | ++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------+------+ diff --git a/vignettes/BestPractices.rmd b/vignettes/BestPractices.rmd index 9c27f3f7c..bfc1bc792 100644 --- a/vignettes/BestPractices.rmd +++ b/vignettes/BestPractices.rmd @@ -101,7 +101,7 @@ Data Creation Impact of over/under-sampling -Study being developed + Journal of big data @@ -125,7 +125,7 @@ Model development How much data do we need for prediction - Learning curves at scale -Preprint link +International Journal of Medical Informatics @@ -165,6 +165,18 @@ Study needs to be done + + +Model development + + +Can we use ensembles to combine different algorithm models within a database to improve models transportability? 
+ + + Caring is Sharing–Exploiting the Value in Data for Health and Innovation + + + Model development @@ -173,10 +185,59 @@ Model development Can we use ensembles to combine models developed using different databases to improve models transportability? - Paper under review at BMC + BMC Medical Informatics and Decision Making + + + + + +Model development + + +Impact of regularization method + + + JAMIA + + + + + +Evaluation + + +Why prediction is not suitable for risk factor identification + + + Machine Learning for Healthcare Conference + + + + + +Evaluation + + +Iterative pairwise external validation to put validation into context + + + Drug Safety + + + +Evaluation + + +A novel method to estimate external validation using aggregate statistics + + + Study under review + + + Evaluation @@ -225,4 +286,4 @@ Is there a way to automatically simplify models? - \ No newline at end of file + diff --git a/vignettes/BuildingEnsembleModels.Rmd b/vignettes/BuildingEnsembleModels.Rmd deleted file mode 100644 index 3c052b18d..000000000 --- a/vignettes/BuildingEnsembleModels.Rmd +++ /dev/null @@ -1,175 +0,0 @@ ---- -title: "Building Ensemble Models" -author: "Xiaoyong Pan, Jenna Reps, Peter R. Rijnbeek" -date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[CO,CE]{Installation Guide} - - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[LE,RO]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - word_document: - toc: yes - html_document: - number_sections: yes - toc: yes ---- - - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -# Introduction -Ensemble models combine several models to improve the overall performance. 
Traditionally, weak learners were combined to boost performance but recent results show that combining several strong approaches can also result in a better performance. There are many examples in literature where ensemble models outperform individual models using stacking, i.e. a final logistic regresssion layer accross the individual model outputs, but other approaches like weigthing has also shown promising results. - -This vignette describes how you can use the Observational Health Data Sciencs and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package to build ensemble models. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingPredictiveModels.pdf). - -This will enable studying ensemble methods at scale in the OHDSI data network. - -![Ensemble model](ensemble.png) - -In PatientLevelPrediction package, four ensemble strategies have been implemented: - -1. average ensemble: Calculate the average probability from individual models -2. product ensemble: Calculate the product of probabilites from individual models. -3. weighted ensemble: Calculate the weighted average probability from individual models using train AUC as weights. -4. stacked ensemble: Train a logistics regression on outputs from individual models - -# Usage - -Use the [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package to generate a `population` and `plpData` object. Alternatively, you can make use of the data simulator. The following code snippet creates a population of 12000 patients. 
- -```{r eval=FALSE} -data(plpDataSimulationProfile) -set.seed(1234) -sampleSize <- 2000 -plpData <- simulatePlpData( - plpDataSimulationProfile, - n = sampleSize -) - -population <- createStudyPopulation( - plpData, - outcomeId = 2, - binary = TRUE, - firstExposureOnly = FALSE, - washoutPeriod = 0, - removeSubjectsWithPriorOutcome = FALSE, - priorOutcomeLookback = 99999, - requireTimeAtRisk = FALSE, - minTimeAtRisk = 0, - riskWindowStart = 0, - addExposureDaysToStart = FALSE, - riskWindowEnd = 365, - addExposureDaysToEnd = FALSE, - verbosity = "INFO" -) -``` - -Specify the prediction algorithms to be combined. - -```{r eval=FALSE} -# Use LASSO logistic regression and Random Forest as base predictors -model1 <- setLassoLogisticRegression() -model2 <- setRandomForest() -``` - -Specify a test fraction and a sequence of training set fractions. - -```{r eval = FALSE} -testFraction <- 0.2 -``` - -Specify an ensembleStrategy to combine multiple predictors. -The strategy used for ensembling the outputs from different models, -it can be 'mean', 'product', 'weighted' and 'stacked': -'mean' the average probability from differnt models -'product' the product rule -'weighted' the weighted average probability from different models using train AUC as weights. -'stacked' the stakced ensemble trains a logistics regression on different models. - -```{r eval = FALSE} -ensembleStrategy <- 'stacked' -``` - -Specify the test split to be used. - -```{r} -# Use a split by person, alterantively a time split is possible -testSplit <- 'person' -``` - -Run the ensemble learning to combine model1 and model2. You can also use different plpData for different models. 
- -```{r eval=FALSE} -ensembleResults <- PatientLevelPrediction::runEnsembleModel(population, - dataList = list(plpData, plpData), - modelList = list(model1, model2), - testSplit=testSplit, - testFraction=testFraction, - nfold=3, splitSeed=1000, - ensembleStrategy = ensembleStrategy) -``` -## Saving and loading the ensemble model - -You can save and load the model using: - -```{r tidy=TRUE,eval=FALSE} -saveEnsemblePlpModel(ensembleResults$model, dirPath = file.path(getwd(),'model')) -ensembleModel <- loadEnsemblePlpModel(getwd(),'model') -``` - -# Apply Ensemble model -```{r eval=FALSE} -plpData <- loadPlpData("") -populationSettings <- ensembleModel$populationSettings -populationSettings$plpData <- plpData -population <- do.call(createStudyPopulation, populationSettings) -``` -Load the model. -```{r eval=FALSE} -ensembleModel <- loadEnsemblePlpModel("") -``` - -Get the predictions by applying the model: -```{r eval=FALSE} -prediction <- applyEnsembleModel(population, - dataList = list(plpData, plpData), - ensembleModel = ensembleModel)$prediction -``` - -# Demo - -We have added a demo of the ensemble training: - -``` {r eval=FALSE} -# Show all demos in our package: - demo(package = "PatientLevelPrediction") - -# Run the learning curve - demo("EnsembleModelDemo", package = "PatientLevelPrediction") -``` -# Acknowledgments - -Considerable work has been dedicated to provide the `PatientLevelPrediction` package. - -```{r tidy=TRUE,eval=TRUE} -citation("PatientLevelPrediction") -``` - -**Please reference this paper if you use the PLP Package in your work:** - -[Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 
2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032) diff --git a/vignettes/BuildingMultiplePredictiveModels.Rmd b/vignettes/BuildingMultiplePredictiveModels.Rmd index 8d7e218ad..d23c7326e 100644 --- a/vignettes/BuildingMultiplePredictiveModels.Rmd +++ b/vignettes/BuildingMultiplePredictiveModels.Rmd @@ -23,21 +23,24 @@ output: number_sections: yes toc: yes --- + +```{=html} - +``` # Introduction + In our [`paper`](https://academic.oup.com/jamia/article/25/8/969/4989437), we propose a standardised framework for patient-level prediction that utilizes the OMOP CDM and standardized vocabularies, and describe the open-source software that we developed implementing the framework’s pipeline. The framework is the first to enforce existing best practice guidelines and will enable open dissemination of models that can be extensively validated across the network of OHDSI collaborators. -One our best practices is that we see the selection of models and all study setting as an emperical question, i.e. we should use a data-driven approach in which we try many settings. This vignette describes how you can use the Observational Health Data Sciencs and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package to automatically build multiple patient-level predictive models, e.g. different population settings, covariate settings, and modelsetting. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingPredictiveModels.pdf). +One our best practices is that we see the selection of models and all study setting as an emperical question, i.e. we should use a data-driven approach in which we try many settings. 
This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package to automatically build multiple patient-level predictive models, e.g. different population settings, covariate settings, and model settings. This vignette assumes you have read and are comfortable with building single patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingPredictiveModels.pdf). Note that it is also possible to generate a Study Package directly in Atlas that allows for multiple patient-level prediction analyses this is out-of-scope for this vignette. # Creating a model design -The first step is to specify each model you wish to develop by using the `createModelDesign` function. This function requires the following: +The first step is to specify each model you wish to develop by using the `createModelDesign` function. This function requires the following: ```{r echo = FALSE, results ='asis'} library(knitr) @@ -73,7 +76,8 @@ kable( ``` ## Model design example 1 -For example, if we wanted to predict the outcome (id 2) occuring for the first time within 180 days of the the target population index date (id 1). We are only interested in index dates betwrrn 2018-2020. Finally, we only want to use age, gender in 5 year buckets and conditions as features. The model can be specified by: + +For example, if we wanted to predict the outcome (id 2) occurring for the first time within 180 days of the target population index date (id 1). We are only interested in index dates between 2018-2020. Finally, we only want to use age, gender in 5 year buckets and conditions as features.
The model can be specified by: ```{r tidy=FALSE,eval=FALSE} @@ -111,6 +115,7 @@ modelDesign1 <- createModelDesign( covariateSettings = covariateSettings, featureEngineeringSettings = createFeatureEngineeringSettings(), sampleSettings = createSampleSettings(), + splitSettings = createDefaultSplitSetting(), preprocessSettings = createPreprocessSettings(), modelSettings = setLassoLogisticRegression() ) @@ -118,7 +123,8 @@ modelDesign1 <- createModelDesign( ``` ## Model design example 2 -For the second example, we want to predict the outcome (id 2) occuring for the first time within 730 days of the the target population index date (id 1). We want to train a random forest classifier. Finally, we want to use age, gender in 5 year buckets, drug ingredients (and groups) and conditions as features. The model can be specified by: + +For the second example, we want to predict the outcome (id 2) occurring for the first time within 730 days of the target population index date (id 1). We want to train a random forest classifier. Finally, we want to use age, gender in 5 year buckets, drug ingredients (and groups) and conditions as features. The model can be specified by: ```{r tidy=FALSE,eval=FALSE} @@ -155,6 +161,7 @@ modelDesign2 <- createModelDesign( covariateSettings = covariateSettings, featureEngineeringSettings = createRandomForestFeatureSelection(ntrees = 500, maxDepth = 7), sampleSettings = createSampleSettings(), + splitSettings = createDefaultSplitSetting(), preprocessSettings = createPreprocessSettings(), modelSettings = setRandomForest() ) @@ -162,7 +169,8 @@ modelDesign2 <- createModelDesign( ``` ## Model design example 3 -For the third example, we want to predict the outcome (id 5) occuring during the cohort exposure of the the target population (id 1). We want to train a gradient boosting machine. Finally, we want to use age, gender in 5 year buckets and indications of measurements taken as features. 
The model can be specified by: + +For the third example, we want to predict the outcome (id 5) occuring during the cohort exposure of the the target population (id 1). We want to train a gradient boosting machine. Finally, we want to use age, gender in 5 year buckets and indications of measurements taken as features. The model can be specified by: ```{r tidy=FALSE,eval=FALSE} @@ -200,16 +208,16 @@ modelDesign3 <- createModelDesign( covariateSettings = covariateSettings, featureEngineeringSettings = createFeatureEngineeringSettings(), sampleSettings = createSampleSettings(), + splitSettings = createDefaultSplitSetting(), preprocessSettings = createPreprocessSettings(), modelSettings = setGradientBoostingMachine() ) ``` - # Running multiple models -As we will be downloading loads of data in the multiple plp analysis it is useful to set the Andromeda temp folder to a directory with write access and plenty of space. -`options(andromedaTempFolder = "c:/andromedaTemp")` + +As we will be downloading loads of data in the multiple plp analysis it is useful to set the Andromeda temp folder to a directory with write access and plenty of space. `options(andromedaTempFolder = "c:/andromedaTemp")` To run the study requires setting up a connectionDetails object @@ -261,17 +269,18 @@ results <- runMultiplePlp( modelDesign3 ), onlyFetchData = F, - splitSettings = createDefaultSplitSetting(), logSettings = createLogSettings(), saveDirectory = "./PlpMultiOutput" ) ``` -This will then save all the plpData objects from the study into "./PlpMultiOutput/plpData_T1_L" and the results into "./PlpMultiOutput/Analysis_". The csv file named settings.csv found in "./PlpMultiOutput" has a row for each prediction model developed and points to the plpData and settings used for the model development, it also has descriptions of the cohorts if these are input by the user. 
+ +This will then save all the plpData objects from the study into "./PlpMultiOutput/plpData_T1_L" and the results into "./PlpMultiOutput/Analysis\_". The csv file named settings.csv found in "./PlpMultiOutput" has a row for each prediction model developed and points to the plpData and settings used for the model development, it also has descriptions of the cohorts if these are input by the user. Note that if for some reason the run is interrupted, e.g. because of an error, a new call to `runMultiplePlp` will continue and not restart until you remove the output folder. # Validating multiple models + If you have access to multiple databases on the same server in different schemas you could evaluate accross these using this call: ```{r tidy=FALSE,eval=FALSE} @@ -292,18 +301,22 @@ val <- validateMultiplePlp( valdiationDatabaseDetails = validationDatabaseDetails, validationRestrictPlpDataSettings = createRestrictPlpDataSettings(), recalibrate = NULL, - saveDirectory = "./PlpMultiOutput/validation" + saveDirectory = "./PlpMultiOutput/Validation" ) ``` -This then saves the external validation results in the validation folder of the main study (the outputLocation you used in runPlpAnalyses). + +This then saves the external validation results in the `Validation` folder of the main study (the outputLocation you used in runPlpAnalyses). # Viewing the results + To view the results for the multiple prediction analysis: + ```{r tidy=FALSE,eval=FALSE} viewMultiplePlp(analysesLocation="./PlpMultiOutput") ``` -If the validation directory in "./PlpMultiOutput" has results, the external validation will also be displayed. + +If the validation directory in "./PlpMultiOutput" has a sqlite results database, the external validation will also be displayed. # Acknowledgments @@ -316,4 +329,3 @@ citation("PatientLevelPrediction") **Please reference this paper if you use the PLP Package in your work:** [Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. 
Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032) - diff --git a/vignettes/BuildingPredictiveModels.Rmd b/vignettes/BuildingPredictiveModels.Rmd index 7249f6421..d805c3e1c 100644 --- a/vignettes/BuildingPredictiveModels.Rmd +++ b/vignettes/BuildingPredictiveModels.Rmd @@ -23,18 +23,19 @@ output: number_sections: yes toc: yes --- + +```{=html} - - +``` ```{r echo=FALSE,message=FALSE,warning=FALSE,eval=TRUE} library(PatientLevelPrediction) vignetteDataFolder <- "s:/temp/plpVignette" # Load all needed data if it exists on this computer: if (file.exists(vignetteDataFolder)){ - plpModel <- loadPlpModel(vignetteDataFolder,'model') + plpModel <- loadPlpModel(file.path(vignetteDataFolder,'model')) lrResults <- loadPlpModel(file.path(vignetteDataFolder,'results')) } ``` @@ -53,16 +54,13 @@ In our [`paper`](https://academic.oup.com/jamia/article/25/8/969/4989437), we pr Figure 1, illustrates the prediction problem we address. Among a population at risk, we aim to predict which patients at a defined moment in time (t = 0) will experience some outcome during a time-at-risk. Prediction is done using only information about the patients in an observation window prior to that moment in time. -![The prediction problem](Figure1.png) - - - -As shown in Figure 2, to define a prediction problem we have to define t=0 by a Target Cohort (T), the outcome we like to predict by an outcome cohort (O), and the time-at-risk (TAR). Furthermore, we have to make design choices for the model we like to develop, and determine the observational datasets to perform internal and external validation. This conceptual framework works for all type of prediction problems, for example those presented in Figure 3. 
+![The prediction problem](Figure1.webp) -![Design choices](studydesign.png) +As shown in Figure 2, to define a prediction problem we have to define t=0 by a Target Cohort (T), the outcome we like to predict by an outcome cohort (O), and the time-at-risk (TAR). Furthermore, we have to make design choices for the model we like to develop, and determine the observational datasets to perform internal and external validation. This conceptual framework works for all type of prediction problems, for example those presented in Figure 3. +![Design choices](studydesign.webp) -![Examples of prediction problems](problems.png) +![Examples of prediction problems](problems.webp) This vignette describes how you can use the `PatientLevelPrediction` package to build patient-level predictive models. The package enables data extraction, model building, and model evaluation using data from databases that are translated into the OMOP CDM. In this vignette we assume you have installed the package correctly using the [`InstallationGuide`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/InstallationGuide.pdf). @@ -72,43 +70,41 @@ We have to clearly specify our study upfront to be able to implement it. This me ## Problem definition 1: Stroke in afibrilation patients -Atrial fibrillation is a disease characterized by an irregular heart rate that can cause poor blood flow. Patients with atrial fibrillation are at increased risk of ischemic stroke. Anticoagulation is a recommended prophylaxis treatment strategy for patients at high risk of stroke, though the underuse of anticoagulants and persistent severity of ischemic stroke represents a substantial unmet medical need. Various strategies have been developed to predict risk of ischemic stroke in patients with atrial fibrillation. CHADS2 (Gage JAMA 2001) was developed as a risk score based on history of congestive heart failure, hypertension, age>=75, diabetes and stroke. 
CHADS2 was initially derived using Medicare claims data, where it achieved good discrimination (AUC=0.82). However, subsequent external validation studies revealed the CHADS2 had substantially lower predictive accuracy (Keogh Thromb Haemost 2011). Subsequent stroke risk calculators have been developed and evaluated, including the extension of CHADS2Vasc. The management of atrial fibrillation has evolved substantially over the last decade, for various reasons that include the introduction of novel oral anticoagulants. With these innovations has come a renewed interest in greater precision medicine for stroke prevention. +Atrial fibrillation is a disease characterized by an irregular heart rate that can cause poor blood flow. Patients with atrial fibrillation are at increased risk of ischemic stroke. Anticoagulation is a recommended prophylaxis treatment strategy for patients at high risk of stroke, though the underuse of anticoagulants and persistent severity of ischemic stroke represents a substantial unmet medical need. Various strategies have been developed to predict risk of ischemic stroke in patients with atrial fibrillation. CHADS2 (Gage JAMA 2001) was developed as a risk score based on history of congestive heart failure, hypertension, age\>=75, diabetes and stroke. CHADS2 was initially derived using Medicare claims data, where it achieved good discrimination (AUC=0.82). However, subsequent external validation studies revealed the CHADS2 had substantially lower predictive accuracy (Keogh Thromb Haemost 2011). Subsequent stroke risk calculators have been developed and evaluated, including the extension of CHADS2Vasc. The management of atrial fibrillation has evolved substantially over the last decade, for various reasons that include the introduction of novel oral anticoagulants. With these innovations has come a renewed interest in greater precision medicine for stroke prevention. 
We will apply the PatientLevelPrediction package to observational healthcare data to address the following patient-level prediction question: - -Amongst patients who are newly diagnosed with Atrial Fibrillation, which patients will go on to have Ischemic Stroke within 1 year? - -We will define 'patients who are newly diagnosed with Atrial Fibrillation' as the first condition record of cardiac arrhythmia, which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias. We will define 'Ischemic stroke events' as ischemic stroke condition records during an inpatient or ER visit; successive records with > 180 day gap are considered independent episodes. + +Amongst patients who are newly diagnosed with Atrial Fibrillation, which patients will go on to have Ischemic Stroke within 1 year? + +We will define 'patients who are newly diagnosed with Atrial Fibrillation' as the first condition record of cardiac arrhythmia, which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias. We will define 'Ischemic stroke events' as ischemic stroke condition records during an inpatient or ER visit; successive records with \> 180 day gap are considered independent episodes. ## Problem definition 2: Angioedema in ACE inhibitor users -Angiotensin converting enzyme inhibitors (ACE inhibitors) are medications used by patients with hypertension that widen the blood vessles and therefore increse the amount of blood pumped by the heart and decreases blood pressure. Ace inhibitors reduce a patients risk of cardiovasular disease but can lead to drug-induced angioedema. 
+Angiotensin converting enzyme inhibitors (ACE inhibitors) are medications used by patients with hypertension that widen the blood vessels and therefore increase the amount of blood pumped by the heart and decreases blood pressure. Ace inhibitors reduce a patient's risk of cardiovascular disease but can lead to drug-induced angioedema. We will apply the PatientLevelPrediction package to observational healthcare data to address the following patient-level prediction question: - -Amongst patients who are newly dispensed an ACE inhibitor, which patients will go on to have angioedema within 1 year? - -We will define 'patients who are newly dispensed an ACE inhibitor' as the first drug record of sny ACE inhibitor, [...]which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias. We will define 'angioedema' as an angioedema condition record. +Amongst patients who are newly dispensed an ACE inhibitor, which patients will go on to have angioedema within 1 year? + +We will define 'patients who are newly dispensed an ACE inhibitor' as the first drug record of any ACE inhibitor, [...]which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias. We will define 'angioedema' as an angioedema condition record. ## Study population definition -The final study population in which we will develop our model is often a subset of the Target population, because we will e.g. apply criteria that are dependent on T and O or we want to do sensitivity analyses with subpopulations of T. 
For this we have to answer the following questions: - -- *What is the minimum amount of observation time we require before the start of the target cohort?* This choice could depend on the available patient time in your training data, but also on the time you expect to be available in the data sources you want to apply the model on in the future. The longer the minimum observation time, the more baseline history time is available for each person to use for feature extraction, but the fewer patients will qualify for analysis. Moreover, there could be clinical reasons to choose a short or longer lookback period. For our example, we will use a prior history as lookback period (washout period). +The final study population in which we will develop our model is often a subset of the Target population, because we will e.g. apply criteria that are dependent on T and O or we want to do sensitivity analyses with subpopulations of T. For this we have to answer the following questions: -- *Can patients enter the target cohort multiple times?* In the target cohort definition, a person may qualify for the cohort multiple times during different spans of time, for example if they had different episodes of a disease or separate periods of exposure to a medical product. The cohort definition does not necessarily apply a restriction to only let the patients enter once, but in the context of a particular patient-level prediction problem, a user may want to restrict the cohort to the first qualifying episode. In our example, a person could only enter the target cohort once since our criteria was based on first occurrence of atrial fibrillation. +- *What is the minimum amount of observation time we require before the start of the target cohort?* This choice could depend on the available patient time in your training data, but also on the time you expect to be available in the data sources you want to apply the model on in the future. 
The longer the minimum observation time, the more baseline history time is available for each person to use for feature extraction, but the fewer patients will qualify for analysis. Moreover, there could be clinical reasons to choose a short or longer lookback period. For our example, we will use a prior history as lookback period (washout period). -- *Do we allow persons to enter the cohort if they experienced the outcome before?* Do we allow persons to enter the target cohort if they experienced the outcome before qualifying for the target cohort? Depending on the particular patient-level prediction problem, there may be a desire to predict ‘incident’ first occurrence of an outcome, in which case patients who have previously experienced the outcome are not ‘at-risk’ for having a first occurrence and therefore should be excluded from the target cohort. In other circumstances, there may be a desire to predict ‘prevalent’ episodes, whereby patients with prior outcomes can be included in the analysis and the prior outcome itself can be a predictor of future outcomes. For our prediction example, the answer to this question is ‘Yes, allow persons with prior outcomes’ because we know from the CHADS2 score that prior strokes are very predictive of future strokes. If this answer would have been ‘No’ we also have to decide how long we would look back for previous occurrences of the outcome. +- *Can patients enter the target cohort multiple times?* In the target cohort definition, a person may qualify for the cohort multiple times during different spans of time, for example if they had different episodes of a disease or separate periods of exposure to a medical product. The cohort definition does not necessarily apply a restriction to only let the patients enter once, but in the context of a particular patient-level prediction problem, a user may want to restrict the cohort to the first qualifying episode. 
In our example, a person could only enter the target cohort once since our criteria was based on first occurrence of atrial fibrillation. -- *How do we define the period in which we will predict our outcome relative to the target cohort start?* We actually have to make two decisions to answer that question. First, does the time-at-risk window start at the date of the start of the target cohort or later? Arguments to make it start later could be that you want to avoid outcomes that were entered late in the record that actually occurred before the start of the target cohort or you want to leave a gap where interventions to prevent the outcome could theoretically be implemented. Second, you need to define the time-at-risk by setting the risk window end, as some specification of days offset relative to the target cohort start or end dates. For our problem we will predict in a ‘time-at-risk’ window starting 1 day after the start of the target cohort up to 365 days later (to look for 1-year risk following atrial fibrillation diagnosis). +- *Do we allow persons to enter the cohort if they experienced the outcome before?* Do we allow persons to enter the target cohort if they experienced the outcome before qualifying for the target cohort? Depending on the particular patient-level prediction problem, there may be a desire to predict ‘incident’ first occurrence of an outcome, in which case patients who have previously experienced the outcome are not ‘at-risk’ for having a first occurrence and therefore should be excluded from the target cohort. In other circumstances, there may be a desire to predict ‘prevalent’ episodes, whereby patients with prior outcomes can be included in the analysis and the prior outcome itself can be a predictor of future outcomes. For our prediction example, the answer to this question is ‘Yes, allow persons with prior outcomes’ because we know from the CHADS2 score that prior strokes are very predictive of future strokes. 
If this answer would have been ‘No’ we also have to decide how long we would look back for previous occurrences of the outcome. +- *How do we define the period in which we will predict our outcome relative to the target cohort start?* We actually have to make two decisions to answer that question. First, does the time-at-risk window start at the date of the start of the target cohort or later? Arguments to make it start later could be that you want to avoid outcomes that were entered late in the record that actually occurred before the start of the target cohort or you want to leave a gap where interventions to prevent the outcome could theoretically be implemented. Second, you need to define the time-at-risk by setting the risk window end, as some specification of days offset relative to the target cohort start or end dates. For our problem we will predict in a ‘time-at-risk’ window starting 1 day after the start of the target cohort up to 365 days later (to look for 1-year risk following atrial fibrillation diagnosis). -- *Do we require a minimum amount of time-at-risk?* We have to decide if we want to include patients that did not experience the outcome but did leave the database earlier than the end of our time-at-risk period. These patients may experience the outcome when we do not observe them. For our prediction problem we decide to answer this question with ‘Yes, require a mimimum time-at-risk’ for that reason. Furthermore, we have to decide if this constraint also applies to persons who experienced the outcome or we will include all persons with the outcome irrespective of their total time at risk. For example, if the outcome is death, then persons with the outcome are likely censored before the full time-at-risk period is complete. +- *Do we require a minimum amount of time-at-risk?* We have to decide if we want to include patients that did not experience the outcome but did leave the database earlier than the end of our time-at-risk period. 
These patients may experience the outcome when we do not observe them. For our prediction problem we decide to answer this question with ‘Yes, require a minimum time-at-risk’ for that reason. Furthermore, we have to decide if this constraint also applies to persons who experienced the outcome or we will include all persons with the outcome irrespective of their total time at risk. For example, if the outcome is death, then persons with the outcome are likely censored before the full time-at-risk period is complete. ## Model development settings -To develop the model we have to decide which algorithm(s) we like to train. We see the selection of the best algorithm for a certain prediction problem as an empirical question, i.e. you need to let the data speak for itself and try different approaches to find the best one. There is no algorithm that will work best for all problems (no free lunch). In our package we therefore aim to implement many algorithms. Furthermore, we made the system modular so you can add your own custom algorithms as described in more detail in the [`AddingCustomModels`](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomModels.pdf) vignette. +To develop the model we have to decide which algorithm(s) we like to train. We see the selection of the best algorithm for a certain prediction problem as an empirical question, i.e. you need to let the data speak for itself and try different approaches to find the best one. There is no algorithm that will work best for all problems (no free lunch). In our package we therefore aim to implement many algorithms. Furthermore, we made the system modular so you can add your own custom algorithms as described in more detail in the [`AddingCustomModels`](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomModels.pdf) vignette. 
Our package currently contains the following algorithms to choose from: @@ -132,81 +128,81 @@ cat(tabl) # output the table in a format good for HTML/PDF/docx conversion Furthermore, we have to decide on the **covariates** that we will use to train our model. This choice can be driven by domain knowledge of available computational resources. In our example, we like to add the Gender, Age, Conditions, Drugs Groups, and Visit Count. We also have to specify in which time windows we will look and we decide to look in year before and any time prior. Finally, we have to define how we will train and test our model on our data, i.e. how we perform **internal validation**. For this we have to decide how we divide our dataset in a training and testing dataset and how we randomly assign patients to these two sets. Dependent on the size of the training set we can decide how much data we like to use for training, typically this is a 75%, 25% split. If you have very large datasets you can use more data for training. To randomly assign patients to the training and testing set, there are two commonly used approaches: - -1. split by person. In this case a random seed is used to assign the patient to either sets. -2. split by time. In this case a time point is used to split the persons, e.g. 75% of the data is before and 25% is after this date. The advantage of this is that you take into consideration that the health care system has changed over time. + +1. split by person. In this case a random seed is used to assign the patient to either sets. +2. split by time. In this case a time point is used to split the persons, e.g. 75% of the data is before and 25% is after this date. The advantage of this is that you take into consideration that the health care system has changed over time. 
We now completely defined our studies and implement them: -- [See example 1: Stroke in afibrilation patients](#example1) -- [See example 2: Agioedema in ACE inhibitor new users](#example2) - +- [See example 1: Stroke in afibrilation patients](#example1) +- [See example 2: Agioedema in ACE inhibitor new users](#example2) + # Example 1: Stroke in afibrilation patients {#example1} ## Study Specification For our first prediction model we decide to start with a Regularized Logistic Regression and will use the default parameters. We will do a 75%-25% split by person. - -Definition | Value --------------------------------| -------------------------------------- | -**Problem Definition** | | -Target Cohort (T) | 'Patients who are newly diagnosed with Atrial Fibrillation' defined as the first condition record of cardiac arrhythmia, which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias. | -Outcome Cohort (O) | 'Ischemic stroke events' defined as ischemic stroke condition records during an inpatient or ER visit; successive records with > 180 day gap are considered independent episodes. | -Time-at-risk (TAR) | 1 day till 365 days from cohort start| -| | -**Population Definition** | | -Washout Period | 1095 | -Enter the target cohort multiple times? | No| -Allow prior outcomes? | Yes | -Start of time-at-risk | 1 day | -End of time-at-risk | 365 days | -Require a minimum amount of time-at-risk? | Yes (364 days) -| | -**Model Development** | | -Algorithm | Regularized Logistic Regression | -Hyper-parameters | variance = 0.01 (Default) -Covariates | Gender, Age, Conditions (ever before, <365), Drugs Groups (ever before, <365), and Visit Count -Data split | 75% train, 25% test. 
Randomly assigned by person - + +| Definition | Value | +|-----------------|-------------------------------------------------------| +| **Problem Definition** | | +| Target Cohort (T) | 'Patients who are newly diagnosed with Atrial Fibrillation' defined as the first condition record of cardiac arrhythmia, which is followed by another cardiac arrhythmia condition record, at least two drug records for a drug used to treat arrhythmias, or a procedure to treat arrhythmias. | +| Outcome Cohort (O) | 'Ischemic stroke events' defined as ischemic stroke condition records during an inpatient or ER visit; successive records with \> 180 day gap are considered independent episodes. | +| Time-at-risk (TAR) | 1 day till 365 days from cohort start | +| | | +| **Population Definition** | | +| Washout Period | 1095 | +| Enter the target cohort multiple times? | No | +| Allow prior outcomes? | Yes | +| Start of time-at-risk | 1 day | +| End of time-at-risk | 365 days | +| Require a minimum amount of time-at-risk? | Yes (364 days) | +| | | +| **Model Development** | | +| Algorithm | Regularized Logistic Regression | +| Hyper-parameters | variance = 0.01 (Default) | +| Covariates | Gender, Age, Conditions (ever before, \<365), Drugs Groups (ever before, \<365), and Visit Count | +| Data split | 75% train, 25% test. Randomly assigned by person | + According to the best practices we need to make a protocol that completely specifies how we plan to execute our study. This protocol will be assessed by the governance boards of the participating data sources in your network study. For this a template could be used but we prefer to automate this process as much as possible by adding functionality to automatically generate study protocol from a study specification. We will discuss this in more detail later. - + ## Study implementation - + Now we have completely design our study we have to implement the study. 
We have to generate the target and outcome cohorts and we need to develop the R code to run against our CDM that will execute the full study. - + ### Cohort instantiation - + For our study we need to know when a person enters the target and outcome cohorts. This is stored in a table on the server that contains the cohort start date and cohort end date for all subjects for a specific cohort definition. This cohort table has a very simple structure as shown below: - -- `cohort_definition_id`, a unique identifier for distinguishing between different types of cohorts, e.g. cohorts of interest and outcome cohorts. -- `subject_id`, a unique identifier corresponding to the `person_id` in the CDM. -- `cohort_start_date`, the date the subject enters the cohort. -- `cohort_end_date`, the date the subject leaves the cohort. - -How do we fill this table according to our cohort definitions? There are two options for this: - -1) use the interactive cohort builder tool in [ATLAS](www.github.com/OHDSI/ATLAS) which can be used to create cohorts based on inclusion criteria and will automatically populate this cohort table. - -2) write your own custom SQL statements to fill the cohort table. + +- `cohort_definition_id`, a unique identifier for distinguishing between different types of cohorts, e.g. cohorts of interest and outcome cohorts. +- `subject_id`, a unique identifier corresponding to the `person_id` in the CDM. +- `cohort_start_date`, the date the subject enters the cohort. +- `cohort_end_date`, the date the subject leaves the cohort. + +How do we fill this table according to our cohort definitions? There are two options for this: + +1) use the interactive cohort builder tool in [ATLAS](www.github.com/OHDSI/ATLAS) which can be used to create cohorts based on inclusion criteria and will automatically populate this cohort table. + +2) write your own custom SQL statements to fill the cohort table. Both methods are described below for our example prediction problem. 
### ATLAS cohort builder -![Target Cohort Atrial Fibrillation](example1/ATLAS_T.png) +![Target Cohort Atrial Fibrillation](example1/ATLAS_T.webp) -ATLAS allows you to define cohorts interactively by specifying cohort entry and cohort exit criteria. Cohort entry criteria involve selecting one or more initial events, which determine the start date for cohort entry, and optionally specifying additional inclusion criteria which filter to the qualifying events. Cohort exit criteria are applied to each cohort entry record to determine the end date when the person's episode no longer qualifies for the cohort. For the outcome cohort the end date is less relevant. As an example, Figure 4 shows how we created the Atrial Fibrillation cohort and Figure 5 shows how we created the stroke cohort in ATLAS. +ATLAS allows you to define cohorts interactively by specifying cohort entry and cohort exit criteria. Cohort entry criteria involve selecting one or more initial events, which determine the start date for cohort entry, and optionally specifying additional inclusion criteria which filter to the qualifying events. Cohort exit criteria are applied to each cohort entry record to determine the end date when the person's episode no longer qualifies for the cohort. For the outcome cohort the end date is less relevant. As an example, Figure 4 shows how we created the Atrial Fibrillation cohort and Figure 5 shows how we created the stroke cohort in ATLAS. 
-![Outcome Cohort Stroke](example1/ATLAS_O.png) +![Outcome Cohort Stroke](example1/ATLAS_O.webp) -The T and O cohorts can be found here: +The T and O cohorts can be found here: -- Atrial Fibrillaton (T): http://www.ohdsi.org/web/atlas/#/cohortdefinition/1769447 -- Stroke (O) : http://www.ohdsi.org/web/atlas/#/cohortdefinition/1769448 +- Atrial Fibrillation (T): +- Stroke (O) : -In depth explanation of cohort creation in ATLAS is out of scope of this vignette but can be found on the OHDSI wiki pages [(link)](http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas). +In depth explanation of cohort creation in ATLAS is out of scope of this vignette but can be found on the OHDSI wiki pages [(link)](http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas). -Note that when a cohort is created in ATLAS the cohortid is needed to extract the data in R. The cohortid can be found at the top of the ATLAS screen, e.g. 1769447 in Figure 4. +Note that when a cohort is created in ATLAS the cohortid is needed to extract the data in R. The cohortid can be found at the top of the ATLAS screen, e.g. 1769447 in Figure 4. ### Custom cohorts @@ -313,10 +309,11 @@ FROM ; ``` + This is parameterized SQL which can be used by the [`SqlRender`](http://github.com/OHDSI/SqlRender) package. We use parameterized SQL so we do not have to pre-specify the names of the CDM and result schemas. That way, if we want to run the SQL on a different schema, we only need to change the parameter values; we do not have to change the SQL code. By also making use of translation functionality in `SqlRender`, we can make sure the SQL code can be run in many different environments. - + To execute this sql against our CDM we first need to tell R how to connect to the server. `PatientLevelPrediction` uses the [`DatabaseConnector`](http://github.com/ohdsi/DatabaseConnector) package, which provides a function called `createConnectionDetails`.
Type `?createConnectionDetails` for the specific settings required for the various database management systems (DBMS). For example, one might connect to a PostgreSQL database using this code: - + ```{r tidy=FALSE,eval=FALSE} connectionDetails <- createConnectionDetails(dbms = "postgresql", server = "localhost/ohdsi", @@ -327,10 +324,9 @@ To execute this sql against our CDM we first need to tell R how to connect to th cohortsDatabaseSchema <- "my_results" cdmVersion <- "5" ``` - + The last three lines define the `cdmDatabaseSchema` and `cohortsDatabaseSchema` variables, as well as the CDM version. We will use these later to tell R where the data in CDM format live, where we want to create the cohorts of interest, and what version CDM is used. Note that for Microsoft SQL Server, databaseschemas need to specify both the database and the schema, so for example `cdmDatabaseSchema <- "my_cdm_data.dbo"`. - - + ```{r tidy=FALSE,eval=FALSE} library(SqlRender) sql <- readSql("AfStrokeCohorts.sql") @@ -344,11 +340,11 @@ The last three lines define the `cdmDatabaseSchema` and `cohortsDatabaseSchema` connection <- connect(connectionDetails) executeSql(connection, sql) ``` - + In this code, we first read the SQL from the file into memory. In the next line, we replace four parameter names with the actual values. We then translate the SQL into the dialect appropriate for the DBMS we already specified in the `connectionDetails`. Next, we connect to the server, and submit the rendered and translated SQL. - + If all went well, we now have a table with the events of interest. We can see how many events per type: - + ```{r tidy=FALSE,eval=FALSE} sql <- paste("SELECT cohort_definition_id, COUNT(*) AS count", "FROM @cohortsDatabaseSchema.AFibStrokeCohort", @@ -357,20 +353,21 @@ If all went well, we now have a table with the events of interest. 
We can see ho sql <- translateSql(sql, targetDialect = connectionDetails$dbms)$sql querySql(connection, sql) - ``` - ```{r echo=FALSE,message=FALSE} - data.frame(cohort_definition_id = c(1, 2),count = c(527616, 221555)) ``` - + +```{r echo=FALSE,message=FALSE} +data.frame(cohort_definition_id = c(1, 2),count = c(527616, 221555)) +``` + ### Study script creation - -In this section we assume that our cohorts have been created either by using ATLAS or a custom SQL script. We will first explain how to create an R script yourself that will execute our study as we have defined earlier. - + +In this section we assume that our cohorts have been created either by using ATLAS or a custom SQL script. We will first explain how to create an R script yourself that will execute our study as we have defined earlier. + ### Data extraction - -Now we can tell `PatientLevelPrediction` to extract all necessary data for our analysis. This is done using the [`FeatureExtractionPackage`](https://github.com/OHDSI/FeatureExtration). In short the FeatureExtractionPackage allows you to specify which features (covariates) need to be extracted, e.g. all conditions and drug exposures. It also supports the creation of custom covariates. For more detailed information on the FeatureExtraction package see its [vignettes](https://github.com/OHDSI/FeatureExtration). For our example study we decided to use these settings: - -```{r tidy=FALSE,eval=FALSE} + +Now we can tell `PatientLevelPrediction` to extract all necessary data for our analysis. This is done using the [`FeatureExtractionPackage`](https://github.com/OHDSI/FeatureExtraction). In short the FeatureExtractionPackage allows you to specify which features (covariates) need to be extracted, e.g. all conditions and drug exposures. It also supports the creation of custom covariates. For more detailed information on the FeatureExtraction package see its [vignettes](https://github.com/OHDSI/FeatureExtraction). 
For our example study we decided to use these settings: + +```{r tidy=FALSE,eval=FALSE} covariateSettings <- createCovariateSettings(useDemographicsGender = TRUE, useDemographicsAge = TRUE, useConditionGroupEraLongTerm = TRUE, @@ -381,9 +378,9 @@ Now we can tell `PatientLevelPrediction` to extract all necessary data for our a longTermStartDays = -365, endDays = -1) ``` - -The final step for extracting the data is to run the `getPlpData` function and input the connection details, the database schema where the cohorts are stored, the cohort definition ids for the cohort and outcome, and the washoutPeriod which is the minimum number of days prior to cohort index date that the person must have been observed to be included into the data, and finally input the previously constructed covariate settings. - + +The final step for extracting the data is to run the `getPlpData` function and input the connection details, the database schema where the cohorts are stored, the cohort definition ids for the cohort and outcome, and the washoutPeriod which is the minimum number of days prior to cohort index date that the person must have been observed to be included into the data, and finally input the previously constructed covariate settings. + ```{r tidy=FALSE,eval=FALSE} databaseDetails <- createDatabaseDetails( @@ -410,26 +407,25 @@ restrictPlpDataSettings <- createRestrictPlpDataSettings(sampleSize = 10000) restrictPlpDataSettings = restrictPlpDataSettings ) ``` - -Note that if the cohorts are created in ATLAS its corresponding cohort database schema needs to be selected. There are many additional parameters for the `createRestrictPlpDataSettings` function which are all documented in the `PatientLevelPrediction` manual. The resulting `plpData` object uses the package `Andromeda` (which uses [SQLite](https://www.sqlite.org/index.html)) to store information in a way that ensures R does not run out of memory, even when the data are large. 
- + +Note that if the cohorts are created in ATLAS its corresponding cohort database schema needs to be selected. There are many additional parameters for the `createRestrictPlpDataSettings` function which are all documented in the `PatientLevelPrediction` manual. The resulting `plpData` object uses the package `Andromeda` (which uses [SQLite](https://www.sqlite.org/index.html)) to store information in a way that ensures R does not run out of memory, even when the data are large. + Creating the `plpData` object can take considerable computing time, and it is probably a good idea to save it for future sessions. Because `plpData` uses `Andromeda`, we cannot use R's regular save function. Instead, we'll have to use the `savePlpData()` function: - + ```{r tidy=TRUE,eval=FALSE} savePlpData(plpData, "stroke_in_af_data") ``` - + We can use the `loadPlpData()` function to load the data in a future session. - - + ### Additional inclusion criteria - + To completely define the prediction problem the final study population is obtained by applying additional constraints on the two earlier defined cohorts, e.g., a minumim time at risk can be enforced (`requireTimeAtRisk, minTimeAtRisk`) and we can specify if this also applies to patients with the outcome (`includeAllOutcomes`). Here we also specify the start and end of the risk window relative to target cohort start. For example, if we like the risk window to start 30 days after the at-risk cohort start and end a year later we can set `riskWindowStart = 30` and `riskWindowEnd = 365`. In some cases the risk window needs to start at the cohort end date. This can be achieved by setting `addExposureToStart = TRUE` which adds the cohort (exposure) time to the start date. - + In Appendix 1, we demonstrate the effect of these settings on the subset of the persons in the target cohort that end up in the final study population. 
- + In the example below all the settings we defined for our study are imposed: - + ```{r tidy=FALSE,eval=FALSE} populationSettings <- createStudyPopulationSettings( washoutPeriod = 1095, @@ -446,14 +442,13 @@ In the example below all the settings we defined for our study are imposed: ) ``` - ### Spliting the data into training/validation/testing datasets -When developing a prediction model using supervised learning (when you have features paired with labels for a set of patients), the first step is to design the development/internal validation process. This requires specifying how to select the model hyper-parameters, how to learn the model parameters and how to fairly evaluate the model. In general, the validation set is used to pick hyper-parameters, the training set is used to learn the model parameters and the test set is used to perform fair internal validation. However, cross-validation can be implemented to pick the hyper-parameters on the training data (so a validation data set is not required). Cross validation can also be used to estimate internal validation (so a testing data set is not required). +When developing a prediction model using supervised learning (when you have features paired with labels for a set of patients), the first step is to design the development/internal validation process. This requires specifying how to select the model hyper-parameters, how to learn the model parameters and how to fairly evaluate the model. In general, the validation set is used to pick hyper-parameters, the training set is used to learn the model parameters and the test set is used to perform fair internal validation. However, cross-validation can be implemented to pick the hyper-parameters on the training data (so a validation data set is not required). Cross validation can also be used to estimate internal validation (so a testing data set is not required). -In small data the best approach for internal validation has been shown to be boostrapping. 
However, in big data (many patients and many features) bootstrapping is generally not feasible. In big data our research has shown that it is just important to have some form of fair evaluation (use a test set or cross validation). For full details see [our BMJ open paper](add link). +In small data the best approach for internal validation has been shown to be bootstrapping. However, in big data (many patients and many features) bootstrapping is generally not feasible. In big data our research has shown that it is just important to have some form of fair evaluation (use a test set or cross validation). For full details see [our BMJ open paper](add%20link). -In the PatientLevelPrediction package, the splitSettings define how the plpData are partitioned into training/validation/testing data. Cross validation is always done, but using a test set is optional (when the data are small, it may be optimal to not use a test set). For the splitSettings we can use the type (stratified/time/subject) and testFraction parameters to split the data in a 75%-25% split and run the patient-level prediction pipeline: +In the PatientLevelPrediction package, the splitSettings define how the plpData are partitioned into training/validation/testing data. Cross validation is always done, but using a test set is optional (when the data are small, it may be optimal to not use a test set). For the splitSettings we can use the type (stratified/time/subject) and testFraction parameters to split the data in a 75%-25% split and run the patient-level prediction pipeline: ```{r tidy=FALSE,eval=FALSE} splitSettings <- createDefaultSplitSetting( @@ -469,20 +464,17 @@ Note: it is possible to add a custom method to specify how the plpData are parti ### Preprocessing the training data -There a numerous data processing settings that a user must specify when developing a prediction model.
These are: - * Whether to under-sample or over-sample the training data (this may be useful when there is class imballance (e.g., the outcome is very rare or very common)) - * Whether to perform feature engineering or feature selection (e.g., create latent variables that are not observed in the data or reduce the dimensionality of the data) - * Whether to remove redundant features and normalize the data (this is required for some models) - -The default sample settings does nothing, it simply returns the trainData as input, see below: +There are numerous data processing settings that a user must specify when developing a prediction model. These are: \* Whether to under-sample or over-sample the training data (this may be useful when there is class imbalance (e.g., the outcome is very rare or very common)) \* Whether to perform feature engineering or feature selection (e.g., create latent variables that are not observed in the data or reduce the dimensionality of the data) \* Whether to remove redundant features and normalize the data (this is required for some models) + +The default sample settings does nothing, it simply returns the trainData as input, see below: ```{r tidy=FALSE,eval=FALSE} sampleSettings <- createSampleSettings() ``` -However, the current package contains methods of under-sampling the non-outcome patients. To perform undersampling, the `type` input should be 'underSample' and `numberOutcomestoNonOutcomes` must be specified (an integer specifying the number of non-outcomes per outcome). It is possible to add any custom function for over/under sampling, see [vignette for custom sampling](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSamples.pdf). +However, the current package contains methods of under-sampling the non-outcome patients. To perform undersampling, the `type` input should be 'underSample' and `numberOutcomestoNonOutcomes` must be specified (an integer specifying the number of non-outcomes per outcome).
It is possible to add any custom function for over/under sampling, see [vignette for custom sampling](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSamples.pdf). -It is possible to specify a combination of feature engineering functions that take as input the trainData and output a new trainData with different features. The default feature engineering setting does nothing: +It is possible to specify a combination of feature engineering functions that take as input the trainData and output a new trainData with different features. The default feature engineering setting does nothing: ```{r tidy=FALSE,eval=FALSE} featureEngineeringSettings <- createFeatureEngineeringSettings() @@ -490,7 +482,7 @@ It is possible to specify a combination of feature engineering functions that ta However, it is possible to add custom feature engineering functions into the pipeline, see [vignette for custom feature engineering](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomFeatureEngineering.pdf). -Finally, the preprocessing setting is required. For this setting the user can define `minFraction`, this removes any features that is observed in the training data for less than 0.01 fraction of the patients. So, if `minFraction = 0.01` then any feature that is seen in less than 1 percent of the target population is removed. The input `normalize` specifies whether the features are scaled between 0 and 1, this is required for certain models (e.g., LASSO logistic regression). The input `removeRedundancy` specifies whether features that are observed in all of the target population are removed. +Finally, the preprocessing setting is required. For this setting the user can define `minFraction`, this removes any features that is observed in the training data for less than 0.01 fraction of the patients. So, if `minFraction = 0.01` then any feature that is seen in less than 1 percent of the target population is removed. 
The input `normalize` specifies whether the features are scaled between 0 and 1, this is required for certain models (e.g., LASSO logistic regression). The input `removeRedundancy` specifies whether features that are observed in all of the target population are removed. ```{r tidy=FALSE,eval=FALSE} preprocessSettingsSettings <- createPreprocessSettings( @@ -501,17 +493,17 @@ Finally, the preprocessing setting is required. For this setting the user can d ``` ### Model Development - + In the set function of an algorithm the user can specify a list of eligible values for each hyper-parameter. All possible combinations of the hyper-parameters are included in a so-called grid search using cross-validation on the training set. If a user does not specify any value then the default value is used instead. - -For example, if we use the following settings for the gradientBoostingMachine: ntrees=c(100,200), maxDepth=4 the grid search will apply the gradient boosting machine algorithm with ntrees=100 and maxDepth=4 plus the default settings for other hyper-parameters and ntrees=200 and maxDepth=4 plus the default settings for other hyper-parameters. The hyper-parameters that lead to the bestcross-validation performance will then be chosen for the final model. For our problem we choose to build a logistic regression model with the default hyper-parameters - + +For example, if we use the following settings for the gradientBoostingMachine: ntrees=c(100,200), maxDepth=4 the grid search will apply the gradient boosting machine algorithm with ntrees=100 and maxDepth=4 plus the default settings for other hyper-parameters and ntrees=200 and maxDepth=4 plus the default settings for other hyper-parameters. The hyper-parameters that lead to the best cross-validation performance will then be chosen for the final model.
For our problem we choose to build a logistic regression model with the default hyper-parameters + ```{r tidy=TRUE,eval=FALSE} lrModel <- setLassoLogisticRegression() ``` - -The `runPlP` function requires the `plpData`, the `outcomeId` specifying the outcome being predicted and the settings: `populationSettings`, `splitSettings`, `sampleSettings`, `featureEngineeringSettings`, `preprocessSettings` and `modelSettings` to train and evaluate the model. - + +The `runPlP` function requires the `plpData`, the `outcomeId` specifying the outcome being predicted and the settings: `populationSettings`, `splitSettings`, `sampleSettings`, `featureEngineeringSettings`, `preprocessSettings` and `modelSettings` to train and evaluate the model. + ```{r tidy=FALSE,eval=FALSE} lrResults <- runPlp( plpData = plpData, @@ -536,104 +528,106 @@ The `runPlP` function requires the `plpData`, the `outcomeId` specifying the out saveDirectory = file.path(getwd(), 'singlePlp') ) ``` -Under the hood the package will now use the [`Cyclops`](www.github.com/OHDSI/Cyclops) package to fit a large-scale regularized regression using 75% of the data and will evaluate the model on the remaining 25%. A results data structure is returned containing information about the model, its performance etc. - + +Under the hood the package will now use the [`Cyclops`](www.github.com/OHDSI/Cyclops) package to fit a large-scale regularized regression using 75% of the data and will evaluate the model on the remaining 25%. A results data structure is returned containing information about the model, its performance etc. 
+ You can save the model using: - + ```{r tidy=TRUE,eval=FALSE} savePlpModel(lrResults$model, dirPath = file.path(getwd(), "model")) ``` - + You can load the model using: - + ```{r tidy=TRUE,eval=FALSE} plpModel <- loadPlpModel(file.path(getwd(),'model')) ``` - + You can also save the full results structure using: - + ```{r tidy=TRUE,eval=FALSE} savePlpResult(lrResults, location = file.path(getwd(),'lr')) ``` - + To load the full results structure use: - + ```{r tidy=TRUE,eval=FALSE} lrResults <- loadPlpResult(file.path(getwd(),'lr')) ``` \newpage + # Example 2: Angioedema in ACE inhibitor users {#example2} ## Study Specification - - Definition | Value - -------------------------------| -------------------------------------- | - **Problem Definition** | | - Target Cohort (T) | 'Patients who are newly dispensed an ACE inhibitor' defined as the first drug record of any ACE inhibitor | - Outcome Cohort (O) | 'Angioedema' defined as an angioedema condition record during an inpatient or ER visit| - Time-at-risk (TAR) | 1 day till 365 days from cohort start| - | | - **Population Definition** | | - Washout Period | 365 | - Enter the target cohort multiple times? | No| - Allow prior outcomes? | No | - Start of time-at-risk | 1 day | - End of time-at-risk | 365 days | - Require a minimum amount of time-at-risk? | Yes (364 days) - | | - **Model Development** | | - Algorithm | Gradient Boosting Machine | - Hyper-parameters | ntree:5000, max depth:4 or 7 or 10 and learning rate: 0.001 or 0.01 or 0.1 or 0.9 - Covariates | Gender, Age, Conditions (ever before, <365), Drugs Groups (ever before, <365), and Visit Count - Data split | 75% train, 25% test. Randomly assigned by person - - According to the best practices we need to make a protocol that completely specifies how we plan to execute our study. This protocol will be assessed by the governance boards of the participating data sources in your network study. 
For this a template could be used but we prefer to automate this process as much as possible by adding functionality to automatically generate study protocol from a study specification. We will discuss this in more detail later. - + +| Definition | Value | +|----------------------|--------------------------------------------------| +| **Problem Definition** | | +| Target Cohort (T) | 'Patients who are newly dispensed an ACE inhibitor' defined as the first drug record of any ACE inhibitor | +| Outcome Cohort (O) | 'Angioedema' defined as an angioedema condition record during an inpatient or ER visit | +| Time-at-risk (TAR) | 1 day till 365 days from cohort start | +| | | +| **Population Definition** | | +| Washout Period | 365 | +| Enter the target cohort multiple times? | No | +| Allow prior outcomes? | No | +| Start of time-at-risk | 1 day | +| End of time-at-risk | 365 days | +| Require a minimum amount of time-at-risk? | Yes (364 days) | +| | | +| **Model Development** | | +| Algorithm | Gradient Boosting Machine | +| Hyper-parameters | ntree:5000, max depth:4 or 7 or 10 and learning rate: 0.001 or 0.01 or 0.1 or 0.9 | +| Covariates | Gender, Age, Conditions (ever before, \<365), Drugs Groups (ever before, \<365), and Visit Count | +| Data split | 75% train, 25% test. Randomly assigned by person | + +According to the best practices we need to make a protocol that completely specifies how we plan to execute our study. This protocol will be assessed by the governance boards of the participating data sources in your network study. For this a template could be used but we prefer to automate this process as much as possible by adding functionality to automatically generate study protocol from a study specification. We will discuss this in more detail later. + ## Study implementation - + Now we have completely design our study we have to implement the study. 
We have to generate the target and outcome cohorts and we need to develop the R code to run against our CDM that will execute the full study. - + ### Cohort instantiation - + For our study we need to know when a person enters the target and outcome cohorts. This is stored in a table on the server that contains the cohort start date and cohort end date for all subjects for a specific cohort definition. This cohort table has a very simple structure as shown below: - -- `cohort_definition_id`, a unique identifier for distinguishing between different types of cohorts, e.g. cohorts of interest and outcome cohorts. -- `subject_id`, a unique identifier corresponding to the `person_id` in the CDM. -- `cohort_start_date`, the date the subject enters the cohort. -- `cohort_end_date`, the date the subject leaves the cohort. - -How do we fill this table according to our cohort definitions? There are two options for this: - -1) use the interactive cohort builder tool in [ATLAS](www.github.com/OHDSI/ATLAS) which can be used to create cohorts based on inclusion criteria and will automatically populate this cohort table. - -2) write your own custom SQL statements to fill the cohort table. - + +- `cohort_definition_id`, a unique identifier for distinguishing between different types of cohorts, e.g. cohorts of interest and outcome cohorts. +- `subject_id`, a unique identifier corresponding to the `person_id` in the CDM. +- `cohort_start_date`, the date the subject enters the cohort. +- `cohort_end_date`, the date the subject leaves the cohort. + +How do we fill this table according to our cohort definitions? There are two options for this: + +1) use the interactive cohort builder tool in [ATLAS](www.github.com/OHDSI/ATLAS) which can be used to create cohorts based on inclusion criteria and will automatically populate this cohort table. + +2) write your own custom SQL statements to fill the cohort table. + Both methods are described below for our example prediction problem. 
- + ### ATLAS cohort builder - -![Target Cohort ACE inhibitors](example2/aceinhibitors.png) - -ATLAS allows you to define cohorts interactively by specifying cohort entry and cohort exit criteria. Cohort entry criteria involve selecting one or more initial events, which determine the start date for cohort entry, and optionally specifying additional inclusion criteria which filter to the qualifying events. Cohort exit criteria are applied to each cohort entry record to determine the end date when the person's episode no longer qualifies for the cohort. For the outcome cohort the end date is less relevant. As an example, Figure 6 shows how we created the ACE inhibitors cohort and Figure 7 shows how we created the angioedema cohort in ATLAS. - -![Outcome Cohort Angioedema](example2/angioedema.png) - -The T and O cohorts can be found here: - -- Ace inhibitors (T): http://www.ohdsi.org/web/atlas/#/cohortdefinition/1770617 -- Angioedema (O) : http://www.ohdsi.org/web/atlas/#/cohortdefinition/1770616 - -In depth explanation of cohort creation in ATLAS is out of scope of this vignette but can be found on the OHDSI wiki pages [(link)](http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas). - -Note that when a cohort is created in ATLAS the cohortid is needed to extract the data in R. The cohortid can be found at the top of the ATLAS screen, e.g. 1770617 in Figure 6. - + +![Target Cohort ACE inhibitors](example2/aceinhibitors.webp) + +ATLAS allows you to define cohorts interactively by specifying cohort entry and cohort exit criteria. Cohort entry criteria involve selecting one or more initial events, which determine the start date for cohort entry, and optionally specifying additional inclusion criteria which filter to the qualifying events. Cohort exit criteria are applied to each cohort entry record to determine the end date when the person's episode no longer qualifies for the cohort. For the outcome cohort the end date is less relevant. 
As an example, Figure 6 shows how we created the ACE inhibitors cohort and Figure 7 shows how we created the angioedema cohort in ATLAS. + +![Outcome Cohort Angioedema](example2/angioedema.webp) + +The T and O cohorts can be found here: + +- Ace inhibitors (T): +- Angioedema (O) : + +In depth explanation of cohort creation in ATLAS is out of scope of this vignette but can be found on the OHDSI wiki pages [(link)](http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas). + +Note that when a cohort is created in ATLAS the cohortid is needed to extract the data in R. The cohortid can be found at the top of the ATLAS screen, e.g. 1770617 in Figure 6. + ### Custom cohorts - + It is also possible to create cohorts without the use of ATLAS. Using custom cohort code (SQL) you can make more advanced cohorts if needed. - + For our example study, we need to create at table to hold the cohort data and we need to create SQL code to instantiate this table for both the AF and Stroke cohorts. Therefore, we create a file called *AceAngioCohorts.sql* with the following contents: - + ```{sql, eval=FALSE} /*********************************** File AceAngioCohorts.sql @@ -711,9 +705,9 @@ For our example study, we need to create at table to hold the cohort data and we ``` This is parameterized SQL which can be used by the [`SqlRender`](http://github.com/OHDSI/SqlRender) package. We use parameterized SQL so we do not have to pre-specify the names of the CDM and result schemas. That way, if we want to run the SQL on a different schema, we only need to change the parameter values; we do not have to change the SQL code. By also making use of translation functionality in `SqlRender`, we can make sure the SQL code can be run in many different environments. - + To execute this sql against our CDM we first need to tell R how to connect to the server. 
`PatientLevelPrediction` uses the [`DatabaseConnector`](http://github.com/ohdsi/DatabaseConnector) package, which provides a function called `createConnectionDetails`. Type `?createConnectionDetails` for the specific settings required for the various database management systems (DBMS). For example, one might connect to a PostgreSQL database using this code: - + ```{r tidy=FALSE,eval=FALSE} connectionDetails <- createConnectionDetails(dbms = "postgresql", server = "localhost/ohdsi", @@ -724,10 +718,9 @@ To execute this sql against our CDM we first need to tell R how to connect to th cohortsDatabaseSchema <- "my_results" cdmVersion <- "5" ``` - + The last three lines define the `cdmDatabaseSchema` and `cohortsDatabaseSchema` variables, as well as the CDM version. We will use these later to tell R where the data in CDM format live, where we want to create the cohorts of interest, and what version CDM is used. Note that for Microsoft SQL Server, databaseschemas need to specify both the database and the schema, so for example `cdmDatabaseSchema <- "my_cdm_data.dbo"`. - - + ```{r tidy=FALSE,eval=FALSE} library(SqlRender) sql <- readSql("AceAngioCohorts.sql") @@ -739,11 +732,11 @@ The last three lines define the `cdmDatabaseSchema` and `cohortsDatabaseSchema` connection <- connect(connectionDetails) executeSql(connection, sql) ``` - + In this code, we first read the SQL from the file into memory. In the next line, we replace four parameter names with the actual values. We then translate the SQL into the dialect appropriate for the DBMS we already specified in the `connectionDetails`. Next, we connect to the server, and submit the rendered and translated SQL. - + If all went well, we now have a table with the events of interest. 
We can see how many events per type: - + ```{r tidy=FALSE,eval=FALSE} sql <- paste("SELECT cohort_definition_id, COUNT(*) AS count", "FROM @cohortsDatabaseSchema.AceAngioCohort", @@ -753,19 +746,20 @@ If all went well, we now have a table with the events of interest. We can see ho querySql(connection, sql) ``` + ```{r echo=FALSE,message=FALSE} data.frame(cohort_definition_id = c(1, 2),count = c(0, 0)) ``` - + ### Study script creation - -In this section we assume that our cohorts have been created either by using ATLAS or a custom SQL script. We will first explain how to create an R script yourself that will execute our study as we have defined earlier. - + +In this section we assume that our cohorts have been created either by using ATLAS or a custom SQL script. We will first explain how to create an R script yourself that will execute our study as we have defined earlier. + ### Data extraction - -Now we can tell `PatientLevelPrediction` to extract all necessary data for our analysis. This is done using the [`FeatureExtractionPackage`](https://github.com/OHDSI/FeatureExtration). In short the FeatureExtractionPackage allows you to specify which features (covariates) need to be extracted, e.g. all conditions and drug exposures. It also supports the creation of custom covariates. For more detailed information on the FeatureExtraction package see its [vignettes](https://github.com/OHDSI/FeatureExtration). For our example study we decided to use these settings: - -```{r tidy=FALSE,eval=FALSE} + +Now we can tell `PatientLevelPrediction` to extract all necessary data for our analysis. This is done using the [`FeatureExtractionPackage`](https://github.com/OHDSI/FeatureExtraction). In short the FeatureExtractionPackage allows you to specify which features (covariates) need to be extracted, e.g. all conditions and drug exposures. It also supports the creation of custom covariates. 
For more detailed information on the FeatureExtraction package see its [vignettes](https://github.com/OHDSI/FeatureExtraction). For our example study we decided to use these settings: + +```{r tidy=FALSE,eval=FALSE} covariateSettings <- createCovariateSettings(useDemographicsGender = TRUE, useDemographicsAge = TRUE, useConditionGroupEraLongTerm = TRUE, @@ -776,9 +770,9 @@ Now we can tell `PatientLevelPrediction` to extract all necessary data for our a longTermStartDays = -365, endDays = -1) ``` - -The final step for extracting the data is to run the `getPlpData` function and input the connection details, the database schema where the cohorts are stored, the cohort definition ids for the cohort and outcome, and the washoutPeriod which is the minimum number of days prior to cohort index date that the person must have been observed to be included into the data, and finally input the previously constructed covariate settings. - + +The final step for extracting the data is to run the `getPlpData` function and input the connection details, the database schema where the cohorts are stored, the cohort definition ids for the cohort and outcome, and the washoutPeriod which is the minimum number of days prior to cohort index date that the person must have been observed to be included into the data, and finally input the previously constructed covariate settings. + ```{r tidy=FALSE,eval=FALSE} databaseDetails <- createDatabaseDetails( @@ -803,25 +797,25 @@ plpData <- getPlpData( ) ``` - -Note that if the cohorts are created in ATLAS its corresponding cohort database schema needs to be selected. There are many additional parameters for the `getPlpData` function which are all documented in the `PatientLevelPrediction` manual. The resulting `plpData` object uses the package `ff` to store information in a way that ensures R does not run out of memory, even when the data are large. 
- + +Note that if the cohorts are created in ATLAS its corresponding cohort database schema needs to be selected. There are many additional parameters for the `getPlpData` function which are all documented in the `PatientLevelPrediction` manual. The resulting `plpData` object uses the package `ff` to store information in a way that ensures R does not run out of memory, even when the data are large. + Creating the `plpData` object can take considerable computing time, and it is probably a good idea to save it for future sessions. Because `plpData` uses `ff`, we cannot use R's regular save function. Instead, we'll have to use the `savePlpData()` function: - + ```{r tidy=TRUE,eval=FALSE} savePlpData(plpData, "angio_in_ace_data") ``` - + We can use the `loadPlpData()` function to load the data in a future session. - + ### Additional inclusion criteria - + To completely define the prediction problem the final study population is obtained by applying additional constraints on the two earlier defined cohorts, e.g., a minumim time at risk can be enforced (`requireTimeAtRisk, minTimeAtRisk`) and we can specify if this also applies to patients with the outcome (`includeAllOutcomes`). Here we also specify the start and end of the risk window relative to target cohort start. For example, if we like the risk window to start 30 days after the at-risk cohort start and end a year later we can set `riskWindowStart = 30` and `riskWindowEnd = 365`. In some cases the risk window needs to start at the cohort end date. This can be achieved by setting `addExposureToStart = TRUE` which adds the cohort (exposure) time to the start date. - + In Appendix 1, we demonstrate the effect of these settings on the subset of the persons in the target cohort that end up in the final study population. 
- + In the example below all the settings we defined for our study are imposed: - + ```{r tidy=FALSE,eval=FALSE} populationSettings <- createStudyPopulationSettings( washoutPeriod = 364, @@ -840,11 +834,11 @@ In the example below all the settings we defined for our study are imposed: ### Spliting the data into training/validation/testing datasets -When developing a prediction model using supervised learning (when you have features paired with labels for a set of patients), the first step is to design the development/internal validation process. This requires specifying how to select the model hyper-parameters, how to learn the model parameters and how to fairly evaluate the model. In general, the validation set is used to pick hyper-parameters, the training set is used to learn the model parameters and the test set is used to perform fair internal validation. However, cross-validation can be implemented to pick the hyper-parameters on the training data (so a validation data set is not required). Cross validation can also be used to estimate internal validation (so a testing data set is not required). +When developing a prediction model using supervised learning (when you have features paired with labels for a set of patients), the first step is to design the development/internal validation process. This requires specifying how to select the model hyper-parameters, how to learn the model parameters and how to fairly evaluate the model. In general, the validation set is used to pick hyper-parameters, the training set is used to learn the model parameters and the test set is used to perform fair internal validation. However, cross-validation can be implemented to pick the hyper-parameters on the training data (so a validation data set is not required). Cross validation can also be used to estimate internal validation (so a testing data set is not required). -In small data the best approach for internal validation has been shown to be boostrapping. 
However, in big data (many patients and many features) bootstrapping is generally not feasible. In big data our research has shown that it is just important to have some form of fair evaluation (use a test set or cross validation). For full details see [our BMJ open paper](add link). +In small data the best approach for internal validation has been shown to be boostrapping. However, in big data (many patients and many features) bootstrapping is generally not feasible. In big data our research has shown that it is just important to have some form of fair evaluation (use a test set or cross validation). For full details see [our BMJ open paper](add%20link). -In the PatientLevelPrediction package, the splitSettings define how the plpData are partitioned into training/validation/testing data. Cross validation is always done, but using a test set is optional (when the data are small, it may be optimal to not use a test set). For the splitSettings we can use the type (stratified/time/subject) and testFraction parameters to split the data in a 75%-25% split and run the patient-level prediction pipeline: +In the PatientLevelPrediction package, the splitSettings define how the plpData are partitioned into training/validation/testing data. Cross validation is always done, but using a test set is optional (when the data are small, it may be optimal to not use a test set). 
For the splitSettings we can use the type (stratified/time/subject) and testFraction parameters to split the data in a 75%-25% split and run the patient-level prediction pipeline: ```{r tidy=FALSE,eval=FALSE} splitSettings <- createDefaultSplitSetting( @@ -856,24 +850,21 @@ In the PatientLevelPrediction package, the splitSettings define how the plpData ) ``` -Note: it is possible to add a custom method to specify how the plpData are partitioned into training/validation/testing data, see [vignette for custom splitting](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSplitting.pdf). +Note: it is possible to add a custom method to specify how the plpData are partitioned into training/validation/testing data, see [vignette for custom splitting](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSplitting.pdf). ### Preprocessing the training data -There a numerous data processing settings that a user must specify when developing a prediction model. These are: - * Whether to under-sample or over-sample the training data (this may be useful when there is class imballance (e.g., the outcome is very rare or very common)) - * Whether to perform feature engineering or feature selection (e.g., create latent variables that are not observed in the data or reduce the dimensionality of the data) - * Whether to remove redundant features and normalize the data (this is required for some models) - -The default sample settings does nothing, it simply returns the trainData as input, see below: +There a numerous data processing settings that a user must specify when developing a prediction model. 
These are: \* Whether to under-sample or over-sample the training data (this may be useful when there is class imballance (e.g., the outcome is very rare or very common)) \* Whether to perform feature engineering or feature selection (e.g., create latent variables that are not observed in the data or reduce the dimensionality of the data) \* Whether to remove redundant features and normalize the data (this is required for some models) + +The default sample settings does nothing, it simply returns the trainData as input, see below: ```{r tidy=FALSE,eval=FALSE} sampleSettings <- createSampleSettings() ``` -However, the current package contains methods of under-sampling the non-outcome patients. To perform undersampling, the `type` input should be 'underSample' and `numberOutcomestoNonOutcomes` must be specified (an integer specifying the number of non-outcomes per outcome). It is possible to add any custom function for over/under sampling, see [vignette for custom sampling](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSamples.pdf). +However, the current package contains methods of under-sampling the non-outcome patients. To perform undersampling, the `type` input should be 'underSample' and `numberOutcomestoNonOutcomes` must be specified (an integer specifying the number of non-outcomes per outcome). It is possible to add any custom function for over/under sampling, see [vignette for custom sampling](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSamples.pdf). -It is possible to specify a combination of feature engineering functions that take as input the trainData and output a new trainData with different features. The default feature engineering setting does nothing: +It is possible to specify a combination of feature engineering functions that take as input the trainData and output a new trainData with different features. 
The default feature engineering setting does nothing: ```{r tidy=FALSE,eval=FALSE} featureEngineeringSettings <- createFeatureEngineeringSettings() @@ -881,7 +872,7 @@ It is possible to specify a combination of feature engineering functions that ta However, it is possible to add custom feature engineering functions into the pipeline, see [vignette for custom feature engineering](https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomfeatureEngineering.pdf). -Finally, the preprocessing setting is required. For this setting the user can define `minFraction`, this removes any features that is observed in the training data for less than 0.01 fraction of the patients. So, if `minFraction = 0.01` then any feature that is seen in less than 1 percent of the target population is removed. The input `normalize` specifies whether the features are scaled between 0 and 1, this is required for certain models (e.g., LASSO logistic regression). The input `removeRedundancy` specifies whether features that are observed in all of the target population are removed. +Finally, the preprocessing setting is required. For this setting the user can define `minFraction`, this removes any features that is observed in the training data for less than 0.01 fraction of the patients. So, if `minFraction = 0.01` then any feature that is seen in less than 1 percent of the target population is removed. The input `normalize` specifies whether the features are scaled between 0 and 1, this is required for certain models (e.g., LASSO logistic regression). The input `removeRedundancy` specifies whether features that are observed in all of the target population are removed. ```{r tidy=FALSE,eval=FALSE} preprocessSettingsSettings <- createPreprocessSettings( @@ -892,11 +883,11 @@ Finally, the preprocessing setting is required. 
For this setting the user can d ``` ### Model Development - + In the set function of an algorithm the user can specify a list of eligible values for each hyper-parameter. All possible combinations of the hyper-parameters are included in a so-called grid search using cross-validation on the training set. If a user does not specify any value then the default value is used instead. - -For example, if we use the following settings for the gradientBoostingMachine: ntrees=c(100,200), maxDepth=4 the grid search will apply the gradient boosting machine algorithm with ntrees=100 and maxDepth=4 plus the default settings for other hyper-parameters and ntrees=200 and maxDepth=4 plus the default settings for other hyper-parameters. The hyper-parameters that lead to the bestcross-validation performance will then be chosen for the final model. For our problem we choose to build a logistic regression model with the default hyper-parameters - + +For example, if we use the following settings for the gradientBoostingMachine: ntrees=c(100,200), maxDepth=4 the grid search will apply the gradient boosting machine algorithm with ntrees=100 and maxDepth=4 plus the default settings for other hyper-parameters and ntrees=200 and maxDepth=4 plus the default settings for other hyper-parameters. The hyper-parameters that lead to the bestcross-validation performance will then be chosen for the final model. 
For our problem we choose to build a logistic regression model with the default hyper-parameters + ```{r tidy=TRUE,eval=FALSE} gbmModel <- setGradientBoostingMachine( ntrees = 5000, @@ -904,9 +895,9 @@ For example, if we use the following settings for the gradientBoostingMachine: n learnRate = c(0.001,0.01,0.1,0.9) ) ``` - -The `runPlP` function requires the `plpData`, the `outcomeId` specifying the outcome being predicted and the settings: `populationSettings`, `splitSettings`, `sampleSettings`, `featureEngineeringSettings`, `preprocessSettings` and `modelSettings` to train and evaluate the model. - + +The `runPlP` function requires the `plpData`, the `outcomeId` specifying the outcome being predicted and the settings: `populationSettings`, `splitSettings`, `sampleSettings`, `featureEngineeringSettings`, `preprocessSettings` and `modelSettings` to train and evaluate the model. + ```{r tidy=FALSE,eval=FALSE} gbmResults <- runPlp( plpData = plpData, @@ -932,79 +923,103 @@ The `runPlP` function requires the `plpData`, the `outcomeId` specifying the out ) ``` -Under the hood the package will now use the R xgboost package to fit a a gradient boosting machine model using 75% of the data and will evaluate the model on the remaining 25%. A results data structure is returned containing information about the model, its performance etc. - +Under the hood the package will now use the R xgboost package to fit a a gradient boosting machine model using 75% of the data and will evaluate the model on the remaining 25%. A results data structure is returned containing information about the model, its performance etc. 
+ You can save the model using: - + ```{r tidy=TRUE,eval=FALSE} savePlpModel(gbmResults$model, dirPath = file.path(getwd(), "model")) ``` - + You can load the model using: - + ```{r tidy=TRUE,eval=FALSE} plpModel <- loadPlpModel(file.path(getwd(),'model')) ``` - + You can also save the full results structure using: - + ```{r tidy=TRUE,eval=FALSE} savePlpResult(gbmResults, location = file.path(getwd(),'gbm')) ``` - + To load the full results structure use: - + ```{r tidy=TRUE,eval=FALSE} gbmResults <- loadPlpResult(file.path(getwd(),'gbm')) ``` - - - - + \newpage -# Study package creation - + +# Study package creation + The script we created manually above can also be automatically created using a powerful feature in ATLAS. By creating a new prediction study (left menu) you can select the Target and Outcome as created in ATLAS, set all the study parameters, and then you can download a R package that you can use to execute your study. What is really powerful is that you can add multiple Ts, Os, covariate settings etc. The package will then run all the combinations of automatically as separate analyses. The screenshots below explain this process. - + 1)
+
Create a new prediction study and select your target and outcome cohorts.
-
![](atlasplp1.png)
+ +
![](atlasplp1.webp)
+
- + 2)
-
Specify one or more analysis settings.
-
![](atlasplp2.png)
-
- \newpage + +
Specify one or more analysis settings.
+ +
![](atlasplp2.webp)
+ + + + \newpage + 3)
-
Specify the trainings settigns
-
![](atlasplp3.png)
-
- + +
Specify the training settings
+ +
![](atlasplp3.webp)
+ + + 4)
-
Specify the execution settings
-
![](atlasplp4.png)
-
- + +
Specify the execution settings
+ +
![](atlasplp4.webp)
+ + +] \newpage -ATLAS can build a R package for you that will execute the full study against you CDM. -Below the steps are explained how to do this in ATLAS. -1)
-
Under utilities you can find download. Click on the button to review the full study specification
-
![R package download functionality in ATLAS](atlasdownload1.png)
-
- -2)
-
You now have to review that you indeed want to run all these analyses (cartesian product of all the settings for each T and O combination.
-
![R package download functionality in ATLAS](atlasdownload2.png)
-
- -3) If you agree, you give the package a name, and download the package as a zipfile. +ATLAS can build a R package for you that will execute the full study against you CDM. Below the steps are explained how to do this in ATLAS. + +1)
+ +
Under utilities you can find download. Click on the button to review the full study specification
-4) By opening the R package in R studio and building the package you can run the study using the `execute` function. Theres is also an example CodeToRun.R script available in the extras folder of the package with extra instructions. +
+ + ![R package download functionality in ATLAS](atlasdownload1.webp) + +
+ +
+ +2)
+ +
You now have to review that you indeed want to run all these analyses (cartesian product of all the settings for each T and O combination).
+ +
+ + ![R package download functionality in ATLAS](atlasdownload2.webp) + +
+ +
+3) If you agree, you give the package a name, and download the package as a zipfile. +4) By opening the R package in R studio and building the package you can run the study using the `execute` function. Theres is also an example CodeToRun.R script available in the extras folder of the package with extra instructions. # Internal validation @@ -1012,39 +1027,60 @@ Once we execute the study, the runPlp() function returns the trained model and t You can interactively view the results by running: `viewPlp(runPlp=lrResults)`. This will generate a Shiny App in your browser in which you can view all performance measures created by the framework as shown in the figure below. -
![Summary of all the performance measures of the analyses](shinysummary.png)
- - Furthermore, many interactive plots are available in the Shiny App, for example the ROC curve in which you can move over the plot to see the threshold and the corresponding sensitivity and specificity values. -
![Example of the interactive ROC curve](shinyroc.png)
- - To generate and save all the evaluation plots to a folder run the following code: - +
+ +![Summary of all the performance measures of the analyses](shinysummary.webp) + +
+ +Furthermore, many interactive plots are available in the Shiny App, for example the ROC curve in which you can move over the plot to see the threshold and the corresponding sensitivity and specificity values. + +
+ +![Example of the interactive ROC curve](shinyroc.webp) + +
+ +To generate and save all the evaluation plots to a folder run the following code: + ```{r tidy=TRUE,eval=FALSE} plotPlp(lrResults, dirPath=getwd()) ``` + The plots are described in more detail in the next sections. \newpage + ## Discrimination -The Receiver Operating Characteristics (ROC) plot shows the sensitivity against 1-specificity on the test set. The plot illustrates how well the model is able to discriminate between the people with the outcome and those without. The dashed diagonal line is the performance of a model that randomly assigns predictions. The higher the area under the ROC plot the better the discrimination of the model. The plot is created by changing the probability threshold to assign the positive class. +The Receiver Operating Characteristics (ROC) plot shows the sensitivity against 1-specificity on the test set. The plot illustrates how well the model is able to discriminate between the people with the outcome and those without. The dashed diagonal line is the performance of a model that randomly assigns predictions. The higher the area under the ROC plot the better the discrimination of the model. The plot is created by changing the probability threshold to assign the positive class. -
![Receiver Operating Characteristic Plot](sparseRoc.png)
- - \newpage -## Calibration +
+ +![Receiver Operating Characteristic Plot](sparseRoc.webp) + +
+ +\newpage + +\## Calibration The calibration plot shows how close the predicted risk is to the observed risk. The diagonal dashed line thus indicates a perfectly calibrated model. The ten (or fewer) dots represent the mean predicted values for each quantile plotted against the observed fraction of people in that quantile who had the outcome (observed fraction). The straight black line is the linear regression using these 10 plotted quantile mean predicted vs observed fraction points. The straight vertical lines represented the 95% lower and upper confidence intervals of the slope of the fitted line. -
![Calibration Plot](sparseCalibration.png)
- +
+ +![Calibration Plot](sparseCalibration.webp) + +
+ \newpage + ## Smooth Calibration Similar to the traditional calibration shown above the Smooth Calibration plot shows the relationship between predicted and observed risk. the major difference is that the smooth fit allows for a more fine grained examination of this. Whereas the traditional plot will be heavily influenced by the areas with the highest density of data the smooth plot will provide the same information for this region as well as a more accurate interpretation of areas with lower density. the plot also contains information on the distribution of the outcomes relative to predicted risk. However, the increased information gain comes at a computational cost. It is recommended to use the traditional plot for examination and then to produce the smooth plot for final versions. To create the smooth calibarion plot you have to run the follow command: - + ```{r tidy=TRUE,eval=FALSE} plotSmoothCalibration(lrResults) ``` @@ -1053,76 +1089,115 @@ See the help function for more information, on how to set the smoothing method e The example below is from another study that better demonstrates the impact of using a smooth calibration plot. The default line fit would not highlight the miss-calibration at the lower predicted probability levels that well. -
![Smooth Calibration plot](smoothCalibration.jpeg)
- - \newpage -## Preference distribution +
+ +![Smooth Calibration plot](smoothCalibration.jpeg) + +
+ +\newpage + +\## Preference distribution The preference distribution plots are the preference score distributions corresponding to i) people in the test set with the outcome (red) and ii) people in the test set without the outcome (blue). -
![Preference Plot](preferencePDF.png)
- - \newpage -## Predicted probability distribution +
+ +![Preference Plot](preferencePDF.webp) + +
+ +\newpage + +\## Predicted probability distribution -The prediction distribution box plots are for the predicted risks of the people in the test set with the outcome (class 1: blue) and without the outcome (class 0: red). +The prediction distribution box plots are for the predicted risks of the people in the test set with the outcome (class 1: blue) and without the outcome (class 0: red). The box plots in the Figure show that the predicted probability of the outcome is indeed higher for those with the outcome but there is also overlap between the two distribution which lead to an imperfect discrimination. -
![Prediction Distribution Box Plot](predictionDistribution.png)
- - \newpage -## Test-Train similarity +
+
+![Prediction Distribution Box Plot](predictionDistribution.webp)
+
+
+ +\newpage + +\## Test-Train similarity The test-train similarity is assessed by plotting the mean covariate values in the train set against those in the test set for people with and without the outcome. The results for our example of look very promising since the mean values of the covariates are on the diagonal. -
![Similarity plots of train and test set](generalizability.png)
- - \newpage -## Variable scatter plot +
+ +![Similarity plots of train and test set](generalizability.webp) + +
+ +\newpage + +\## Variable scatter plot The variable scatter plot shows the mean covariate value for the people with the outcome against the mean covariate value for the people without the outcome. The color of the dots corresponds to the inclusion (green) or exclusion in the model (blue), respectively. It is highly recommended to use the Shiny App since this allows you to hoover over a covariate to show more details (name, value etc). -The plot shows that the mean of most of the covariates is higher for subjects with the outcome compared to those without. +The plot shows that the mean of most of the covariates is higher for subjects with the outcome compared to those without. -
![Variabel scatter Plot](variableScatterplot.png)
- - \newpage -## Precision recall +
+ +![Variabel scatter Plot](variableScatterplot.webp) + +
+ +\newpage + +\## Precision recall Precision (P) is defined as the number of true positives (Tp) over the number of true positives plus the number of false positives (Fp). ```{r tidy=TRUE,eval=FALSE} P <- Tp/(Tp+Fp) ``` + Recall (R) is defined as the number of true positives (Tp) over the number of true positives plus the number of false negatives (Fn). + ```{r tidy=TRUE,eval=FALSE} R <- Tp/(Tp + Fn) ``` + These quantities are also related to the (F1) score, which is defined as the harmonic mean of precision and recall. + ```{r tidy=TRUE,eval=FALSE} F1 <- 2*P*R/(P+R) ``` -Note that the precision can either decrease or increase if the threshold is lowered. Lowering the threshold of a classifier may increase the denominator, by increasing the number of results returned. If the threshold was previously set too high, the new results may all be true positives, which will increase precision. If the previous threshold was about right or too low, further lowering the threshold will introduce false positives, decreasing precision. + +Note that the precision can either decrease or increase if the threshold is lowered. Lowering the threshold of a classifier may increase the denominator, by increasing the number of results returned. If the threshold was previously set too high, the new results may all be true positives, which will increase precision. If the previous threshold was about right or too low, further lowering the threshold will introduce false positives, decreasing precision. For Recall the denominator does not depend on the classifier threshold (Tp+Fn is a constant). This means that lowering the classifier threshold may increase recall, by increasing the number of true positive results. It is also possible that lowering the threshold may leave recall unchanged, while the precision fluctuates. -
![Precision Recall Plot](precisionRecall.png)
- - \newpage -## Demographic summary +
+ +![Precision Recall Plot](precisionRecall.webp) + +
+ +\newpage + +\## Demographic summary This plot shows for females and males the expected and observed risk in different age groups together with a confidence area. The results show that our model is well calibrated across gender and age groups. -
![Demographic Summary Plot](demographicSummary.png)
- - - \newpage -# External validation +
+ +![Demographic Summary Plot](demographicSummary.webp) + +
+ +\newpage + +# External validation We recommend to always perform external validation, i.e. apply the final model on as much new datasets as feasible and evaluate its performance. @@ -1146,30 +1221,29 @@ externalValidateDbPlp( ``` -This will extract the new plpData from the specified schemas and cohort tables. It will then apply the same population settings and the trained plp model. Finally, it will evaluate the performance and return the standard output as `validation$performanceEvaluation` and it will also return the prediction on the population as `validation$prediction`. They can be inserted into the shiny app for viewing the model and validation by running: `viewPlp(runPlp=plpResult, validatePlp=validation )`. - +This will extract the new plpData from the specified schemas and cohort tables. It will then apply the same population settings and the trained plp model. Finally, it will evaluate the performance and return the standard output as `validation$performanceEvaluation` and it will also return the prediction on the population as `validation$prediction`. They can be inserted into the shiny app for viewing the model and validation by running: `viewPlp(runPlp=plpResult, validatePlp=validation )`. \newpage + # Other functionality The package has much more functionality than described in this vignette and contributions have been made by many persons in the OHDSI community. 
The table below provides an overview: - - Functionality | Description | Vignette ---------------- | --------------------------------------------- | --------------- -Builing Multiple Models | This vignette describes how you can run multiple models automatically | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingMultiplePredictiveModels.pdf) -Custom Models | This vignette describes how you can add your own custom algorithms in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomModels.pdf) -Custom Splitting Functions | This vignette describes how you can add your own custom training/validation/testing splitting functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSplitting.pdf) -Custom Sampling Functions | This vignette describes how you can add your own custom sampling functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSamples.pdf) -Custom Feature Engineering/Selection | This vignette describes how you can add your own custom feature engineering and selection functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomFeatureEngineering.pdf) -Ensemble models | This vignette describes how you can use the framework to build ensemble models, i.e combine multiple models in a super learner | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingEnsembleModels.pdf) -Learning curves | Learning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below. 
| [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/GeneratingLearningCurves.pdf) +| Functionality | Description | Vignette | +|-----------------|--------------------------------------|-----------------| +| Building Multiple Models | This vignette describes how you can run multiple models automatically | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingMultiplePredictiveModels.pdf) | +| Custom Models | This vignette describes how you can add your own custom algorithms in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomModels.pdf) | +| Custom Splitting Functions | This vignette describes how you can add your own custom training/validation/testing splitting functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSplitting.pdf) | +| Custom Sampling Functions | This vignette describes how you can add your own custom sampling functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSamples.pdf) | +| Custom Feature Engineering/Selection | This vignette describes how you can add your own custom feature engineering and selection functions in the framework | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomFeatureEngineering.pdf) | +| Ensemble models | This vignette describes how you can use the framework to build ensemble models, i.e. combine multiple models in a super learner | [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingEnsembleModels.pdf) | +| Learning curves | Learning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below. 
| [`Vignette`](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/GeneratingLearningCurves.pdf) | # Demos We have added several demos in the package that run on simulated data: - -``` {r eval=FALSE} + +```{r eval=FALSE} # Show all demos in our package: demo(package = "PatientLevelPrediction") @@ -1194,47 +1268,73 @@ citation("Cyclops") ``` **Please reference this paper if you use the PLP Package in your work:** - - [Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032) + +[Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032) This work is supported in part through the National Science Foundation grant IIS 1251151. \newpage -# Appendix 1: Study population settings details {-} +# Appendix 1: Study population settings details {.unnumbered} In the figures below the effect is shown of the removeSubjectsWithPriorOutcome, requireTimAtRisk, and includeAllOutcomes booleans on the final study population. We start with a Target Cohort with firstExposureOnly = false and we require a washout period = 1095. We then subset the target cohort based on additional constraints. The final study population in the Venn diagrams below are colored green. 1)
-
Require minimum time-at-risk for all person in the target cohort
-
![](popdef1.png)
-
- -2)
-
Require minumum time-at-risk for target cohort, except for persons with outcomes during time-at-risk.
-
![](popdef2.png)
-
- - \newpage -3)
-
Include all persons in the target cohort exclude persons with prior outcomes
-
![](popdef3.png)
-
- -4)
-
Require minimum time-at-risk for target cohort, except for persons with outcomes during time-at-risk, exclude persons with prior outcomes
-
![](popdef4.png)
-
- - \newpage -5)
-
Include all persons in target cohort exclude persons with prior outcomes
-
![](popdef5.png)
-
- -6)
-
Include all persons in target cohort
-
![](popdef6.png)
-
- - + +
Require minimum time-at-risk for all persons in the target cohort
+ +
![](popdef1.webp)
+ + + +2)
+ +
Require minimum time-at-risk for target cohort, except for persons with outcomes during time-at-risk.
+ +
![](popdef2.webp)
+ +
+ +```{=tex} +\newpage +3 +``` +) + +
+ +
Include all persons in the target cohort, exclude persons with prior outcomes
+ +
![](popdef3.webp)
+ +
+ +4)
+ +
Require minimum time-at-risk for target cohort, except for persons with outcomes during time-at-risk, exclude persons with prior outcomes
+ +
![](popdef4.webp)
+ +
+ +```{=tex} +\newpage +5 +``` +) + +
+ +
Include all persons in target cohort, exclude persons with prior outcomes
+ +
![](popdef5.webp)
+ +
+ +6)
+ +
Include all persons in target cohort
+ +
![](popdef6.webp)
+ +
diff --git a/vignettes/ClinicalModels.rmd b/vignettes/ClinicalModels.rmd new file mode 100644 index 000000000..3b6a5e5ae --- /dev/null +++ b/vignettes/ClinicalModels.rmd @@ -0,0 +1,46 @@ +--- +title: "Clinical Models" +author: "Jenna Reps, Peter R. Rijnbeek" +date: '`r Sys.Date()`' +header-includes: + - \usepackage{fancyhdr} + - \pagestyle{fancy} + - \fancyhead{} + - \fancyhead[CO,CE]{Clinical Models} + - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} + - \fancyfoot[LE,RO]{\thepage} + - \renewcommand{\headrulewidth}{0.4pt} + - \renewcommand{\footrulewidth}{0.4pt} +output: + pdf_document: + includes: + in_header: preamble.tex + number_sections: yes + toc: yes + word_document: + toc: yes + html_document: + number_sections: yes + toc: yes +--- + +```{=html} + +``` + +## Clinical models developed using the OHDSI PatientLevelPrediction framework + +| Title | Link | +|----------------------|-------| +| Using Machine Learning Applied to Real-World Healthcare Data for Predictive Analytics: An Applied Example in Bariatric Surgery | [Value in Health](https://www.sciencedirect.com/science/article/pii/S1098301519300737) | +| Development and validation of a prognostic model predicting symptomatic hemorrhagic transformation in acute ischemic stroke at scale in the OHDSI network | [PLoS One](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0226718) | +| Wisdom of the CROUD: development and validation of a patient-level prediction model for opioid use disorder using population-level claims data | [PLoS One](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0228632) | +| Developing predictive models to determine Patients in End-of-life Care in Administrative datasets | [Drug Safety](https://link.springer.com/article/10.1007/s40264-020-00906-7) | +| Predictors of diagnostic transition from major depressive disorder to bipolar disorder: a retrospective observational network study | 
[Translational psychiatry](https://www.nature.com/articles/s41398-021-01760-6) | +| Seek COVER: using a disease proxy to rapidly develop and validate a personalized risk calculator for COVID-19 outcomes in an international network | [BMC Medical Research Methodology](https://link.springer.com/article/10.1186/s12874-022-01505-z) | +| 90-Day all-cause mortality can be predicted following a total knee replacement: an international, network study to develop and validate a prediction model | [Knee Surgery, Sports Traumatology, Arthroscopy](https://link.springer.com/article/10.1007/s00167-021-06799-y) | +| Machine learning and real-world data to predict lung cancer risk in routine care | [Cancer Epidemiology, Biomarkers & Prevention](https://aacrjournals.org/cebp/article-abstract/32/3/337/718495) | +| Development and validation of a patient-level model to predict dementia across a network of observational databases | [BMC medicine](https://link.springer.com/article/10.1186/s12916-024-03530-9) | \ No newline at end of file diff --git a/vignettes/ConstrainedPredictors.Rmd b/vignettes/ConstrainedPredictors.Rmd new file mode 100644 index 000000000..226bbca06 --- /dev/null +++ b/vignettes/ConstrainedPredictors.Rmd @@ -0,0 +1,125 @@ +--- +title: "Constrained predictors" +author: "Jenna Reps" +date: '`r Sys.Date()`' +header-includes: + - \usepackage{fancyhdr} + - \pagestyle{fancy} + - \fancyhead{} + - \fancyhead[CO,CE]{Constrained Predictors} + - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} + - \fancyfoot[LE,RO]{\thepage} + - \renewcommand{\headrulewidth}{0.4pt} + - \renewcommand{\footrulewidth}{0.4pt} +output: + pdf_document: + includes: + in_header: preamble.tex + number_sections: yes + toc: yes + word_document: + toc: yes + html_document: + number_sections: yes + toc: yes +--- + +```{=html} + +``` +## Constrained Predictors + +### How to use the PhenotypeLibrary R package + +Here we provide a set of phenotypes that 
can be used as predictors in prediction models or best practice research. + +These phenotypes can be extracted from the PhenotypeLibrary R package. To install the R package run: + +```{r, echo = TRUE, message = FALSE, warning = FALSE, tidy = FALSE, eval=FALSE} +remotes::install_github('ohdsi/PhenotypeLibrary') +``` + +To extract the cohort definition for Alcoholism with an id of 1165, just run: + +```{r echo = TRUE, message = FALSE, warning = FALSE, tidy = FALSE, eval = FALSE} +PhenotypeLibrary::getPlCohortDefinitionSet(1165) +``` + +in general you can extract all the cohorts by running: + +```{r echo = TRUE, message = FALSE, warning = FALSE, tidy = FALSE, eval = FALSE} +phenotypeDefinitions <- PhenotypeLibrary::getPlCohortDefinitionSet(1152:1215) +``` + +### The full set of predictor phenotypes + +| Phenotype Name | Disorder classification | OHDSI Phenotype library ID | +|--------------------------------|--------------------|-------------------| +| Alcoholism | Behavioral | 1165 | +| Smoking | Behavioral | 1166 | +| Anemia | Blood | 1188 | +| Osteoarthritis | Bone | 1184 | +| Osteoporosis | Bone | 1185 | +| Cancer | Cancer | 1215 | +| Atrial fibrillation | Cardiovascular | 1160 | +| Congestive heart failure | Cardiovascular | 1154 | +| Coronary artery disease | Cardiovascular | 1162 | +| Heart valve disorder | Cardiovascular | 1172 | +| Hyperlipidemia | Cardiovascular | 1170 | +| Hypertension | Cardiovascular | 1198 | +| Angina | Cardiovascular | 1159 | +| Skin Ulcer | Debility | 1168 | +| Diabetes type 1 | Endocrine | 1193 | +| Diabetes type 2 | Endocrine | 1194 | +| Hypothyroidism | Endocrine | 1171 | +| Obesity | Endocrine | 1179 | +| Gastroesophageal reflux disease (GERD) | GI | 1178 | +| Gastrointestinal (GI) bleed | GI | 1197 | +| Inflammatory bowel disorder (IBD) | GI/Rheumatology | 1180 | +| Hormonal contraceptives | Gynecologic | 1190 | +| Antibiotics Aminoglycosides | Infection | 1201 | +| Antibiotics Carbapenems | Infection | 1202 | +| Antibiotics 
Cephalosporins | Infection | 1203 | +| Antibiotics Fluoroquinolones | Infection | 1204 | +| Antibiotics Glycopeptides and lipoglycopeptides | Infection | 1205 | +| Antibiotics Macrolides | Infection | 1206 | +| Antibiotics Monobactams | Infection | 1207 | +| Antibiotics Oxazolidinones | Infection | 1208 | +| Antibiotics Penicillins | Infection | 1209 | +| Antibiotics Polypeptides | Infection | 1210 | +| Antibiotics Rifamycins | Infection | 1211 | +| Antibiotics Sulfonamides | Infection | 1212 | +| Antibiotics Streptogramins | Infection | 1213 | +| Antibiotics Tetracyclines | Infection | 1214 | +| Pneumonia | Infection/Respiratory | 1199 | +| Sepsis | Infection | 1176 | +| Urinary tract infection (UTI) | Infection | 1186 | +| Hepatitis | Liver | 1169 | +| Anxiety | Mood | 1189 | +| Depression (MDD) | Mood | 1161 | +| Psychotic disorder | Mood | 1175 | +| Antiepileptics (pain) | Neurology/Pain | 1183 | +| Seizure | Neurology | 1153 | +| Hemorrhagic stroke | Neurology/Vascular | 1156 | +| Non-hemorrhagic stroke | Neurology/Vascular | 1155 | +| Acetaminophen prescription | Pain/Infection | 1187 | +| Low back pain | Pain | 1173 | +| Neuropathy | Pain/Neurology | 1174 | +| Opioids | Pain | 1182 | +| Acute kidney injury | Kidney | 1163 | +| Chronic kidney disease | Kidney | 1191 | +| Asthma | Respiratory | 1164 | +| Chronic obstructive pulmonary disorder (COPD) | Respiratory | 1192 | +| Dyspnea | Respiratory | 1195 | +| Respiratory failure | Respiratory | 1177 | +| Sleep apnea | Respiratory | 1167 | +| Rheumatoid arthritis | Rheumatology | 1200 | +| Steroids | Rheumatology/Pain/Pulmonary | 1181 | +| Peripheral vascular disease | Vascular | 1157 | +| Aspirin | Vascular | 1158 | +| Deep vein thrombosis (DVT) | Vascular | 1152 | +| Edema | Vascular | 1196 | +| Inpatient visit | NA | NA | diff --git a/vignettes/CovCNN.png b/vignettes/CovCNN.png deleted file mode 100644 index 82dd2832f..000000000 Binary files a/vignettes/CovCNN.png and /dev/null differ diff --git 
a/vignettes/CreatingLearningCurves.Rmd b/vignettes/CreatingLearningCurves.Rmd index 74a9e36d5..0db8d4a79 100644 --- a/vignettes/CreatingLearningCurves.Rmd +++ b/vignettes/CreatingLearningCurves.Rmd @@ -25,12 +25,13 @@ output: word_document: toc: yes --- + +```{=html} - - +``` ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` @@ -47,7 +48,7 @@ if (file.exists(vignetteDataFolder)){ # Introduction -This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package to create learning curves. This vignette assumes you have read and are comfortable with building patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingPredictiveModels.pdf). +This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package to create learning curves. This vignette assumes you have read and are comfortable with building patient level prediction models as described in the [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingPredictiveModels.pdf). Prediction models will show overly-optimistic performance when predicting on the same data as used for training. Therefore, best-practice is to partition our data into a training set and testing set. We then train our prediction model on the training set portion and asses its ability to generalize to unseen data by measuring its performance on the testing set. @@ -57,9 +58,7 @@ Learning curves assess the effect of training set size on model performance by t Figure 1, shows an example of learning curve plot in which the vertical axis represents the model performance and the horizontal axis the training set size. 
If training set size is small, the performance on the training set is high, because a model can often be fitted well to a limited number of training examples. At the same time, the performance on the testing set will be poor, because the model trained on such a limited number of training examples will not generalize well to unseen data in the testing set. As the training set size increases, the performance of the model on the training set will decrease. It becomes more difficult for the model to find a good fit through all the training examples. Also, the model will be trained on a more representative portion of training examples, making it generalize better to unseen data. This can be observed by the increasin testing set performance. -The learning curve can help us in diagnosing bias and variance problems with our classifier which will provide guidance on how to further improve our model. We can observe high variance (overfitting) in a prediction model if it performs well on the training set, but poorly on the testing set (Figure 2). Adding additional data is a common approach to counteract high variance. From the learning curve it becomes apparent, that adding additional data may improve performance on the testing set a little further, as the learning curve has not yet plateaued and, thus, the model is not saturated yet. Therefore, adding more data will decrease the gap between training set and testing set, which is the main indicator for a high variance problem. - - +The learning curve can help us in diagnosing bias and variance problems with our classifier which will provide guidance on how to further improve our model. We can observe high variance (overfitting) in a prediction model if it performs well on the training set, but poorly on the testing set (Figure 2). Adding additional data is a common approach to counteract high variance. 
From the learning curve it becomes apparent, that adding additional data may improve performance on the testing set a little further, as the learning curve has not yet plateaued and, thus, the model is not saturated yet. Therefore, adding more data will decrease the gap between training set and testing set, which is the main indicator for a high variance problem. ![Prediction model suffering from high variance.](learningCurveVariance.png) @@ -107,7 +106,7 @@ modelSettings <- setLassoLogisticRegression() ``` -Specify the split settings and a sequence of training set fractions (these over ride the splitSetting trainFraction). Alternatively, instead of `trainFractions`, you can provide a sequence of training events (`trainEvents`) instead of the training set fractions. This is recommended, because our research has shown that number of events is the important determinant of model performance. Make sure that your training set contains the number of events specified. +Specify the split settings and a sequence of training set fractions (these over ride the splitSetting trainFraction). Alternatively, instead of `trainFractions`, you can provide a sequence of training events (`trainEvents`) instead of the training set fractions. This is recommended, because our research has shown that number of events is the important determinant of model performance. Make sure that your training set contains the number of events specified. ```{r eval = FALSE} @@ -123,7 +122,6 @@ trainFractions <- seq(0.1, 0.8, 0.1) # Create eight training set fractions # trainEvents <- seq(100, 5000, 100) ``` - Create the learning curve object. ```{r eval=FALSE} @@ -170,9 +168,9 @@ plotLearningCurve( ![Learning curve plot.](learningCurvePlot.png) - # Parallel processing -The learning curve object can be created in parallel, which can reduce computation time significantly. Whether to run the code in parallel or not is specified using the `parallel` input. 
Currently this functionality is only available for LASSO logistic regression and gradient boosting machines. Depending on the number of parallel workers it may require a significant amount of memory. We advise to use the parallelized learning curve function for parameter search and exploratory data analysis. + +The learning curve object can be created in parallel, which can reduce computation time significantly. Whether to run the code in parallel or not is specified using the `parallel` input. Currently this functionality is only available for LASSO logistic regression and gradient boosting machines. Depending on the number of parallel workers it may require a significant amount of memory. We advise to use the parallelized learning curve function for parameter search and exploratory data analysis. When running in parrallel, R will find the number of available processing cores automatically and register the required parallel backend. Alternatively, you can provide the number of cores you wish to use via the `cores` input. @@ -180,7 +178,7 @@ When running in parrallel, R will find the number of available processing cores We have added a demo of the learningcurve: -``` {r eval=FALSE} +```{r eval=FALSE} # Show all demos in our package: demo(package = "PatientLevelPrediction") @@ -192,8 +190,7 @@ Do note that running this demo can take a considerable amount of time (15 min on # Publication -A publication titled 'How little data do we need for patient-level prediction?' uses the learning curve functionality in this package and can be accessed as preprint in the arXiv archives at -[https://arxiv.org/abs/2008.07361](https://arxiv.org/abs/2008.07361). +A publication titled 'How little data do we need for patient-level prediction?' uses the learning curve functionality in this package and can be accessed as preprint in the arXiv archives at . 
# Acknowledgments diff --git a/vignettes/CreatingNetworkStudies.Rmd b/vignettes/CreatingNetworkStudies.Rmd index 26204f34f..f35961d35 100644 --- a/vignettes/CreatingNetworkStudies.Rmd +++ b/vignettes/CreatingNetworkStudies.Rmd @@ -14,17 +14,19 @@ output: number_sections: yes toc: yes --- + +```{=html} - +``` ```{r echo=FALSE,message=FALSE,warning=FALSE,eval=TRUE} library(PatientLevelPrediction) vignetteDataFolder <- "s:/temp/plpVignette" # Load all needed data if it exists on this computer: if (file.exists(vignetteDataFolder)){ - plpModel <- loadPlpModel(vignetteDataFolder,'model') + plpModel <- loadPlpModel(file.path(vignetteDataFolder,'model')) lrResults <- loadPlpModel(file.path(vignetteDataFolder,'results')) } ``` @@ -32,49 +34,50 @@ if (file.exists(vignetteDataFolder)){ ```{r, echo = FALSE, message = FALSE, warning = FALSE} library(PatientLevelPrediction) ``` + \newpage + # Introduction -The OHDSI Patient Level Prediction (PLP) package provides the framework to implement prediction models at scale. This can range from developing a large number of models across sites (methodology and study design insight) to extensive external validation of existing models in the OHDSI PLP framework (model insight). This vignette describes how you can use the `PatientLevelPrediction` package to create a network study package. + +The OHDSI Patient Level Prediction (PLP) package provides the framework to implement prediction models at scale. This can range from developing a large number of models across sites (methodology and study design insight) to extensive external validation of existing models in the OHDSI PLP framework (model insight). This vignette describes how you can use the `PatientLevelPrediction` package to create a network study package. 
# Useful publication -The open access publication [A standardized analytics pipeline for reliable and rapid development and validation of prediction models using observational health data](https://pubmed.ncbi.nlm.nih.gov/34560604/) details the process used to develop and validate prediction models using the OHDSI prediction framework and tools. This publication describes each of the steps and then demonstrates these by focusing on predicting death in those who have covid-19. + +The open access publication [A standardized analytics pipeline for reliable and rapid development and validation of prediction models using observational health data](https://pubmed.ncbi.nlm.nih.gov/34560604/) details the process used to develop and validate prediction models using the OHDSI prediction framework and tools. This publication describes each of the steps and then demonstrates these by focusing on predicting death in those who have covid-19. # Main steps for running a network study + ## Step 1 – developing the study - * Design the study: target/outcome cohort logic, concept sets for medical definitions, settings for developing new model or validation of adding existing models to framework. Suggestion: look in literature for validated definitions. - * Write a protocol that motivates the study and provides full details (sufficient for people to replicate the study in the future). - * Write an R package for implementing the study across diverse computational environments [see guidance below for structure of package and use the skeleton github package here: https://github.com/OHDSI/SkeletonPredictionStudy ] +- Design the study: target/outcome cohort logic, concept sets for medical definitions, settings for developing new model or validation of adding existing models to framework. Suggestion: look in literature for validated definitions. +- Write a protocol that motivates the study and provides full details (sufficient for people to replicate the study in the future). 
+- Write an R package for implementing the study across diverse computational environments [see guidance below for structure of package and use the skeleton github package here: ] ## Step 2 – implementing the study part 1 - * Get contributors to install the package and dependencies. Ensure the package is installed correctly for each contributor by asking them to run the checkInstall functions (as specified in the InstallationGuide). - * Get contributors to run the createCohort function to inspect the target/outcome definitions. If the definitions are not suitable for a site, go back to step 1 and revise the cohort definitions. +- Get contributors to install the package and dependencies. Ensure the package is installed correctly for each contributor by asking them to run the checkInstall functions (as specified in the InstallationGuide). +- Get contributors to run the createCohort function to inspect the target/outcome definitions. If the definitions are not suitable for a site, go back to step 1 and revise the cohort definitions. ## Step 3 – implementing the study part 2 (make sure the package is functioning as planned and the definitions are valid across sites) - * Get contributors to run the main.R with the settings configured to their environment - * Get the contributors to submit the results +- Get contributors to run the main.R with the settings configured to their environment +- Get the contributors to submit the results ## Step 4 – Publication -The study creator has the first option to be first author, if he/she does not wish to be first author then he/she can pick the most suitable person from the contributors. All contributors will be listed as authors on the paper. The last author will be the person who lead/managed the study, if this was the first author then the first author can pick the most suitable last author. All authors between the first and last author will be alphabetical by last name. 
- +The study creator has the first option to be first author, if he/she does not wish to be first author then he/she can pick the most suitable person from the contributors. All contributors will be listed as authors on the paper. The last author will be the person who lead/managed the study, if this was the first author then the first author can pick the most suitable last author. All authors between the first and last author will be alphabetical by last name. # Package Skeleton - File Structure - * DESCRIPTION: This file describes the R package and the dependencies - * NAMESPACE: This file is created automatically by Roxygen - * Readme.md: This file should provide the step by step guidance on implementing the package - * R - - helpers.r: all the custom functions used by the package should be in this file (e.g., checkInstall) - - main.r: this file will call the functions in helpers.r to execute the full study - - submit.r: this file will be called at the end to submit the compressed folder to the study creator/manager. - * Man: this folder will contain the documentation for the functions in helpers.r (this should be automatically generated by roxygen) - * Inst - - sql/sql_sever - * targetCohort: the target cohort parameterised sql code - * outcomeCohort: the outcome cohort parameterised sql code - - plp_models: place any PLP models here - * Extras - +- DESCRIPTION: This file describes the R package and the dependencies +- NAMESPACE: This file is created automatically by Roxygen +- Readme.md: This file should provide the step by step guidance on implementing the package +- R +- helpers.r: all the custom functions used by the package should be in this file (e.g., checkInstall) +- main.r: this file will call the functions in helpers.r to execute the full study +- submit.r: this file will be called at the end to submit the compressed folder to the study creator/manager. 
+- Man: this folder will contain the documentation for the functions in helpers.r (this should be automatically generated by roxygen) +- Inst +- sql/sql_sever \* targetCohort: the target cohort parameterised sql code \* outcomeCohort: the outcome cohort parameterised sql code +- plp_models: place any PLP models here +- Extras diff --git a/vignettes/CreatingShinyApp.Rmd b/vignettes/CreatingShinyApp.Rmd deleted file mode 100644 index 59b1e099d..000000000 --- a/vignettes/CreatingShinyApp.Rmd +++ /dev/null @@ -1,298 +0,0 @@ ---- -title: "Creating Shiny App" -author: "Jenna Reps" -date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[CO,CE]{Installation Guide} - - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[LE,RO]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - word_document: - toc: yes - html_document: - number_sections: yes - toc: yes ---- - - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -# Introduction -In this vignette we will show with example code how to create a shiny app and add the shiny app online for other researcher around the whole to explore. - -There are two ways to create the shiny app: -1) Using the atlas R generated prediction package -2) Manually using the PatientLevelPrediction functions in a script - -We assume you have experience with using the OHDSI PatientLevelPrediction package to develop and externally validate prediction models using data in the OMOP CDM. If you do not have experience with this then please first read our general vignette [`BuildingPredictiveModels` vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingPredictiveModels.pdf). 
- -# Atlas Development Shiny App - -## Step 1: Run the model development package to get results -To create a shiny app project via the Atlas auto-generated prediction R package you named 'myPackage' you need to run the execute function: -```{r eval = FALSE} -library(myPackage) -myPackage::execute(connectionDetails = connectionDetails, - cdmDatabaseSchema = 'myDatabaseSchema.dbo', - cdmDatabaseName = 'MyDatabase', - cohortDatabaseSchema = 'myDatabaseSchema.ohdsi_results', - cohortTable = 'cohort', - outputFolder = 'C:/myResults', - createProtocol = F, - createCohorts = F, - runAnalyses = T, - createResultsDoc = F, - packageResults = F, - createValidationPackage = F, - minCellCount= 5, - createShiny = F, - createJournalDocument = F, - analysisIdDocument = 1) -``` - -This will extract data based on the settings you supplied in the Atlas prediction design from cohort tables already generated in your CDM database schema. The PatientLevelPrediction framework will then run and develop/evaluate models saving the results to the location specified by outputFolder (e.g., 'C:/myResults'). - -## Step 2: Create the shiny app -To create a shiny app project with these results you can then simply run: -```{r eval = FALSE} -myPackage::execute(connectionDetails = connectionDetails, - cdmDatabaseSchema = 'myDatabaseSchema.dbo', - cdmDatabaseName = 'MyDatabase', - cohortDatabaseSchema = 'myDatabaseSchema.ohdsi_results', - cohortTable = 'cohort', - outputFolder = 'C:/myResults', - minCellCount= 5, - createShiny = T) -``` - -making sure the outputFolder is the same location used when you ran the analysis. This code populates a shiny app project with the results but removes sensitive date such as connection settings, the cdmDatabaseSchema setting, the predicton matrix and any sensitive counts less than 'minCellCount' from the covariate summary and performance evalaution. 
- -The shiny app project populated with the model development results can then be found at '[outputFolder]/ShinyApp' e.g., 'C:/myResults/ShinyApp'. - - -### Testing (Optional but recommended) -You can test the app by opening the shiny project within the [outputFolder]/ShinyApp' folder, double click on the file named 'PLPViewer.Rproj'. This will open an R studio session with the shiny app project loaded. Now load the 'ui.R' files within this R studio session and you will see a green arrow with the words 'Run App' at the top right of the script. Click on this and the shiny app with open. Note: You may need to install some R pacakge dependancies for the shiny app to work. - -## Step 3: Sharing the shiny app -Once you are happy with your app, you can publish it onto https://data.ohdsi.org by adding the folder 'ShinyApp' to the OHDSI githib ShinyDeploy (https://github.com/OHDSI/ShinyDeploy/). Continuing the example, we would copy the folder '[outputFolder]/ShinyApp' and paste it to the local github clone of ShinyDeploy. We recommend renaming the folder from 'ShinyApp' to a name that describes your prediction, e.g., 'StrokeinAF'. Then commit the changes and make a pull request to ShinyDeploy. Once accepted your shiny app will be viewable at 'https://data.ohdsi.org'. If you commited the folder named 'StrokeInAF' then the shiny app will be hosted at 'https://data.ohdsi.org/StrokeInAF'. 
- - -# Atlas External Validation - -To include external validation results you can use the Atlas generated R study package to create the external validation package: -```{r eval = FALSE} -myPackage::execute(connectionDetails = connectionDetails, - cdmDatabaseSchema = 'myDatabaseSchema.dbo', - cdmDatabaseName = 'MyDatabase', - cohortDatabaseSchema = 'myDatabaseSchema.ohdsi_results', - cohortTable = 'cohort', - outputFolder = 'C:/myResults', - createValidationPackage = T) -``` - -This will create a new R package inside the 'outputFolder' location with the word 'Validation' appended the name of your development package. For example, if your 'outputFolder' was 'C:/myResults' and your development package was named 'myPackage' then the validation package will be found at: 'C:/myResults/myPackageValidation'. When running the valdiation package make sure to set the 'outputFolder' to the Validation folder within your model development outputFolder location: - -```{r eval = FALSE} -myPackageValidation::execute(connectionDetails = connectionDetails, - databaseName = databaseName, - cdmDatabaseSchema = cdmDatabaseSchema, - cohortDatabaseSchema = cohortDatabaseSchema, - oracleTempSchema = oracleTempSchema, - cohortTable = cohortTable, - outputFolder = 'C:/myResults/Validation', - createCohorts = T, - runValidation = T, - packageResults = F, - minCellCount = 5, - sampleSize = NULL) - -``` - -Now you can rerun Steps 2-3 to populate the shiny app project that will also include the validation results (as long as the validation results are in the Validation folder found in the Step 1 outputFolder location e.g., in 'C:/myResults/Validation'). 
- - -# Combining multiple atlas results into one shiny app: -The code below can be used to combine multiple Atlas packages' results into one shiny app: - -```{r eval = FALSE} -populateMultipleShinyApp <- function(shinyDirectory, - resultDirectory, - minCellCount = 10, - databaseName = 'sharable name of development data'){ - - #check inputs - if(missing(shinyDirectory)){ - shinyDirectory <- system.file("shiny", "PLPViewer", package = "SkeletonPredictionStudy") - } - if(missing(resultDirectory)){ - stop('Need to enter the resultDirectory') - } - - - for(i in 1:length(resultDirectory)){ - if(!dir.exists(resultDirectory[i])){ - stop(paste('resultDirectory ',i,' does not exist')) - } - } - - outputDirectory <- file.path(shinyDirectory,'data') - - # create the shiny data folder - if(!dir.exists(outputDirectory)){ - dir.create(outputDirectory, recursive = T) - } - - - # need to edit settings ... - files <- c() - for(i in 1:length(resultDirectory)){ - # copy the settings csv - file <- utils::read.csv(file.path(resultDirectory[i],'settings.csv')) - file$analysisId <- 1000*as.double(file$analysisId)+i - files <- rbind(files, file) - } - utils::write.csv(files, file.path(outputDirectory,'settings.csv'), row.names = F) - - for(i in 1:length(resultDirectory)){ - # copy each analysis as a rds file and copy the log - files <- dir(resultDirectory[i], full.names = F) - files <- files[grep('Analysis', files)] - for(file in files){ - - if(!dir.exists(file.path(outputDirectory,paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i)))){ - dir.create(file.path(outputDirectory,paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i))) - } - - if(dir.exists(file.path(resultDirectory[i],file, 'plpResult'))){ - res <- PatientLevelPrediction::loadPlpResult(file.path(resultDirectory[i],file, 'plpResult')) - res <- PatientLevelPrediction::transportPlp(res, n= minCellCount, - save = F, dataName = databaseName[i]) - - res$covariateSummary <- 
res$covariateSummary[res$covariateSummary$covariateValue!=0,] - covSet <- res$model$metaData$call$covariateSettings - res$model$metaData <- NULL - res$model$metaData$call$covariateSettings <- covSet - res$model$predict <- NULL - if(!is.null(res$performanceEvaluation$evaluationStatistics)){ - res$performanceEvaluation$evaluationStatistics[,1] <- paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i) - } else{ - writeLines(paste0(resultDirectory[i],file, '-ev')) - } - if(!is.null(res$performanceEvaluation$thresholdSummary)){ - res$performanceEvaluation$thresholdSummary[,1] <- paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i) - }else{ - writeLines(paste0(resultDirectory[i],file, '-thres')) - } - if(!is.null(res$performanceEvaluation$demographicSummary)){ - res$performanceEvaluation$demographicSummary[,1] <- paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i) - } else{ - writeLines(paste0(resultDirectory[i],file, '-dem')) - } - if(!is.null(res$performanceEvaluation$calibrationSummary)){ - res$performanceEvaluation$calibrationSummary[,1] <- paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i) - }else{ - writeLines(paste0(resultDirectory[i],file, '-cal')) - } - if(!is.null(res$performanceEvaluation$predictionDistribution)){ - res$performanceEvaluation$predictionDistribution[,1] <- paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i) - }else{ - writeLines(paste0(resultDirectory[i],file, '-dist')) - } - saveRDS(res, file.path(outputDirectory,paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i), 'plpResult.rds')) - } - if(file.exists(file.path(resultDirectory[i],file, 'plpLog.txt'))){ - file.copy(from = file.path(resultDirectory[i],file, 'plpLog.txt'), - to = file.path(outputDirectory,paste0('Analysis_',1000*as.double(gsub('Analysis_','',file))+i), 'plpLog.txt')) - } - } - } - - - - for(i in 1:length(resultDirectory)){ - # copy any validation results - 
if(dir.exists(file.path(resultDirectory[i],'Validation'))){ - valFolders <- dir(file.path(resultDirectory[i],'Validation'), full.names = F) - - if(length(valFolders)>0){ - # move each of the validation rds - for(valFolder in valFolders){ - - # get the analysisIds - valSubfolders <- dir(file.path(resultDirectory[i],'Validation',valFolder), full.names = F) - if(length(valSubfolders)!=0){ - for(valSubfolder in valSubfolders ){ - valSubfolderUpdate <- paste0('Analysis_', as.double(gsub('Analysis_','', valSubfolder))*1000+i) - valOut <- file.path(valFolder,valSubfolderUpdate) - valOutOld <- file.path(valFolder,valSubfolder) - if(!dir.exists(file.path(outputDirectory,'Validation',valOut))){ - dir.create(file.path(outputDirectory,'Validation',valOut), recursive = T) - } - - - if(file.exists(file.path(resultDirectory[i],'Validation',valOutOld, 'validationResult.rds'))){ - res <- readRDS(file.path(resultDirectory[i],'Validation',valOutOld, 'validationResult.rds')) - res <- PatientLevelPrediction::transportPlp(res, n= minCellCount, - save = F, dataName = databaseName[i]) - res$covariateSummary <- res$covariateSummary[res$covariateSummary$covariateValue!=0,] - saveRDS(res, file.path(outputDirectory,'Validation',valOut, 'validationResult.rds')) - } - } - } - - } - - } - - } - } - - return(outputDirectory) - -} -``` - -## Example code to combine multiple results -The following code will combine the results found in 'C:/myResults', 'C:/myResults2' and 'C:/myResults3' into the shiny project at 'C:/R/library/myPackage/shiny/PLPViewer': -```{r eval = FALSE} -populateMultipleShinyApp(shinyDirectory = 'C:/R/library/myPackage/shiny/PLPViewer', - resultDirectory = c('C:/myResults', - 'C:/myResults2', - 'C:/myResults3'), - minCellCount = 0, - databaseName = c('database1','database2','database3')) - -``` - - -# Manual App Creation -[instructions coming soon] - - - -# Acknowledgments - -Considerable work has been dedicated to provide the `PatientLevelPrediction` package. 
- -```{r tidy=TRUE,eval=TRUE} -citation("PatientLevelPrediction") -``` - -**Please reference this paper if you use the PLP Package in your work:** - -[Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032) diff --git a/vignettes/Figure1.png b/vignettes/Figure1.png deleted file mode 100644 index 878a509e2..000000000 Binary files a/vignettes/Figure1.png and /dev/null differ diff --git a/vignettes/Figure1.webp b/vignettes/Figure1.webp new file mode 100644 index 000000000..42ad71d7f Binary files /dev/null and b/vignettes/Figure1.webp differ diff --git a/vignettes/InstallationGuide.Rmd b/vignettes/InstallationGuide.Rmd index 2ce3e4972..91788b711 100644 --- a/vignettes/InstallationGuide.Rmd +++ b/vignettes/InstallationGuide.Rmd @@ -1,6 +1,6 @@ --- title: "Patient-Level Prediction Installation Guide" -author: "Jenna Reps, Peter R. Rijnbeek" +author: "Jenna Reps, Peter R. Rijnbeek, Egill Fridgeirsson" date: '`r Sys.Date()`' header-includes: - \usepackage{fancyhdr} @@ -23,54 +23,59 @@ output: number_sections: yes toc: yes --- + +```{=html} - +``` # Introduction -This vignette describes how you need to install the Observational Health Data Sciencs and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package under Windows, Mac, and Linux. + +This vignette describes how you need to install the Observational Health Data Science and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package under Windows, Mac, and Linux. 
# Software Prerequisites ## Windows Users + Under Windows the OHDSI Patient Level Prediction (PLP) package requires installing: -* R (https://cran.cnr.berkeley.edu/ ) \- (R >= 3.3.0, but latest is recommended) -* Rstudio (https://www.rstudio.com/ ) -* Java (http://www.java.com ) -* RTools (https://cran.r-project.org/bin/windows/Rtools/) +- R ( ) - (R \>= 4.0.0, but latest is recommended) +- Rstudio ( ) +- Java ( ) +- RTools () ## Mac/Linux Users -Under Mac and Linux the OHDSI Patient Level Prediction (PLP) package requires installing: - -* R (https://cran.cnr.berkeley.edu/ ) \- (R >= 3.3.0, but latest is recommended) -* Rstudio (https://www.rstudio.com/ ) -* Java (http://www.java.com ) -* Xcode command line tools(run in terminal: xcode-select --install) [MAC USERS ONLY] +Under Mac and Linux the OHDSI Patient Level Prediction (PLP) package requires installing: +- R ( ) - (R \>= 4.0.0, but latest is recommended) +- Rstudio ( ) +- Java ( ) +- Xcode command line tools(run in terminal: xcode-select --install) [MAC USERS ONLY] # Installing the Package + The preferred way to install the package is by using `remotes`, which will automatically install the latest release and all the latest dependencies. -If you do not want the official release you could install the bleading edge version of the package (latest develop branch). +If you do not want the official release you could install the bleeding edge version of the package (latest develop branch). Note that the latest develop branch could contain bugs, please report them to us if you experience problems. ## Installing PatientLevelPrediction using remotes + To install using `remotes` run: + ```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} install.packages("remotes") -remotes::install_github("OHDSI/FeatureExtraction") remotes::install_github("OHDSI/PatientLevelPrediction") ``` -When installing make sure to close any other Rstudio sessions that are using `PatientLevelPrediction` or any dependency. 
Keeping Rstudio sessions open can cause locks that prevent the package installing. +When installing make sure to close any other Rstudio sessions that are using `PatientLevelPrediction` or any dependency. Keeping Rstudio sessions open can cause locks that prevent the package installing. # Creating Python Reticulate Environment -Many of the classifiers in the `PatientLevelPrediction` use a Python back end. To set up a python environment run: +Many of the classifiers in the `PatientLevelPrediction` use a Python backend. To set up a python environment run: ```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} library(PatientLevelPrediction) @@ -79,60 +84,40 @@ configurePython(envname='r-reticulate', envtype='conda') ``` -Some of the less frequently used classifiers are not installed during this set-up to add them run: +# Installation issues -For GBM survival: -```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} -reticulate::conda_install(envname='r-reticulate', packages = c('scikit-survival'), forge = TRUE, pip = FALSE, pip_ignore_installed = TRUE, conda = "auto", channel = 'sebp') +Installation issues need to be posted in our issue tracker: -``` +The list below provides solutions for some common issues: -# Testing installation -To test whether the package is installed correctly, using the test script in '/extras', run: -```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} +1. If you have an error when trying to install a package in R saying **'Dependancy X not available ...'** then this can sometimes be fixed by running `install.packages('X')` and then once that completes trying to reinstall the package that had the error. 
-# load the checkPlpInstallation function -library(devtools) -source_url('https://raw.github.com/OHDSI/PatientLevelPrediction/issue242/extras/checkPlpInstallation.R') - -# set up the database connection details -library(DatabaseConnector) -connectionDetails <- createConnectionDetails( - dbms = 'sql_server', - user = 'username', - password = 'hidden', - server = 'your server', - port = 'your port' - ) - -# run the test -checkPlpInstallation( - connectionDetails = connectionDetails, - python = T - ) -``` +2. I have found that using the github `remotes` to install packages can be impacted if you have **multiple R sessions** open as one session with a library open can cause the library to be locked and this can prevent an install of a package that depends on that library. -To test the installation (excluding python) run: -```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} +## Common issues + +### python environment Mac/linux users: -checkPlpInstallation( - connectionDetails = connectionDetails, - python = F - ) +to make sure R uses the r-reticulate python environment you may need to edit your .Rprofile with the location of the python binary for the PLP environment. Edit the .Rprofile by running: + +```{r eval=FALSE} +usethis::edit_r_profile() ``` -The check can take a while to run since it will build the following models in sequence on simulated data: Logistic Regression, RandomForest, MLP, AdaBoost, Decision Tree, Naive Bayes, KNN, Gradient Boosting. -Moreover, it will test the database connection. +and add -# Installation issues -Installation issues need to be posted in our issue tracker: -http://github.com/OHDSI/PatientLevelPrediction/issues +```{r eval=FALSE} +Sys.setenv(PATH = paste("your python bin location", Sys.getenv("PATH"), sep=":")) +``` -The list below provides solutions for some common issues: +to the file then save. Where your python bin location is the location returned by -1. 
If you have an error when trying to install a package in R saying **'Dependancy X not available ...'** then this can sometimes be fixed by running `install.packages('X')` and then once that completes trying to reinstall the package that had the error. +```{r eval=FALSE} +reticulate::conda_list() +``` -2. I have found that using the github `remotes`` to install packages can be impacted if you have **multiple R sessions** open as one session with a library open can cause the library to be locked and this can prevent an install of a package that depends on that library. +e.g., My PLP virtual environment location was /anaconda3/envs/PLP/bin/python so I added:\ +Sys.setenv(PATH = paste("/anaconda3/envs/PLP/bin", Sys.getenv("PATH"), sep=":")) # Acknowledgments @@ -146,5 +131,4 @@ citation("PatientLevelPrediction") [Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032) - -This work is supported in part through the National Science Foundation grant IIS 1251151. \ No newline at end of file +This work is supported in part through the National Science Foundation grant IIS 1251151. diff --git a/vignettes/PatientLevelPrediction.rmd b/vignettes/PatientLevelPrediction.rmd deleted file mode 100644 index 3df52bbd6..000000000 --- a/vignettes/PatientLevelPrediction.rmd +++ /dev/null @@ -1,165 +0,0 @@ ---- -title: "Quick Install Guide" -author: "Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. 
Rijnbeek" -date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[CO,CE]{Installation Guide} - - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[LE,RO]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - word_document: - toc: yes - html_document: - number_sections: yes - toc: yes ---- - - -# Quick Install Guide -*** - -## Intalling the R package -The preferred way to install the package is by using drat, which will automatically install the latest release and all the latest dependencies. -If the drat code fails or you do not want the official release you could use devtools to install the bleading edge version of the package (latest master). -Note that the latest master could contain bugs, please report them to us if you experience problems. - -To install using drat run: -```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} -install.packages("drat") -drat::addRepo("OHDSI") -install.packages("PatientLevelPrediction") -``` - -To install using devtools run: -```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} -install.packages('devtools') -devtools::install_github("OHDSI/FeatureExtraction") -devtools::install_github('ohdsi/PatientLevelPrediction') -``` - -When installing using devtools make sure to close any other Rstudio sessions that are using PatientLevelPrediction or any dependency. Keeping Rstudio sessions open can cause locks that prevent the package installing. - -## Setting up Python -Many of the classifiers in PatientLevelPrediction use python. To use the python classifiers you need to install and set up the a python environment in R. 
We used the reticulate package: - -```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} -library(PatientLevelPrediction) -reticulate::install_miniconda() -configurePython(envname='r-reticulate', envtype='conda') - -``` - -To add the R keras interface, in Rstudio run: - -```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} -devtools::install_github("rstudio/keras") -library(keras) -install_keras() -``` - -Some of the less frequently used classifiers are considered optional and are not installed by default. To install then, run: - -For GBM survival: -```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} -reticulate::conda_install(envname='r-reticulate', packages = c('scikit-survival'), forge = TRUE, pip = FALSE, pip_ignore_installed = TRUE, conda = "auto", channel = 'sebp') - -``` - -For any of the torch models: -```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} -reticulate::conda_install(envname='r-reticulate', packages = c('pytorch', 'torchvision', 'cpuonly'), forge = TRUE, pip = FALSE, channel = 'pytorch', pip_ignore_installed = TRUE, conda = 'auto') - -``` - - -## Testing the PatientLevelPrediction Installation -To test whether the package is installed correctly run: -```{r eval=FALSE} -library(PatientLevelPrediction) -library(DatabaseConnector) -connectionDetails <- createConnectionDetails(dbms = 'sql_server', - user = 'username', - password = 'hidden', - server = 'your server', - port = 'your port') -PatientLevelPrediction::checkPlpInstallation(connectionDetails = connectionDetails, - python = T) -``` -To test the installation (excluding python) run: - -```{r eval=FALSE} -library(PatientLevelPrediction) -library(DatabaseConnector) -connectionDetails <- createConnectionDetails(dbms = 'sql_server', - user = 'username', - password = 'hidden', - server = 'your server', - port = 'your port') -PatientLevelPrediction::checkPlpInstallation(connectionDetails = 
connectionDetails, - python = F) -``` -The check can take a while to run since it will build the following models in sequence on simulated data: Logistic Regression, RandomForest, MLP, AdaBoost, Decision Tree, Naive Bayes, KNN, Gradient Boosting. Moreover, it will test the database connection. - - -# Common issues -## python environment Mac/linux users: -to make sure R uses the r-reticulate python environment you may need to edit your .Rprofile with the location of the python binary for the PLP environment. Edit the .Rprofile by running: -```{r eval=FALSE} -usethis::edit_r_profile() -``` -and add -```{r eval=FALSE} -Sys.setenv(PATH = paste("your python bin location", Sys.getenv("PATH"), sep=":")) -``` -to the file then save. Where your python bin location is the location returned by -```{r eval=FALSE} -reticulate::conda_list() -``` -e.g., My PLP virtual environment location was /anaconda3/envs/PLP/bin/python so I added: -Sys.setenv(PATH = paste("/anaconda3/envs/PLP/bin", Sys.getenv("PATH"), sep=":")) - - -# Old Instructions - -## To configure python via anaconda -* Close your RStudio -* Install python 3.7 using anaconda (https://www.anaconda.com/download) [make sure you pick the correct operating system] and note the installation location. Anaconda should update you path variable with the python binary. -* Open a new Rstudio and check whether your python is configured correctly by running: -```{r eval=FALSE} -system("python --version") -``` -If set up correctly you should see "Python 3.x.x :: Anaconda, Inc." returned. - -* If not set up correctly then: - + Windows users: make sure your anaconda python binary is in the System PATH environmental variable: go to my computer -> system properties -> advanced system settings Then at the bottom right you’ll see a button: Environmental Variables, clicking on that will enable you to edit the PATH variable. 
Add the following Anaconda locations to your path: `D:\Anaconda3;D:\Anaconda3\Scripts;D:\Anaconda3\Library\bin` (this assumes your installation was done to `D:\Anaconda3`). - + Mac/Linux users: edit the bash profile to add python in the Path by running in the terminal: touch ~/.bash_profile; open ~/.bash_profile; and adding in the location of python in the PATH variable. Unfortunately, you also need to make an edit to the .Rprofile for R to get the correct PATH. To do this open the .Rprofile by running: -```{r eval=FALSE} - usethis::edit_r_profile() -``` -and in this file add -```{r eval=FALSE} -Sys.setenv(PATH = paste("your python bin location", Sys.getenv("PATH"), sep=":")) -``` -* After editing your Path or .Rprofile open a new Rstudio session and test that python is correctly set up via -```{r eval=FALSE} -system("python --version") -``` - - - - - - diff --git a/vignettes/Videos.rmd b/vignettes/Videos.rmd index 410697d7f..5f278c92b 100644 --- a/vignettes/Videos.rmd +++ b/vignettes/Videos.rmd @@ -23,184 +23,45 @@ output: number_sections: yes toc: yes --- + +```{=html} - - +``` ## What is a cohort table? - - - - - - - - - -
-Click To Launch - -Description of Demo -
- - -Learn what a cohort table looks like and what columns are required. -
+| Click To Launch | Description of Demo | +|-----------------------------------------------|------------------------| +| [![Video Vignette PLP Package](http://img.youtube.com/vi/BEukCbT8UoA/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/GY2ZTcizY90) | Learn what a cohort table looks like and what columns are required. | ## Setting up a connection between your database and R - - - - - - - - - -
-Click To Launch - -Description of Demo -
- - -Learn how to configure the connection to your OMOP CDM data from R using the OHDSI DatabaseConnector package. -
+| Click To Launch | Description of Demo | +|----------------------------------------|--------------------------------| +| [![Video Vignette PLP Package](http://img.youtube.com/vi/BEukCbT8UoA/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/8F2X5SKN64w) | Learn how to configure the connection to your OMOP CDM data from R using the OHDSI DatabaseConnector package. | - - ## Running a single PatientLevelPrediction model - - - - - - - - - -
-Click To Launch - -Description of Demo -
- - -Learn how to develop and validate a single PatientLevelPrediction model. -
- - + +| Click To Launch | Description of Demo | +|-----------------------------------------------|-------------------------| +| [![Video Vignette PLP Package](http://img.youtube.com/vi/7AraOsTynD4/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/7AraOsTynD4) | Learn how to develop and validate a single PatientLevelPrediction model. | + ## Running multiple PatientLevelPrediction models study - - - - - - - - - -
-Click To Launch - -Description of Demo -
- - -Learn how to develop and validate multiple PatientLevelPrediction models. -
- - -## Designing a study in Atlas - - - - - - - - - -
-Click To Launch - -Description of Demo -
- - -Learn how to design a multiple or single PatientLevelPrediction study using Atlas. Atlas creates an R package that just needs to be built and then you're on your way to developing multiple models! -
- -## Building and running the Atlas study - - - - - - - - - -
-Click To Launch - -Description of Demo -
- - -Learn how to build the R package generated by Atlas and how to then run the study. -
+| Click To Launch | Description of Demo | +|-----------------------------------------------|-------------------------| +| [![Video Vignette PLP Package](http://img.youtube.com/vi/7wUilx580PE/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/7wUilx580PE) | Learn how to develop and validate multiple PatientLevelPrediction models. | ## Exploring the results in the shiny app - - - - - - - - - -
-Click To Launch - -Description of Demo -
- - -Learn how to interactively explore the model performance and model via the shiny apps viewPlp() and viewMultiplePlp() -
- + +| Click To Launch | Description of Demo | +|---------------------------------------|---------------------------------| +| [![Video Vignette PLP Package](http://img.youtube.com/vi/BulmuH32y_Y/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/BulmuH32y_Y) | Learn how to interactively explore the model performance and model via the shiny apps viewPlp() and viewMultiplePlp() | + ## Validating existing models on OMOP CDM data - - - - - - - - - -
-Click To Launch - -Description of Demo -
- - -This demo shows how you can add any existing score or logistic model and valdiate the model on new OMOP CDM data. This is useful for benchmarking when developing new models or to perform extensive external validation of a model across the OHDSI network. -
- \ No newline at end of file + +| Click To Launch | Description of Demo | +|--------------------------|----------------------------------------------| +| [![Video Vignette PLP Package](http://img.youtube.com/vi/BEukCbT8UoA/0.jpg){alt="Video Vignette PLP Package"}](https://youtu.be/oBsfg9hfrpI) | This demo shows how you can add any existing score or logistic model and validate the model on new OMOP CDM data. This is useful for benchmarking when developing new models or to perform extensive external validation of a model across the OHDSI network. | diff --git a/vignettes/arch1.png b/vignettes/arch1.png deleted file mode 100644 index e4846e56f..000000000 Binary files a/vignettes/arch1.png and /dev/null differ diff --git a/vignettes/atlasdownload1.png b/vignettes/atlasdownload1.png deleted file mode 100644 index ef6559fa9..000000000 Binary files a/vignettes/atlasdownload1.png and /dev/null differ diff --git a/vignettes/atlasdownload1.webp b/vignettes/atlasdownload1.webp new file mode 100644 index 000000000..6cac340ed Binary files /dev/null and b/vignettes/atlasdownload1.webp differ diff --git a/vignettes/atlasdownload2.png b/vignettes/atlasdownload2.png deleted file mode 100644 index 619f8c799..000000000 Binary files a/vignettes/atlasdownload2.png and /dev/null differ diff --git a/vignettes/atlasdownload2.webp b/vignettes/atlasdownload2.webp new file mode 100644 index 000000000..452c5ca21 Binary files /dev/null and b/vignettes/atlasdownload2.webp differ diff --git a/vignettes/atlasplp1.png b/vignettes/atlasplp1.png deleted file mode 100644 index 4b21b2143..000000000 Binary files a/vignettes/atlasplp1.png and /dev/null differ diff --git a/vignettes/atlasplp1.webp b/vignettes/atlasplp1.webp new file mode 100644 index 000000000..71a3c1ce9 Binary files /dev/null and b/vignettes/atlasplp1.webp differ diff --git a/vignettes/atlasplp2.png b/vignettes/atlasplp2.png deleted file mode 100644 index 6bc7b93ad..000000000 Binary files a/vignettes/atlasplp2.png and /dev/null differ 
diff --git a/vignettes/atlasplp2.webp b/vignettes/atlasplp2.webp new file mode 100644 index 000000000..668202de4 Binary files /dev/null and b/vignettes/atlasplp2.webp differ diff --git a/vignettes/atlasplp3.png b/vignettes/atlasplp3.png deleted file mode 100644 index 0911b31ea..000000000 Binary files a/vignettes/atlasplp3.png and /dev/null differ diff --git a/vignettes/atlasplp3.webp b/vignettes/atlasplp3.webp new file mode 100644 index 000000000..523d0143c Binary files /dev/null and b/vignettes/atlasplp3.webp differ diff --git a/vignettes/atlasplp4.png b/vignettes/atlasplp4.png deleted file mode 100644 index b5db1b153..000000000 Binary files a/vignettes/atlasplp4.png and /dev/null differ diff --git a/vignettes/atlasplp4.webp b/vignettes/atlasplp4.webp new file mode 100644 index 000000000..d90a43ea2 Binary files /dev/null and b/vignettes/atlasplp4.webp differ diff --git a/vignettes/atlasplp5.png b/vignettes/atlasplp5.png deleted file mode 100644 index df1c2eb15..000000000 Binary files a/vignettes/atlasplp5.png and /dev/null differ diff --git a/vignettes/cirenn.png b/vignettes/cirenn.png deleted file mode 100644 index f4e8ed054..000000000 Binary files a/vignettes/cirenn.png and /dev/null differ diff --git a/vignettes/cnn_lstm.png b/vignettes/cnn_lstm.png deleted file mode 100644 index a16e1417d..000000000 Binary files a/vignettes/cnn_lstm.png and /dev/null differ diff --git a/vignettes/cnn_mlf2.png b/vignettes/cnn_mlf2.png deleted file mode 100644 index 2b69c159b..000000000 Binary files a/vignettes/cnn_mlf2.png and /dev/null differ diff --git a/vignettes/conv_arch1.png b/vignettes/conv_arch1.png deleted file mode 100644 index 5970b3f1c..000000000 Binary files a/vignettes/conv_arch1.png and /dev/null differ diff --git a/vignettes/conv_arch2.png b/vignettes/conv_arch2.png deleted file mode 100644 index a51ccf08e..000000000 Binary files a/vignettes/conv_arch2.png and /dev/null differ diff --git a/vignettes/covcnn2.png b/vignettes/covcnn2.png deleted file mode 100644 
index 0734a49eb..000000000 Binary files a/vignettes/covcnn2.png and /dev/null differ diff --git a/vignettes/demographicSummary.png b/vignettes/demographicSummary.png deleted file mode 100644 index 8ceafbee8..000000000 Binary files a/vignettes/demographicSummary.png and /dev/null differ diff --git a/vignettes/demographicSummary.webp b/vignettes/demographicSummary.webp new file mode 100644 index 000000000..7d0437deb Binary files /dev/null and b/vignettes/demographicSummary.webp differ diff --git a/vignettes/example1/ATLAS_O.PNG b/vignettes/example1/ATLAS_O.PNG deleted file mode 100644 index 3cda2abf7..000000000 Binary files a/vignettes/example1/ATLAS_O.PNG and /dev/null differ diff --git a/vignettes/example1/ATLAS_O.webp b/vignettes/example1/ATLAS_O.webp new file mode 100644 index 000000000..85e63dc9e Binary files /dev/null and b/vignettes/example1/ATLAS_O.webp differ diff --git a/vignettes/example1/ATLAS_T.PNG b/vignettes/example1/ATLAS_T.PNG deleted file mode 100644 index 8be57dc9e..000000000 Binary files a/vignettes/example1/ATLAS_T.PNG and /dev/null differ diff --git a/vignettes/example1/ATLAS_T.webp b/vignettes/example1/ATLAS_T.webp new file mode 100644 index 000000000..df3a8245f Binary files /dev/null and b/vignettes/example1/ATLAS_T.webp differ diff --git a/vignettes/example2/aceinhibitors.png b/vignettes/example2/aceinhibitors.png deleted file mode 100644 index ce5148f1d..000000000 Binary files a/vignettes/example2/aceinhibitors.png and /dev/null differ diff --git a/vignettes/example2/aceinhibitors.webp b/vignettes/example2/aceinhibitors.webp new file mode 100644 index 000000000..564f8af77 Binary files /dev/null and b/vignettes/example2/aceinhibitors.webp differ diff --git a/vignettes/example2/angioedema.png b/vignettes/example2/angioedema.png deleted file mode 100644 index 3adc8dcc9..000000000 Binary files a/vignettes/example2/angioedema.png and /dev/null differ diff --git a/vignettes/example2/angioedema.webp b/vignettes/example2/angioedema.webp new file 
mode 100644 index 000000000..8c728ce50 Binary files /dev/null and b/vignettes/example2/angioedema.webp differ diff --git a/vignettes/generalizability.png b/vignettes/generalizability.png deleted file mode 100644 index b476ea71f..000000000 Binary files a/vignettes/generalizability.png and /dev/null differ diff --git a/vignettes/generalizability.webp b/vignettes/generalizability.webp new file mode 100644 index 000000000..ba6d14de4 Binary files /dev/null and b/vignettes/generalizability.webp differ diff --git a/vignettes/lstm_last.png b/vignettes/lstm_last.png deleted file mode 100644 index 3e6fc16e5..000000000 Binary files a/vignettes/lstm_last.png and /dev/null differ diff --git a/vignettes/popdef1.png b/vignettes/popdef1.png deleted file mode 100644 index 3d654fe7d..000000000 Binary files a/vignettes/popdef1.png and /dev/null differ diff --git a/vignettes/popdef1.webp b/vignettes/popdef1.webp new file mode 100644 index 000000000..83ef7afd6 Binary files /dev/null and b/vignettes/popdef1.webp differ diff --git a/vignettes/popdef2.png b/vignettes/popdef2.png deleted file mode 100644 index a596e188d..000000000 Binary files a/vignettes/popdef2.png and /dev/null differ diff --git a/vignettes/popdef2.webp b/vignettes/popdef2.webp new file mode 100644 index 000000000..31887dd1b Binary files /dev/null and b/vignettes/popdef2.webp differ diff --git a/vignettes/popdef3.png b/vignettes/popdef3.png deleted file mode 100644 index 34527ef9f..000000000 Binary files a/vignettes/popdef3.png and /dev/null differ diff --git a/vignettes/popdef3.webp b/vignettes/popdef3.webp new file mode 100644 index 000000000..8b409ed49 Binary files /dev/null and b/vignettes/popdef3.webp differ diff --git a/vignettes/popdef4.png b/vignettes/popdef4.png deleted file mode 100644 index 35d4949a5..000000000 Binary files a/vignettes/popdef4.png and /dev/null differ diff --git a/vignettes/popdef4.webp b/vignettes/popdef4.webp new file mode 100644 index 000000000..2709497e7 Binary files /dev/null and 
b/vignettes/popdef4.webp differ diff --git a/vignettes/popdef5.png b/vignettes/popdef5.png deleted file mode 100644 index f6315b8a8..000000000 Binary files a/vignettes/popdef5.png and /dev/null differ diff --git a/vignettes/popdef5.webp b/vignettes/popdef5.webp new file mode 100644 index 000000000..748b8901b Binary files /dev/null and b/vignettes/popdef5.webp differ diff --git a/vignettes/popdef6.png b/vignettes/popdef6.png deleted file mode 100644 index 96a8abd1f..000000000 Binary files a/vignettes/popdef6.png and /dev/null differ diff --git a/vignettes/popdef6.webp b/vignettes/popdef6.webp new file mode 100644 index 000000000..583dc9fba Binary files /dev/null and b/vignettes/popdef6.webp differ diff --git a/vignettes/precisionRecall.png b/vignettes/precisionRecall.png deleted file mode 100644 index 1f1d0f154..000000000 Binary files a/vignettes/precisionRecall.png and /dev/null differ diff --git a/vignettes/precisionRecall.webp b/vignettes/precisionRecall.webp new file mode 100644 index 000000000..af6b0cfe5 Binary files /dev/null and b/vignettes/precisionRecall.webp differ diff --git a/vignettes/predictionDistribution.png b/vignettes/predictionDistribution.png deleted file mode 100644 index 87bafc361..000000000 Binary files a/vignettes/predictionDistribution.png and /dev/null differ diff --git a/vignettes/predictionDistribution.webp b/vignettes/predictionDistribution.webp new file mode 100644 index 000000000..c7756d788 Binary files /dev/null and b/vignettes/predictionDistribution.webp differ diff --git a/vignettes/preferencePDF.png b/vignettes/preferencePDF.png deleted file mode 100644 index 3b3528452..000000000 Binary files a/vignettes/preferencePDF.png and /dev/null differ diff --git a/vignettes/preferencePDF.webp b/vignettes/preferencePDF.webp new file mode 100644 index 000000000..189a356be Binary files /dev/null and b/vignettes/preferencePDF.webp differ diff --git a/vignettes/problems.png b/vignettes/problems.png deleted file mode 100644 index 
931efa6d6..000000000 Binary files a/vignettes/problems.png and /dev/null differ diff --git a/vignettes/problems.webp b/vignettes/problems.webp new file mode 100644 index 000000000..5c1c27bb4 Binary files /dev/null and b/vignettes/problems.webp differ diff --git a/vignettes/shinyroc.PNG b/vignettes/shinyroc.PNG deleted file mode 100644 index 579fab31f..000000000 Binary files a/vignettes/shinyroc.PNG and /dev/null differ diff --git a/vignettes/shinyroc.webp b/vignettes/shinyroc.webp new file mode 100644 index 000000000..a11724623 Binary files /dev/null and b/vignettes/shinyroc.webp differ diff --git a/vignettes/shinysummary.PNG b/vignettes/shinysummary.PNG deleted file mode 100644 index 75cec2430..000000000 Binary files a/vignettes/shinysummary.PNG and /dev/null differ diff --git a/vignettes/shinysummary.webp b/vignettes/shinysummary.webp new file mode 100644 index 000000000..0d256ade1 Binary files /dev/null and b/vignettes/shinysummary.webp differ diff --git a/vignettes/sparseCalibration.png b/vignettes/sparseCalibration.png deleted file mode 100644 index d6e34c0cf..000000000 Binary files a/vignettes/sparseCalibration.png and /dev/null differ diff --git a/vignettes/sparseCalibration.webp b/vignettes/sparseCalibration.webp new file mode 100644 index 000000000..043019e5b Binary files /dev/null and b/vignettes/sparseCalibration.webp differ diff --git a/vignettes/sparseROC.png b/vignettes/sparseROC.png deleted file mode 100644 index 8a4b13cec..000000000 Binary files a/vignettes/sparseROC.png and /dev/null differ diff --git a/vignettes/sparseROC.webp b/vignettes/sparseROC.webp new file mode 100644 index 000000000..2ea3ea56f Binary files /dev/null and b/vignettes/sparseROC.webp differ diff --git a/vignettes/studydesign.png b/vignettes/studydesign.png deleted file mode 100644 index 453f4aadd..000000000 Binary files a/vignettes/studydesign.png and /dev/null differ diff --git a/vignettes/studydesign.webp b/vignettes/studydesign.webp new file mode 100644 index 
000000000..28717c7d2 Binary files /dev/null and b/vignettes/studydesign.webp differ diff --git a/vignettes/variableScatterplot.png b/vignettes/variableScatterplot.png deleted file mode 100644 index bdcf0df4a..000000000 Binary files a/vignettes/variableScatterplot.png and /dev/null differ diff --git a/vignettes/variableScatterplot.webp b/vignettes/variableScatterplot.webp new file mode 100644 index 000000000..de6f8999d Binary files /dev/null and b/vignettes/variableScatterplot.webp differ