From 0838e950558b872702129094add0800731cb1f93 Mon Sep 17 00:00:00 2001 From: Joshua Ploshay Date: Thu, 10 Aug 2023 19:11:30 -0400 Subject: [PATCH 1/8] Add the down_scale function to pecan. --- .../assim.sequential/R/downscale_function.R | 93 +++++++++++++++++++ modules/assim.sequential/man/NA_downscale.Rd | 31 +++++++ 2 files changed, 124 insertions(+) create mode 100644 modules/assim.sequential/R/downscale_function.R create mode 100644 modules/assim.sequential/man/NA_downscale.Rd diff --git a/modules/assim.sequential/R/downscale_function.R b/modules/assim.sequential/R/downscale_function.R new file mode 100644 index 00000000000..42dd3e2102f --- /dev/null +++ b/modules/assim.sequential/R/downscale_function.R @@ -0,0 +1,93 @@ +##' @title North America Downscale Function +##' @name NA_downscale +##' @author Joshua Ploshay +##' +##' @param data In quotes, file path for .rds containing ensemble data. +##' @param focus_year In quotes, if SDA site run, format is yyyy/mm/dd, if NEON, yyyy-mm-dd. Restricted to years within file supplied to 'data'. +##' @param C_pool In quotes, carbon pool of interest. Name must match carbon pool name found within file supplied to 'data'. +##' @param covariates: In quotes, file path of SpatRaster stack, used as predictors in randomForest. Layers within stack should be named. +##' @param cords: In quotes, file path for .csv file containing the site coordinates, columns named "lon" and "lat". +##' @details This function will downscale forecast data to unmodeled locations using covariates and site locations +##' +##' @description This function uses the randomForest model. +##' +##' @return It returns the `downscale_output` list containing lists for the training and testing data sets, models, and predicted maps for each ensemble member. + + +NA_downscale <- function(data, cords, covariates, focus_year, C_pool){ + + # Read in the covariates and set CRS to EPSG:4326 + covariates <- terra::rast(covariates) # ADD package to every function + terra::crs(covariates) <- "EPSG:4326" + + # Read the input data and site coordinates + input_data <- readRDS(data) + site_coordinates <- terra::vect(readr::read_csv(cords), geom=c("lon", "lat"), crs="EPSG:4326") + + # Extract the carbon data for the specified focus year + index <- which(names(input_data) == focus_year) + data <- input_data[[index]] + carbon_data <- as.data.frame(t(data[which(names(data) == C_pool)])) + names(carbon_data) <- paste0("ensemble",seq(1:ncol(carbon_data))) + + # Extract predictors from covariates raster using site coordinates + predictors <- as.data.frame(terra::extract(covariates, site_coordinates)) + predictors <- dplyr::select(predictors, -1) + + # Combine each ensemble member with all predictors + ensembles <- list() + for (i in seq_along(carbon_data)) { + ensembles[[i]] <- cbind(carbon_data[[i]], predictors) + } + + # Rename the carbon_data column for each ensemble member + for (i in 1:length(ensembles)) { + ensembles[[i]] <- dplyr::rename(ensembles[[i]], "carbon_data" = "carbon_data[[i]]") + } + + # Split the observations in each data frame into two data frames based on the proportion of 3/4 + ensembles <- lapply(ensembles, function(df) { + sample <- sample(1:nrow(df), size = round(0.75*nrow(df))) + train <- df[sample, ] + test <- df[-sample, ] + split_list <- list(train, test) + return(split_list) + }) + + # Rename the training and testing data frames for each ensemble member + for (i in 1:length(ensembles)) { + # names(ensembles) <- paste0("ensemble",seq(1:length(ensembles))) + names(ensembles[[i]]) <- c("training", "testing") + } + + # Train a random forest model for each ensemble member using the training data + output <- list() + for (i in 1:length(ensembles)) { + output[[i]] <- randomForest::randomForest(ensembles[[i]][[1]][["carbon_data"]] ~ land_cover+tavg+prec+srad+vapr+nitrogen+phh2o+soc+sand, + data = ensembles[[i]][[1]], + ntree = 1000, + na.action = na.omit, + keep.forest = T, + importance = T) + } + + # Generate predictions (maps) for each ensemble member using the trained models + maps <- list(ncol(output)) + for (i in 1:length(output)) { + maps[[i]] <- terra::predict(object = covariates, + model = output[[i]],na.rm = T) + } + + # Organize the results into a single output list + downscale_output <- list(ensembles, output, maps) + + # Rename each element of the output list with appropriate ensemble numbers + for (i in 1:length(downscale_output)) { + names(downscale_output[[i]]) <- paste0("ensemble",seq(1:length(downscale_output[[i]]))) + } + + # Rename the main components of the output list + names(downscale_output) <- c("data", "models", "maps") + + return(downscale_output) +} diff --git a/modules/assim.sequential/man/NA_downscale.Rd b/modules/assim.sequential/man/NA_downscale.Rd new file mode 100644 index 00000000000..5dabf4738d2 --- /dev/null +++ b/modules/assim.sequential/man/NA_downscale.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/downscale_function.R +\name{NA_downscale} +\alias{NA_downscale} +\title{North America Downscale Function} +\usage{ +NA_downscale(data, cords, covariates, focus_year, C_pool) +} +\arguments{ +\item{data}{In quotes, file path for .rds containing ensemble data.} + +\item{focus_year}{In quotes, if SDA site run, format is yyyy/mm/dd, if NEON, yyyy-mm-dd. Restricted to years within file supplied to 'data'.} + +\item{C_pool}{In quotes, carbon pool of interest. Name must match carbon pool name found within file supplied to 'data'.} + +\item{covariates:}{In quotes, file path of SpatRaster stack, used as predictors in randomForest. Layers within stack should be named.} + +\item{cords:}{In quotes, file path for .csv file containing the site coordinates, columns named "lon" and "lat".} +} +\value{ +It returns the `downscale_output` list containing lists for the training and testing data sets, models, and predicted maps for each ensemble member. +} +\description{ +This function uses the randomForest model. +} +\details{ +This function will downscale forecast data to unmodeled locations using covariates and site locations +} +\author{ +Joshua Ploshay +} From 867b88a546b2020ee807cf39d088cd88a0ef44af Mon Sep 17 00:00:00 2001 From: Joshua Ploshay Date: Wed, 28 Feb 2024 15:41:28 -0500 Subject: [PATCH 2/8] updated documentation to include suggested packages --- docker/depends/pecan.depends.R | 2 ++ modules/assim.sequential/DESCRIPTION | 3 +++ 2 files changed, 5 insertions(+) diff --git a/docker/depends/pecan.depends.R b/docker/depends/pecan.depends.R index a4f2e489a4e..e49922d1ef6 100644 --- a/docker/depends/pecan.depends.R +++ b/docker/depends/pecan.depends.R @@ -104,9 +104,11 @@ wanted <- c( 'purrr', 'pwr', 'R.utils', +'randomForest', 'randtoolbox', 'raster', 'rcrossref', +'readr', 'REddyProc', 'redland', 'reshape', diff --git a/modules/assim.sequential/DESCRIPTION b/modules/assim.sequential/DESCRIPTION index acb9d7aff85..6c2e7052a73 100644 --- a/modules/assim.sequential/DESCRIPTION +++ b/modules/assim.sequential/DESCRIPTION @@ -45,11 +45,14 @@ Suggests: PEcAn.data.remote, plotrix, plyr (>= 1.8.4), + randomForest, raster, + readr, reshape2 (>= 1.4.2), rlist, sf, stats, + terra, testthat, tictoc, tidyr, From 5ae30363c1c1832805bb48b7971163eb4ac1ef36 Mon Sep 17 00:00:00 2001 From: Joshua Ploshay Date: Wed, 28 Feb 2024 15:53:34 -0500 Subject: [PATCH 3/8] resolve conflicts and update dependencies --- docker/depends/pecan.depends.R | 188 +++--------------- docker/depends/pecan_package_dependencies.csv | 3 + 2 files changed, 28 insertions(+), 163 deletions(-) diff --git a/docker/depends/pecan.depends.R b/docker/depends/pecan.depends.R index b03d95e3433..0f3057d896d 100644 --- a/docker/depends/pecan.depends.R +++ b/docker/depends/pecan.depends.R @@ -6,142 +6,6 @@ rlib <- Sys.getenv('R_LIBS_USER', '/usr/local/lib/R/site-library') Sys.setenv(RLIB = rlib) -<<<<<<< HEAD -# install all packages (depends, imports, suggests) -wanted <- c( -'abind', -'amerifluxr', -'arrow', -'assertthat', -'BayesianTools', -'BioCro', -'bit64', -'BrownDog', -'coda', -'corrplot', -'curl', -'data.table', -'dataone', -'datapack', -'DBI', -'dbplyr', -'devtools', -'doParallel', -'doSNOW', -'dplR', -'dplyr', -'ellipse', -'emdbook', -'foreach', -'fs', -'furrr', -'future', -'geonames', -'getPass', -'ggmap', -'ggmcmc', -'ggplot2', -'ggpubr', -'ggrepel', -'glue', -'graphics', -'grDevices', -'grid', -'gridExtra', -'hdf5r', -'here', -'httr', -'IDPmisc', -'jsonlite', -'knitr', -'lattice', -'linkages', -'lqmm', -'lubridate', -'Maeswrap', -'magic', -'magrittr', -'maps', -'maptools', -'markdown', -'MASS', -'Matrix', -'mclust', -'MCMCpack', -'methods', -'mgcv', -'minpack.lm', -'mlegp', -'mockery', -'MODISTools', -'mvbutils', -'mvtnorm', -'ncdf4', -'neonstore', -'neonUtilities', -'nimble', -'nneo', -'optparse', -'parallel', -'plotrix', -'plyr', -'png', -'prodlim', -'progress', -'purrr', -'pwr', -'R.utils', -'randomForest', -'randtoolbox', -'raster', -'rcrossref', -'readr', -'REddyProc', -'redland', -'reshape', -'reshape2', -'reticulate', -'rgdal', -'rjags', -'rjson', -'rlang', -'rlist', -'rmarkdown', -'RPostgres', -'RPostgreSQL', -'Rpreles', -'RSQLite', -'sf', -'SimilarityMeasures', -'sirt', -'sp', -'stats', -'stringi', -'stringr', -'swfscMisc', -'terra', -'testthat', -'tibble', -'tictoc', -'tidyr', -'tidyselect', -'tidyverse', -'tools', -'traits', -'TruncatedNormal', -'truncnorm', -'units', -'urltools', -'utils', -'vdiffr', -'withr', -'XML', -'xtable', -'xts', -'zoo' -) -missing <- wanted[!(wanted %in% installed.packages()[,'Package'])] -======= # Find the latest of several possible minimum package versions condense_version_requirements <- function(specs) { if (all(specs == "*")) { @@ -151,7 +15,7 @@ condense_version_requirements <- function(specs) { specs <- unique(specs[specs != "*"]) versions <- package_version( gsub("[^[:digit:].-]+", "", specs)) - + if ((length(unique(versions)) > 1) && any(!grepl(">", specs))) { # Can't assume the latest version works for all, so give up. # We *could* write more to handle this case if needed, but it seems very rare: @@ -175,30 +39,30 @@ condense_version_requirements <- function(specs) { # Install or newer, # upgrading dependencies only if needed to satisfy stated version requirements ensure_version <- function(pkg, version) { - vers <- gsub('[^[:digit:].-]+', '', version) - cmp <- get(gsub('[^<>=]+', '', version)) - ok <- requireNamespace(pkg, quietly = TRUE) && - cmp(packageVersion(pkg), vers) - if (!ok) { - # install pkg and any *missing* dependencies - remotes::install_version(pkg, version, dependencies = TRUE, upgrade = FALSE) - # Now check for installed but *incompatible* dependencies - # (install_version doesn't resolve these when upgrade=FALSE) - dep <- desc::desc_get_deps(system.file("DESCRIPTION", package = pkg)) - dep <- dep[ - dep$type %in% c("Depends", "Imports", "LinkingTo") - & dep$version != "*" - & dep$package != "R",] - invisible(Map(ensure_version, dep$package, dep$version)) - } - + vers <- gsub('[^[:digit:].-]+', '', version) + cmp <- get(gsub('[^<>=]+', '', version)) + ok <- requireNamespace(pkg, quietly = TRUE) && + cmp(packageVersion(pkg), vers) + if (!ok) { + # install pkg and any *missing* dependencies + remotes::install_version(pkg, version, dependencies = TRUE, upgrade = FALSE) + # Now check for installed but *incompatible* dependencies + # (install_version doesn't resolve these when upgrade=FALSE) + dep <- desc::desc_get_deps(system.file("DESCRIPTION", package = pkg)) + dep <- dep[ + dep$type %in% c("Depends", "Imports", "LinkingTo") + & dep$version != "*" + & dep$package != "R",] + invisible(Map(ensure_version, dep$package, dep$version)) + } + } # Read list of dependencies. # NOTE: These files are autogenerated -- # use scripts/generate_dependencies.R to edit them. all_deps <- read.csv("pecan_package_dependencies.csv") |> - subset(!is_pecan) + subset(!is_pecan) gh_repos <- readLines("pecan_deps_from_github.txt") @@ -210,16 +74,15 @@ remotes::install_github(gh_repos, lib = rlib) # For deps used by multiple packages, find a version that works for all uniq_deps <- tapply( - all_deps$version, - INDEX = all_deps$package, - FUN = condense_version_requirements) + all_deps$version, + INDEX = all_deps$package, + FUN = condense_version_requirements) # Install deps that declare no version restriction. # We'll install these with one plain old `install.packages()` call. unversioned <- names(uniq_deps[uniq_deps == "*"]) missing <- unversioned[!(unversioned %in% installed.packages()[,'Package'])] ->>>>>>> 48a3b259fd03512c145b4d6eb387467e56ab49b4 install.packages(missing, lib = rlib) @@ -229,9 +92,8 @@ install.packages(missing, lib = rlib) # it can't fill the version req from snapshot versions. # (Assumes our CRAN uses the same URL scheme as Posit package manager) options(repos = c( - getOption('repos'), - sub(r'(\d{4}-\d{2}-\d{2})', 'latest', getOption('repos')) + getOption('repos'), + sub(r'(\d{4}-\d{2}-\d{2})', 'latest', getOption('repos')) )) versioned <- uniq_deps[uniq_deps != "*"] -invisible(Map(ensure_version, names(versioned), versioned)) - +invisible(Map(ensure_version, names(versioned), versioned)) \ No newline at end of file diff --git a/docker/depends/pecan_package_dependencies.csv b/docker/depends/pecan_package_dependencies.csv index 8345db18103..eb6b8e0f47a 100644 --- a/docker/depends/pecan_package_dependencies.csv +++ b/docker/depends/pecan_package_dependencies.csv @@ -438,6 +438,7 @@ "purrr",">= 0.2.3","modules/data.atmosphere","Imports",FALSE "pwr","*","modules/rtm","Suggests",FALSE "R.utils","*","base/db","Imports",FALSE +"randomForest","*","modules/assim.sequential","Suggests",FALSE "randtoolbox","*","base/utils","Suggests",FALSE "randtoolbox","*","modules/uncertainty","Imports",FALSE "raster","*","base/visualization","Suggests",FALSE @@ -446,6 +447,7 @@ "raster","*","modules/data.land","Suggests",FALSE "raster","*","modules/data.remote","Suggests",FALSE "rcrossref","*","base/db","Suggests",FALSE +"readr","*","modules/assim.sequential","Suggests",FALSE "REddyProc","*","modules/data.atmosphere","Imports",FALSE "redland","*","modules/data.land","Suggests",FALSE "reshape","*","modules/data.remote","Suggests",FALSE @@ -554,6 +556,7 @@ "stringr",">= 1.1.0","modules/data.atmosphere","Imports",FALSE "suntools","*","modules/data.atmosphere","Imports",FALSE "swfscMisc","*","modules/data.land","Imports",FALSE +"terra","*","modules/assim.sequential","Suggests",FALSE "terra","*","modules/data.atmosphere","Imports",FALSE "terra","*","modules/data.land","Imports",FALSE "terra","*","modules/data.remote","Imports",FALSE From fe0aefeb7837cadc879779105e0382ffd14a21b4 Mon Sep 17 00:00:00 2001 From: Michael Dietze Date: Tue, 5 Mar 2024 09:35:21 -0500 Subject: [PATCH 4/8] Update modules/assim.sequential/R/downscale_function.R --- modules/assim.sequential/R/downscale_function.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/assim.sequential/R/downscale_function.R b/modules/assim.sequential/R/downscale_function.R index 42dd3e2102f..ce2428fd354 100644 --- a/modules/assim.sequential/R/downscale_function.R +++ b/modules/assim.sequential/R/downscale_function.R @@ -6,7 +6,7 @@ ##' @param focus_year In quotes, if SDA site run, format is yyyy/mm/dd, if NEON, yyyy-mm-dd. Restricted to years within file supplied to 'data'. ##' @param C_pool In quotes, carbon pool of interest. Name must match carbon pool name found within file supplied to 'data'. ##' @param covariates: In quotes, file path of SpatRaster stack, used as predictors in randomForest. Layers within stack should be named. -##' @param cords: In quotes, file path for .csv file containing the site coordinates, columns named "lon" and "lat". +##' @param cords In quotes, file path for .csv file containing the site coordinates, columns named "lon" and "lat". ##' @details This function will downscale forecast data to unmodeled locations using covariates and site locations ##' ##' @description This function uses the randomForest model. From 2e3d83293bfe7087db5cdb9a66de7b1c9fd4f156 Mon Sep 17 00:00:00 2001 From: Michael Dietze Date: Tue, 5 Mar 2024 09:35:35 -0500 Subject: [PATCH 5/8] Update modules/assim.sequential/R/downscale_function.R --- modules/assim.sequential/R/downscale_function.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/assim.sequential/R/downscale_function.R b/modules/assim.sequential/R/downscale_function.R index ce2428fd354..54551c1ee83 100644 --- a/modules/assim.sequential/R/downscale_function.R +++ b/modules/assim.sequential/R/downscale_function.R @@ -5,7 +5,7 @@ ##' @param data In quotes, file path for .rds containing ensemble data. ##' @param focus_year In quotes, if SDA site run, format is yyyy/mm/dd, if NEON, yyyy-mm-dd. Restricted to years within file supplied to 'data'. ##' @param C_pool In quotes, carbon pool of interest. Name must match carbon pool name found within file supplied to 'data'. -##' @param covariates: In quotes, file path of SpatRaster stack, used as predictors in randomForest. Layers within stack should be named. +##' @param covariates In quotes, file path of SpatRaster stack, used as predictors in randomForest. Layers within stack should be named. ##' @param cords In quotes, file path for .csv file containing the site coordinates, columns named "lon" and "lat". ##' @details This function will downscale forecast data to unmodeled locations using covariates and site locations ##' From 1daf41bbb46f9a522144fbb3bd62542810e27f06 Mon Sep 17 00:00:00 2001 From: Michael Dietze Date: Thu, 7 Mar 2024 11:07:06 -0500 Subject: [PATCH 6/8] Update modules/assim.sequential/R/downscale_function.R --- modules/assim.sequential/R/downscale_function.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/assim.sequential/R/downscale_function.R b/modules/assim.sequential/R/downscale_function.R index 54551c1ee83..4a64beef645 100644 --- a/modules/assim.sequential/R/downscale_function.R +++ b/modules/assim.sequential/R/downscale_function.R @@ -66,7 +66,7 @@ NA_downscale <- function(data, cords, covariates, focus_year, C_pool){ output[[i]] <- randomForest::randomForest(ensembles[[i]][[1]][["carbon_data"]] ~ land_cover+tavg+prec+srad+vapr+nitrogen+phh2o+soc+sand, data = ensembles[[i]][[1]], ntree = 1000, - na.action = na.omit, + na.action = stats::na.omit, keep.forest = T, importance = T) } From 4d032e844c27fef2862e6452ba60845c65b8ae8a Mon Sep 17 00:00:00 2001 From: Michael Dietze Date: Thu, 7 Mar 2024 11:07:19 -0500 Subject: [PATCH 7/8] Update modules/assim.sequential/man/NA_downscale.Rd --- modules/assim.sequential/man/NA_downscale.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/assim.sequential/man/NA_downscale.Rd b/modules/assim.sequential/man/NA_downscale.Rd index 5dabf4738d2..979229d30b2 100644 --- a/modules/assim.sequential/man/NA_downscale.Rd +++ b/modules/assim.sequential/man/NA_downscale.Rd @@ -13,7 +13,7 @@ NA_downscale(data, cords, covariates, focus_year, C_pool) \item{C_pool}{In quotes, carbon pool of interest. Name must match carbon pool name found within file supplied to 'data'.} -\item{covariates:}{In quotes, file path of SpatRaster stack, used as predictors in randomForest. Layers within stack should be named.} +\item{covariates}{In quotes, file path of SpatRaster stack, used as predictors in randomForest. Layers within stack should be named.} \item{cords:}{In quotes, file path for .csv file containing the site coordinates, columns named "lon" and "lat".} } From e98c4613e22e34a7d607bed5b2a410d81b57865d Mon Sep 17 00:00:00 2001 From: Michael Dietze Date: Thu, 7 Mar 2024 11:07:26 -0500 Subject: [PATCH 8/8] Update modules/assim.sequential/man/NA_downscale.Rd --- modules/assim.sequential/man/NA_downscale.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/assim.sequential/man/NA_downscale.Rd b/modules/assim.sequential/man/NA_downscale.Rd index 979229d30b2..8ab6a5ea946 100644 --- a/modules/assim.sequential/man/NA_downscale.Rd +++ b/modules/assim.sequential/man/NA_downscale.Rd @@ -15,7 +15,7 @@ NA_downscale(data, cords, covariates, focus_year, C_pool) \item{covariates}{In quotes, file path of SpatRaster stack, used as predictors in randomForest. Layers within stack should be named.} -\item{cords:}{In quotes, file path for .csv file containing the site coordinates, columns named "lon" and "lat".} +\item{cords}{In quotes, file path for .csv file containing the site coordinates, columns named "lon" and "lat".} } \value{ It returns the `downscale_output` list containing lists for the training and testing data sets, models, and predicted maps for each ensemble member.