diff --git a/docker/depends/pecan_package_dependencies.csv b/docker/depends/pecan_package_dependencies.csv index 259c6c7508c..fe66050ff8c 100644 --- a/docker/depends/pecan_package_dependencies.csv +++ b/docker/depends/pecan_package_dependencies.csv @@ -122,6 +122,7 @@ "jsonlite","*","models/stics","Imports",FALSE "jsonlite","*","modules/data.atmosphere","Imports",FALSE "jsonlite","*","modules/data.remote","Suggests",FALSE +"keras3",">= 1.0.0","modules/assim.sequential","Suggests",FALSE "knitr","*","base/visualization","Suggests",FALSE "knitr","*","modules/data.atmosphere","Suggests",FALSE "knitr",">= 1.42","base/db","Suggests",FALSE diff --git a/modules/assim.sequential/DESCRIPTION b/modules/assim.sequential/DESCRIPTION index 8c9721d2483..dfa454384cd 100644 --- a/modules/assim.sequential/DESCRIPTION +++ b/modules/assim.sequential/DESCRIPTION @@ -48,6 +48,7 @@ Suggests: plotrix, plyr (>= 1.8.4), randomForest, + keras3 (>= 1.0.0), raster, readr, reshape2 (>= 1.4.2), diff --git a/modules/assim.sequential/R/downscale_function.R b/modules/assim.sequential/R/downscale_function.R index 389fe849776..30280dab8d9 100644 --- a/modules/assim.sequential/R/downscale_function.R +++ b/modules/assim.sequential/R/downscale_function.R @@ -1,88 +1,237 @@ -##' @title North America Downscale Function -##' @name NA_downscale -##' @author Joshua Ploshay +##' @title Preprocess Data for Downscaling +##' @name SDA_downscale_preprocess +##' @author Sambhav Dixit ##' -##' @param data In quotes, file path for .rds containing ensemble data. -##' @param coords In quotes, file path for .csv file containing the site coordinates, columns named "lon" and "lat". -##' @param date In quotes, if SDA site run, format is yyyy/mm/dd, if NEON, yyyy-mm-dd. Restricted to years within file supplied to 'data'. -##' @param C_pool In quotes, carbon pool of interest. Name must match carbon pool name found within file supplied to 'data'. -##' @param covariates SpatRaster stack, used as predictors in randomForest. Layers within stack should be named. Recommended that this stack be generated using 'covariates' instructions in assim.sequential/inst folder -##' @details This function will downscale forecast data to unmodeled locations using covariates and site locations +##' @param data_path Character. File path for .rds containing ensemble data. +##' @param coords_path Character. File path for .csv file containing the site coordinates, with columns named "lon" and "lat". +##' @param date Date. If SDA site run, format is yyyy/mm/dd; if NEON, yyyy-mm-dd. Restricted to years within the file supplied to 'data_path'. +##' @param carbon_pool Character. Carbon pool of interest. Name must match the carbon pool name found within the file supplied to 'data_path'. +##' @details This function ensures that the specified date and carbon pool are present in the input data. It also checks the validity of the site coordinates and aligns the number of rows between site coordinates and carbon data. ##' -##' @description This function uses the randomForest model. +##' @description This function reads and checks the input data, ensuring that the required date and carbon pool exist, and that the site coordinates are valid. ##' -##' @return It returns the `downscale_output` list containing lists for the training and testing data sets, models, and predicted maps for each ensemble member. - +##' @return A list containing The read .rds data , The cleaned site coordinates, and the preprocessed carbon data. -NA_downscale <- function(data, coords, date, C_pool, covariates){ - +SDA_downscale_preprocess <- function(data_path, coords_path, date, carbon_pool) { # Read the input data and site coordinates - input_data <- readRDS(data) - site_coordinates <- terra::vect(readr::read_csv(coords), geom=c("lon", "lat"), crs="EPSG:4326") + input_data <- readRDS(data_path) + site_coordinates <- readr::read_csv(coords_path) + + # Convert input_data names to Date objects + input_date_names <- lubridate::ymd(names(input_data)) + names(input_data) <- input_date_names + + # Convert the input date to a Date object + standard_date <- lubridate::ymd(date) + + # Ensure the date exists in the input data + if (!standard_date %in% input_date_names) { + stop(paste("Date", date, "not found in the input data.")) + } # Extract the carbon data for the specified focus year - index <- which(names(input_data) == date) + index <- which(input_date_names == standard_date) data <- input_data[[index]] - carbon_data <- as.data.frame(t(data[which(names(data) == C_pool)])) - names(carbon_data) <- paste0("ensemble",seq(1:ncol(carbon_data))) - - # Extract predictors from covariates raster using site coordinates - predictors <- as.data.frame(terra::extract(covariates, site_coordinates,ID = FALSE)) - # Combine each ensemble member with all predictors - ensembles <- list() - for (i in seq_along(carbon_data)) { - ensembles[[i]] <- cbind(carbon_data[[i]], predictors) + # Ensure the carbon pool exists in the input data + if (!carbon_pool %in% names(data)) { + stop(paste("Carbon pool", carbon_pool, "not found in the input data.")) } - # Rename the carbon_data column for each ensemble member - for (i in 1:length(ensembles)) { - ensembles[[i]] <- dplyr::rename(ensembles[[i]], "carbon_data" = "carbon_data[[i]]") + carbon_data <- as.data.frame(t(data[which(names(data) == carbon_pool)])) + names(carbon_data) <- paste0("ensemble", seq(ncol(carbon_data))) + + # Ensure site coordinates have 'lon' and 'lat' columns + if (!all(c("lon", "lat") %in% names(site_coordinates))) { + stop("Site coordinates must contain 'lon' and 'lat' columns.") } - # Split the observations in each data frame into two data frames based on the proportion of 3/4 - ensembles <- lapply(ensembles, function(df) { - sample <- sample(1:nrow(df), size = round(0.75*nrow(df))) - train <- df[sample, ] - test <- df[-sample, ] - split_list <- list(train, test) - return(split_list) - }) - - # Rename the training and testing data frames for each ensemble member - for (i in 1:length(ensembles)) { - # names(ensembles) <- paste0("ensemble",seq(1:length(ensembles))) - names(ensembles[[i]]) <- c("training", "testing") + # Ensure the number of rows in site coordinates matches the number of rows in carbon data + if (nrow(site_coordinates) != nrow(carbon_data)) { + message("Number of rows in site coordinates does not match the number of rows in carbon data.") + if (nrow(site_coordinates) > nrow(carbon_data)) { + message("Truncating site coordinates to match carbon data rows.") + site_coordinates <- site_coordinates[1:nrow(carbon_data), ] + } else { + message("Truncating carbon data to match site coordinates rows.") + carbon_data <- carbon_data[1:nrow(site_coordinates), ] + } } - # Train a random forest model for each ensemble member using the training data - rf_output <- list() - for (i in 1:length(ensembles)) { - rf_output[[i]] <- randomForest::randomForest(ensembles[[i]][[1]][["carbon_data"]] ~ land_cover+tavg+prec+srad+vapr+nitrogen+phh2o+soc+sand, - data = ensembles[[i]][[1]], - ntree = 1000, - na.action = stats::na.omit, - keep.forest = T, - importance = T) + message("Preprocessing completed successfully.") + return(list(input_data = input_data, site_coordinates = site_coordinates, carbon_data = carbon_data)) +} + +##' @title SDA Downscale Function +##' @name SDA_downscale +##' @author Joshua Ploshay, Sambhav Dixit +##' +##' @param preprocessed List. Preprocessed data returned as an output from the SDA_downscale_preprocess function. +##' @param date Date. If SDA site run, format is yyyy/mm/dd; if NEON, yyyy-mm-dd. Restricted to years within file supplied to 'preprocessed' from the 'data_path'. +##' @param carbon_pool Character. Carbon pool of interest. Name must match carbon pool name found within file supplied to 'preprocessed' from the 'data_path'. +##' @param covariates SpatRaster stack. Used as predictors in downscaling. Layers within stack should be named. Recommended that this stack be generated using 'covariates' instructions in assim.sequential/inst folder +##' @param model_type Character. Either "rf" for Random Forest or "cnn" for Convolutional Neural Network. Default is Random Forest. +##' @param seed Numeric or NULL. Optional seed for random number generation. Default is NULL. +##' @details This function will downscale forecast data to unmodeled locations using covariates and site locations +##' +##' @description This function uses either Random Forest or Convolutional Neural Network model based on the model_type parameter. +##' +##' @return A list containing the training and testing data sets, models, predicted maps for each ensemble member, and predictions for testing data. + +SDA_downscale <- function(preprocessed, date, carbon_pool, covariates, model_type = "rf", seed = NULL) { + carbon_data <- preprocessed$carbon_data + + # Convert site coordinates to SpatVector + site_coordinates <- terra::vect(preprocessed$site_coordinates, geom = c("lon", "lat"), crs = "EPSG:4326") + + # Extract predictors from covariates raster using site coordinates + predictors <- as.data.frame(terra::extract(covariates, site_coordinates, ID = FALSE)) + + # Dynamically get covariate names + covariate_names <- names(predictors) + + # Create a single data frame with all predictors and ensemble data + full_data <- cbind(carbon_data, predictors) + + # Split the observations into training and testing sets + if (!is.null(seed)) { + set.seed(seed) # Only set seed if provided } + sample <- sample(1:nrow(full_data), size = round(0.75 * nrow(full_data))) + train_data <- full_data[sample, ] + test_data <- full_data[-sample, ] + + # Prepare data for both RF and CNN + x_data <- as.matrix(full_data[, covariate_names]) + y_data <- as.matrix(carbon_data) + + # Calculate scaling parameters from all data + scaling_params <- list( + mean = colMeans(x_data), + sd = apply(x_data, 2, stats::sd) + ) + + # Normalize the data + x_data_scaled <- scale(x_data, center = scaling_params$mean, scale = scaling_params$sd) - # Generate predictions (maps) for each ensemble member using the trained models - maps <- list(ncol(rf_output)) - for (i in 1:length(rf_output)) { - maps[[i]] <- terra::predict(object = covariates, - model = rf_output[[i]],na.rm = T) + # Split into training and testing sets + x_train <- x_data_scaled[sample, ] + x_test <- x_data_scaled[-sample, ] + y_train <- y_data[sample, ] + y_test <- y_data[-sample, ] + + # Initialize lists for outputs + models <- list() + maps <- list() + predictions <- list() + + if (model_type == "rf") { + for (i in seq_along(carbon_data)) { + ensemble_col <- paste0("ensemble", i) + formula <- stats::as.formula(paste(ensemble_col, "~", paste(covariate_names, collapse = " + "))) + models[[i]] <- randomForest::randomForest(formula, + data = train_data, + ntree = 1000, + na.action = stats::na.omit, + keep.forest = TRUE, + importance = TRUE) + + maps[[i]] <- terra::predict(covariates, model = models[[i]], na.rm = TRUE) + predictions[[i]] <- stats::predict(models[[i]], test_data) + } + } else if (model_type == "cnn") { + x_train <- keras3::array_reshape(x_train, c(nrow(x_train), 1, ncol(x_train))) + x_test <- keras3::array_reshape(x_test, c(nrow(x_test), 1, ncol(x_test))) + + for (i in seq_along(carbon_data)) { + model <- keras3::keras_model_sequential() |> + keras3::layer_conv_1d(filters = 64, kernel_size = 1, activation = 'relu', input_shape = c(1, length(covariate_names))) |> + keras3::layer_flatten() |> + keras3::layer_dense(units = 64, activation = 'relu') |> + keras3::layer_dense(units = 1) + + model |> keras3::compile( + loss = 'mean_squared_error', + optimizer = keras3::optimizer_adam(), + metrics = c('mean_absolute_error') + ) + + model |> keras3::fit( + x = x_train, + y = y_train[, i], + epochs = 100, + batch_size = 32, + validation_split = 0.2, + verbose = 0 + ) + + models[[i]] <- model + + cnn_predict <- function(model, newdata, scaling_params) { + newdata <- scale(newdata, center = scaling_params$mean, scale = scaling_params$sd) + newdata <- keras3::array_reshape(newdata, c(nrow(newdata), 1, ncol(newdata))) + predictions <- stats::predict(model, newdata) + return(as.vector(predictions)) + } + + prediction_rast <- terra::rast(covariates) + maps[[i]] <- terra::predict(prediction_rast, model = models[[i]], + fun = cnn_predict, + scaling_params = scaling_params) + + predictions[[i]] <- cnn_predict(models[[i]], x_data[-sample, ], scaling_params) + } + } else { + stop("Invalid model_type. Please choose either 'rf' for Random Forest or 'cnn' for Convolutional Neural Network.") } # Organize the results into a single output list - downscale_output <- list(ensembles, rf_output, maps) + downscale_output <- list( + data = list(training = train_data, testing = test_data), + models = models, + maps = maps, + predictions = predictions, + scaling_params = scaling_params + ) # Rename each element of the output list with appropriate ensemble numbers - for (i in 1:length(downscale_output)) { - names(downscale_output[[i]]) <- paste0("ensemble",seq(1:length(downscale_output[[i]]))) + for (i in seq_along(carbon_data)) { + names(downscale_output$models)[i] <- paste0("ensemble", i) + names(downscale_output$maps)[i] <- paste0("ensemble", i) + names(downscale_output$predictions)[i] <- paste0("ensemble", i) } - # Rename the main components of the output list - names(downscale_output) <- c("data", "models", "maps") - return(downscale_output) } + +##' @title Calculate Metrics for Downscaling Results +##' @name SDA_downscale_metrics +##' @author Sambhav Dixit +##' +##' @param downscale_output List. Output from the SDA_downscale function, containing data, models, maps, and predictions for each ensemble. +##' @param carbon_pool Character. Name of the carbon pool used in the downscaling process. +##' +##' @details This function calculates performance metrics for the downscaling results. It computes Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared for each ensemble. The function uses the actual values from the testing data and the predictions generated during the downscaling process. +##' +##' @description This function takes the output from the SDA_downscale function and computes various performance metrics for each ensemble. It provides a way to evaluate the accuracy of the downscaling results without modifying the main downscaling function. +##' +##' @return A list of metrics for each ensemble, where each element contains MAE , MSE ,R_squared ,actual values from testing data and predicted values for the testing data + +SDA_downscale_metrics <- function(downscale_output, carbon_pool) { + metrics <- list() + + for (i in 1:length(downscale_output$data)) { + actual <- downscale_output$data[[i]]$testing[[paste0(carbon_pool, "_ens", i)]] + predicted <- downscale_output$predictions[[i]] + + mse <- mean((actual - predicted)^2) + mae <- mean(abs(actual - predicted)) + r_squared <- 1 - sum((actual - predicted)^2) / sum((actual - mean(actual))^2) + + metrics[[i]] <- list(MSE = mse, MAE = mae, R_squared = r_squared, actual = actual, predicted = predicted) + } + + names(metrics) <- paste0("ensemble", seq_along(metrics)) + + return(metrics) +} diff --git a/modules/assim.sequential/man/NA_downscale.Rd b/modules/assim.sequential/man/NA_downscale.Rd deleted file mode 100644 index 47ce232b088..00000000000 --- a/modules/assim.sequential/man/NA_downscale.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/downscale_function.R -\name{NA_downscale} -\alias{NA_downscale} -\title{North America Downscale Function} -\usage{ -NA_downscale(data, coords, date, C_pool, covariates) -} -\arguments{ -\item{data}{In quotes, file path for .rds containing ensemble data.} - -\item{coords}{In quotes, file path for .csv file containing the site coordinates, columns named "lon" and "lat".} - -\item{date}{In quotes, if SDA site run, format is yyyy/mm/dd, if NEON, yyyy-mm-dd. Restricted to years within file supplied to 'data'.} - -\item{C_pool}{In quotes, carbon pool of interest. Name must match carbon pool name found within file supplied to 'data'.} - -\item{covariates}{SpatRaster stack, used as predictors in randomForest. Layers within stack should be named. Recommended that this stack be generated using 'covariates' instructions in assim.sequential/inst folder} -} -\value{ -It returns the `downscale_output` list containing lists for the training and testing data sets, models, and predicted maps for each ensemble member. -} -\description{ -This function uses the randomForest model. -} -\details{ -This function will downscale forecast data to unmodeled locations using covariates and site locations -} -\author{ -Joshua Ploshay -} diff --git a/modules/assim.sequential/man/SDA_downscale.Rd b/modules/assim.sequential/man/SDA_downscale.Rd new file mode 100644 index 00000000000..89f6810866b --- /dev/null +++ b/modules/assim.sequential/man/SDA_downscale.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/downscale_function.R +\name{SDA_downscale} +\alias{SDA_downscale} +\title{SDA Downscale Function} +\usage{ +SDA_downscale( + preprocessed, + date, + carbon_pool, + covariates, + model_type = "rf", + seed = NULL +) +} +\arguments{ +\item{preprocessed}{List. Preprocessed data returned as an output from the SDA_downscale_preprocess function.} + +\item{date}{Date. If SDA site run, format is yyyy/mm/dd; if NEON, yyyy-mm-dd. Restricted to years within file supplied to 'preprocessed' from the 'data_path'.} + +\item{carbon_pool}{Character. Carbon pool of interest. Name must match carbon pool name found within file supplied to 'preprocessed' from the 'data_path'.} + +\item{covariates}{SpatRaster stack. Used as predictors in downscaling. Layers within stack should be named. Recommended that this stack be generated using 'covariates' instructions in assim.sequential/inst folder} + +\item{model_type}{Character. Either "rf" for Random Forest or "cnn" for Convolutional Neural Network. Default is Random Forest.} + +\item{seed}{Numeric or NULL. Optional seed for random number generation. Default is NULL.} +} +\value{ +A list containing the training and testing data sets, models, predicted maps for each ensemble member, and predictions for testing data. +} +\description{ +This function uses either Random Forest or Convolutional Neural Network model based on the model_type parameter. +} +\details{ +This function will downscale forecast data to unmodeled locations using covariates and site locations +} +\author{ +Joshua Ploshay, Sambhav Dixit +} diff --git a/modules/assim.sequential/man/SDA_downscale_metrics.Rd b/modules/assim.sequential/man/SDA_downscale_metrics.Rd new file mode 100644 index 00000000000..6ee30bb0c8a --- /dev/null +++ b/modules/assim.sequential/man/SDA_downscale_metrics.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/downscale_function.R +\name{SDA_downscale_metrics} +\alias{SDA_downscale_metrics} +\title{Calculate Metrics for Downscaling Results} +\usage{ +SDA_downscale_metrics(downscale_output, carbon_pool) +} +\arguments{ +\item{downscale_output}{List. Output from the SDA_downscale function, containing data, models, maps, and predictions for each ensemble.} + +\item{carbon_pool}{Character. Name of the carbon pool used in the downscaling process.} +} +\value{ +A list of metrics for each ensemble, where each element contains MAE , MSE ,R_squared ,actual values from testing data and predicted values for the testing data +} +\description{ +This function takes the output from the SDA_downscale function and computes various performance metrics for each ensemble. It provides a way to evaluate the accuracy of the downscaling results without modifying the main downscaling function. +} +\details{ +This function calculates performance metrics for the downscaling results. It computes Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared for each ensemble. The function uses the actual values from the testing data and the predictions generated during the downscaling process. +} +\author{ +Sambhav Dixit +} diff --git a/modules/assim.sequential/man/SDA_downscale_preprocess.Rd b/modules/assim.sequential/man/SDA_downscale_preprocess.Rd new file mode 100644 index 00000000000..0e2a9f70bfe --- /dev/null +++ b/modules/assim.sequential/man/SDA_downscale_preprocess.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/downscale_function.R +\name{SDA_downscale_preprocess} +\alias{SDA_downscale_preprocess} +\title{Preprocess Data for Downscaling} +\usage{ +SDA_downscale_preprocess(data_path, coords_path, date, carbon_pool) +} +\arguments{ +\item{data_path}{Character. File path for .rds containing ensemble data.} + +\item{coords_path}{Character. File path for .csv file containing the site coordinates, with columns named "lon" and "lat".} + +\item{date}{Date. If SDA site run, format is yyyy/mm/dd; if NEON, yyyy-mm-dd. Restricted to years within the file supplied to 'data_path'.} + +\item{carbon_pool}{Character. Carbon pool of interest. Name must match the carbon pool name found within the file supplied to 'data_path'.} +} +\value{ +A list containing The read .rds data , The cleaned site coordinates, and the preprocessed carbon data. +} +\description{ +This function reads and checks the input data, ensuring that the required date and carbon pool exist, and that the site coordinates are valid. +} +\details{ +This function ensures that the specified date and carbon pool are present in the input data. It also checks the validity of the site coordinates and aligns the number of rows between site coordinates and carbon data. +} +\author{ +Sambhav Dixit +}