PecanProject · mdietze · Aug 2, 2024 · Jun 27, 2024 · Jun 29, 2024 · Jun 29, 2024
diff --git a/modules/assim.sequential/R/downscale_function.R b/modules/assim.sequential/R/downscale_function.R
@@ -1,33 +1,90 @@
+##' @title Preprocess Data for Downscaling
+##' @name NA_preprocess
+##' @author Sambhav Dixit
+##'
+##' @param data_path Character. File path for .rds containing ensemble data.
+##' @param coords_path Character. File path for .csv file containing the site coordinates, with columns named "lon" and "lat".
+##' @param date Character. If SDA site run, format is yyyy/mm/dd; if NEON, yyyy-mm-dd. Restricted to years within the file supplied to 'data_path'.
+##' @param C_pool Character. Carbon pool of interest. Name must match the carbon pool name found within the file supplied to 'data_path'.
+##' @details This function ensures that the specified date and carbon pool are present in the input data. It also checks the validity of the site coordinates and aligns the number of rows between site coordinates and carbon data.
+##'
+##' @description This function reads and checks the input data, ensuring that the required date and carbon pool exist, and that the site coordinates are valid.
+##'
+##' @return A list containing The read .rds data , The cleaned site coordinates ,The extracted and possibly truncated carbon data.
+
+# Preprocess function to check and clean the data
+NA_preprocess <- function(data_path, coords_path, date, C_pool) {
+  # Read the input data and site coordinates
+  input_data <- readRDS(data_path)
+  site_coordinates <- read_csv(coords_path)
+
+  # Ensure the date exists in the input data
+  if (!date %in% names(input_data)) {
+    stop(paste("Date", date, "not found in the input data."))
+  }
+
+  # Extract the carbon data for the specified focus year
+  index <- which(names(input_data) == date)
+  data <- input_data[[index]]
+
+  # Ensure the carbon pool exists in the input data
+  if (!C_pool %in% names(data)) {
+    stop(paste("Carbon pool", C_pool, "not found in the input data."))
+  }
+
+  carbon_data <- as.data.frame(t(data[which(names(data) == C_pool)]))
+  names(carbon_data) <- paste0("ensemble", seq(ncol(carbon_data)))
+
+  # Ensure site coordinates have 'lon' and 'lat' columns
+  if (!all(c("lon", "lat") %in% names(site_coordinates))) {
+    stop("Site coordinates must contain 'lon' and 'lat' columns.")
+  }
+
+  # Ensure the number of rows in site coordinates matches the number of rows in carbon data
+  if (nrow(site_coordinates) != nrow(carbon_data)) {
+    message("Number of rows in site coordinates does not match the number of rows in carbon data.")
+    if (nrow(site_coordinates) > nrow(carbon_data)) {
+      message("Truncating site coordinates to match carbon data rows.")
+      site_coordinates <- site_coordinates[1:nrow(carbon_data), ]
+    } else {
+      message("Truncating carbon data to match site coordinates rows.")
+      carbon_data <- carbon_data[1:nrow(site_coordinates), ]
+    }
+  }
+
+  message("Preprocessing completed successfully.")
+  return(list(input_data = input_data, site_coordinates = site_coordinates, carbon_data = carbon_data))
+}
+
 ##' @title North America Downscale Function
 ##' @name NA_downscale
-##' @author Joshua Ploshay
+##' @author Joshua Ploshay , Sambhav Dixit
 ##'
-##' @param data  In quotes, file path for .rds containing ensemble data.
-##' @param coords In quotes, file path for .csv file containing the site coordinates, columns named "lon" and "lat".
+##' @param preprocessed , In quotes, prepocessed data returned as an output for passing the raw data to the NA_preprocess function.
 ##' @param date In quotes, if SDA site run, format is yyyy/mm/dd, if NEON, yyyy-mm-dd. Restricted to years within file supplied to 'data'.
 ##' @param C_pool In quotes, carbon pool of interest. Name must match carbon pool name found within file supplied to 'data'.
-##' @param covariates SpatRaster stack, used as predictors in randomForest. Layers within stack should be named. Recommended that this stack be generated using 'covariates' instructions in assim.sequential/inst folder
+##' @param covariates_path SpatRaster stack, used as predictors in randomForest. Layers within stack should be named. Recommended that this stack be generated using 'covariates' instructions in assim.sequential/inst folder
 ##' @details This function will downscale forecast data to unmodeled locations using covariates and site locations
 ##'
-##' @description This function uses the randomForest model.
+##' @description This function uses the Convolutional Neural Network(CNN) model.
 ##'
 ##' @return It returns the `downscale_output` list containing lists for the training and testing data sets, models, and predicted maps for each ensemble member.
 
 
-NA_downscale <- function(data, coords, date, C_pool, covariates){
+NA_downscale <- function(preprocessed, date, C_pool, covariates_path) {
 
-  # Read the input data and site coordinates
-  input_data <- readRDS(data)
-  site_coordinates <- terra::vect(readr::read_csv(coords), geom=c("lon", "lat"), crs="EPSG:4326")
+  input_data <- preprocessed$input_data
+  site_coordinates <- preprocessed$site_coordinates
+  carbon_data <- preprocessed$carbon_data
 
-  # Extract the carbon data for the specified focus year
-  index <- which(names(input_data) == date)
-  data <- input_data[[index]]
-  carbon_data <- as.data.frame(t(data[which(names(data) == C_pool)]))
-  names(carbon_data) <- paste0("ensemble",seq(1:ncol(carbon_data)))
+  # Convert site coordinates to SpatVector
+  site_coordinates <- terra::vect(site_coordinates, geom = c("lon", "lat"), crs = "EPSG:4326")
+
+  # Load the covariates raster stack
+  covariates <- rast(covariates_path)
 
   # Extract predictors from covariates raster using site coordinates
-  predictors <- as.data.frame(terra::extract(covariates, site_coordinates,ID = FALSE)) 
+  predictors <- as.data.frame(terra::extract(covariates, site_coordinates, ID = FALSE))
 
   # Combine each ensemble member with all predictors
   ensembles <- list()
@@ -37,52 +94,249 @@ NA_downscale <- function(data, coords, date, C_pool, covariates){
 
   # Rename the carbon_data column for each ensemble member
   for (i in 1:length(ensembles)) {
-    ensembles[[i]] <- dplyr::rename(ensembles[[i]], "carbon_data" = "carbon_data[[i]]")
+    colnames(ensembles[[i]])[1] <- "carbon_data"
   }
 
   # Split the observations in each data frame into two data frames based on the proportion of 3/4
   ensembles <- lapply(ensembles, function(df) {
-    sample <- sample(1:nrow(df), size = round(0.75*nrow(df)))
-    train  <- df[sample, ]
-    test   <- df[-sample, ]
-    split_list <- list(train, test)
+    sample <- sample(1:nrow(df), size = round(0.75 * nrow(df)))
+    train <- df[sample, ]
+    test <- df[-sample, ]
+    split_list <- list(training = train, testing = test)
     return(split_list)
   })
 
-  # Rename the training and testing data frames for each ensemble member
+  # Train a CNN model for each ensemble member using the training data
+  cnn_output <- list()
   for (i in 1:length(ensembles)) {
-    # names(ensembles) <- paste0("ensemble",seq(1:length(ensembles)))
-    names(ensembles[[i]]) <- c("training", "testing")
+    # Prepare data for CNN
+    x_train <- as.matrix(ensembles[[i]]$training[, c("tavg", "prec", "srad", "vapr")])
+    y_train <- as.matrix(ensembles[[i]]$training$carbon_data)
+    x_test <- as.matrix(ensembles[[i]]$testing[, c("tavg", "prec", "srad", "vapr")])
+    y_test <- as.matrix(ensembles[[i]]$testing$carbon_data)
+
+    # Normalize the data
+    x_train <- scale(x_train)
+    x_test <- scale(x_test)
+
+    # Reshape data for CNN input (samples, timesteps, features)
+    x_train <- array_reshape(x_train, c(nrow(x_train), 1, ncol(x_train)))
+    x_test <- array_reshape(x_test, c(nrow(x_test), 1, ncol(x_test)))
+
+    # Define the CNN model
+    model <- keras_model_sequential() %>%
+      layer_conv_1d(filters = 64, kernel_size = 1, activation = 'relu', input_shape = c(1, 4)) %>%
+      layer_flatten() %>%
+      layer_dense(units = 64, activation = 'relu') %>%
+      layer_dense(units = 1)
+
+    # Compile the model
+    model %>% compile(
+      loss = 'mean_squared_error',
+      optimizer = optimizer_adam(),
+      metrics = c('mean_absolute_error')
+    )
+
+    # Train the model
+    model %>% fit(
+      x = x_train,
+      y = y_train,
+      epochs = 100,
+      batch_size = 32,
+      validation_split = 0.2,
+      verbose = 0
+    )
+
+    cnn_output[[i]] <- model
   }
 
-  # Train a random forest model for each ensemble member using the training data
-  rf_output <- list()
-  for (i in 1:length(ensembles)) {
-    rf_output[[i]] <- randomForest::randomForest(ensembles[[i]][[1]][["carbon_data"]] ~ land_cover+tavg+prec+srad+vapr+nitrogen+phh2o+soc+sand,
-                                                 data = ensembles[[i]][[1]],
-                                                 ntree = 1000,
-                                                 na.action = stats::na.omit,
-                                                 keep.forest = T,
-                                                 importance = T)
+  # Wrapper function to apply the trained model
+  predict_with_model <- function(model, data) {
+    data <- as.matrix(data[, c("tavg", "prec", "srad", "vapr")])
+    data <- scale(data)
+    data <- array_reshape(data, c(nrow(data), 1, ncol(data)))
+    predictions <- predict(model, data)
+    return(predictions)
   }
 
   # Generate predictions (maps) for each ensemble member using the trained models
-  maps <- list(ncol(rf_output))
-  for (i in 1:length(rf_output)) {
-    maps[[i]] <- terra::predict(object = covariates,
-                                model = rf_output[[i]],na.rm = T)
+  maps <- list()
+  predictions <- list()
+  for (i in 1:length(cnn_output)) {
+    # Prepare data for prediction
+    x_pred <- as.matrix(predictors[, c("tavg", "prec", "srad", "vapr")])
+    x_pred <- scale(x_pred)
+    x_pred <- array_reshape(x_pred, c(nrow(x_pred), 1, ncol(x_pred)))
+
+    map_pred <- predict_with_model(cnn_output[[i]], as.data.frame(covariates[]))
+    map_pred <- rast(matrix(map_pred, nrow = nrow(covariates), ncol = ncol(covariates)), ext = ext(covariates), crs = crs(covariates))
+    maps[[i]] <- map_pred
+
+    # Generate predictions for testing data
+    predictions[[i]] <- predict_with_model(cnn_output[[i]], ensembles[[i]]$testing)
+  }
+
+  # Calculate performance metrics for each ensemble member
+  metrics <- list()
+  for (i in 1:length(predictions)) {
+    actual <- ensembles[[i]]$testing$carbon_data
+    predicted <- predictions[[i]]
+    mse <- mean((actual - predicted)^2)
+    mae <- mean(abs(actual - predicted))
+    r_squared <- 1 - sum((actual - predicted)^2) / sum((actual - mean(actual))^2)
+    metrics[[i]] <- list(MSE = mse, MAE = mae, R_squared = r_squared, actual = actual, predicted = predicted)
   }
 
   # Organize the results into a single output list
-  downscale_output <- list(ensembles, rf_output, maps)
+  downscale_output <- list(data = ensembles, models = cnn_output, maps = maps, metrics = metrics)
 
   # Rename each element of the output list with appropriate ensemble numbers
-  for (i in 1:length(downscale_output)) {
-    names(downscale_output[[i]]) <- paste0("ensemble",seq(1:length(downscale_output[[i]])))
+  for (i in 1:length(downscale_output$data)) {
+    names(downscale_output$data)[i] <- paste0("ensemble", i)
+    names(downscale_output$models)[i] <- paste0("ensemble", i)
+    names(downscale_output$maps)[i] <- paste0("ensemble", i)
+    names(downscale_output$metrics)[i] <- paste0("ensemble", i)
   }
 
-  # Rename the main components of the output list
-  names(downscale_output) <- c("data", "models", "maps")
-
   return(downscale_output)
 }
+
+# Define file paths for the data
+data_path <- " " # Replace with the actual path to your data file
+coords_path <- " "  # Replace with the actual path to your coordinates file
+covariates_path <- " " # Replace with the actual path to your .tiff file
+
+# Define parameters
+date <- " " # Replace with the actual date you want to use
+C_pool <- " " # Replace with the actual carbon pool name you want to use
+
+# Preprocess the data
+preprocessed_data <- preprocess(data_path, coords_path, date, C_pool)
+
+# Run the NA_downscale function
+result <- NA_downscale(preprocessed_data, date, C_pool, covariates_path)
+
+# Print the result
+print(result)
+
+# Print the accuracy metrics
+print("Accuracy Metrics for Each Ensemble:")
+for (i in seq_along(result$metrics)) {
+  cat(paste0("Ensemble ", i, ":"))
+  print(result$metrics[[i]])
+}
+
+# Function to create a Taylor diagram
+TaylorDiagram <- function(taylor_data, ...) {
+  # Convert data to the format required by the plot
+  taylor_data$color <- as.factor(taylor_data$Ensemble)
+
+  plot <- ggplot(taylor_data, aes(x = Modelled, y = Observed, color = color)) +
+    geom_point(size = 3) +
+    scale_color_manual(values = rainbow(length(unique(taylor_data$color)))) +
+    coord_fixed() +
+    theme_minimal() +
+    labs(title = "Taylor Diagram for Ensemble Members",
+         x = "Standard Deviation (normalised)",
+         y = "Standard Deviation (normalised)",
+         color = "Model") +
+    theme(legend.position = "right")
+
+  return(plot)
+}
+
+# Prepare data for Taylor diagram
+taylor_data <- data.frame()
+
+for (i in seq_along(result$metrics)) {
+  ensemble_name <- names(result$metrics)[i]
+  actual <- result$metrics[[i]]$actual
+  predicted <- result$metrics[[i]]$predicted
+
+  # Calculate required statistics
+  obs_sd <- sd(actual)
+  mod_sd <- sd(predicted)
+  correlation <- cor(actual, predicted)
+
+  # Normalize standard deviations
+  norm_obs_sd <- obs_sd / max(obs_sd, mod_sd)
+  norm_mod_sd <- mod_sd / max(obs_sd, mod_sd)
+
+  # Add to the data frame
+  taylor_data <- rbind(taylor_data, data.frame(
+    Ensemble = ensemble_name,
+    Observed = norm_obs_sd,
+    Modelled = norm_mod_sd,
+    r = correlation
+  ))
+}
+
+# Create and display Taylor diagram
+taylor_plot <- TaylorDiagram(taylor_data)
+print(taylor_plot)
+
+# Prepare metrics data for line plot with multiple y-axes
+metrics_df <- do.call(rbind, lapply(seq_along(result$metrics), function(i) {
+  data.frame(Ensemble = paste0("ensemble", i),
+             MSE = result$metrics[[i]]$MSE,
+             MAE = result$metrics[[i]]$MAE,
+             R_squared = result$metrics[[i]]$R_squared)
+}))
+
+# Reshape data for ggplot
+metrics_melted <- reshape2::melt(metrics_df, id.vars = "Ensemble")
+
+# Create a line plot with multiple y-axes using ggplot2
+p1 <- ggplot(metrics_melted[metrics_melted$variable %in% c("MSE", "MAE"), ], aes(x = Ensemble, y = value, color = variable, group = variable)) +
+  geom_line() +
+  geom_point() +
+  scale_y_continuous(name = "MSE and MAE") +
+  labs(title = "Performance Metrics for Each Ensemble",
+       x = "Ensemble",
+       color = "Metrics") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
+p2 <- ggplot(metrics_melted[metrics_melted$variable == "R_squared", ], aes(x = Ensemble, y = value, color = variable, group = variable)) +
+  geom_line() +
+  geom_point() +
+  scale_y_continuous(name = "R_squared") +
+  labs(x = "Ensemble",
+       color = "Metrics") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
+# Combine the plots
+grid.arrange(p1, p2, ncol = 1)
+
+# Scatter plot to compare actual and predicted values for each ensemble with one random instance
+set.seed(123)  # For reproducibility
+sampled_data <- do.call(rbind, lapply(seq_along(result$metrics), function(i) {
+  ensemble_name <- names(result$metrics)[i]
+  actual <- result$metrics[[i]]$actual
+  predicted <- result$metrics[[i]]$predicted
+
+  # Sample one random instance
+  sample_index <- sample(1:length(actual), 1)
+  actual_sample <- actual[sample_index]
+  predicted_sample <- predicted[sample_index]
+
+  data.frame(Ensemble = ensemble_name, Actual = actual_sample, Predicted = predicted_sample)
+}))
+
+# Create scatter plot with lines connecting actual and predicted values
+p3 <- ggplot(sampled_data, aes(x = Actual, y = Predicted, color = Ensemble)) +
+  geom_point(aes(x = Actual, y = Actual, shape = "Actual"), size = 3) + # Circle for actual values
+  geom_point(aes(x = Predicted, y = Predicted, shape = "Predicted"), size = 3) + # Square for predicted values
+  geom_segment(aes(x = Actual, y = Actual, xend = Actual, yend = Predicted), linetype = "dotted") + # Connect actual to predicted
+  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
+  geom_smooth(method = "lm", linetype = "solid", se = FALSE, color = "blue") + # Regression line
+  labs(title = "Actual vs. Predicted Scatter Plot for Random Samples",
+       x = "Actual",
+       y = "Predicted",
+       color = "Ensemble",
+       shape = "Type") +
+  scale_shape_manual(values = c("Actual" = 16, "Predicted" = 22)) + # Define shapes
+  theme_minimal() +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
+# Display scatter plot
+print(p3)