Updating to have definitions_id as an option as well as station_id

IDEMSInternational · Dec 7, 2024 · 9fc0376 · 9fc0376
1 parent af44d1e
commit 9fc0376
Show file tree

Hide file tree

Showing 13 changed files with 298 additions and 123 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -60,10 +60,8 @@ importFrom(dplyr,filter)
 importFrom(dplyr,full_join)
 importFrom(dplyr,mutate)
 importFrom(googleCloudStorageR,gcs_auth)
-importFrom(googleCloudStorageR,gcs_get_object)
 importFrom(googleCloudStorageR,gcs_list_objects)
 importFrom(googleCloudStorageR,gcs_upload)
-importFrom(jsonlite,fromJSON)
 importFrom(jsonlite,read_json)
 importFrom(magrittr,"%>%")
 importFrom(purrr,list_rbind)

diff --git a/R/build_crop_definitions.R b/R/build_crop_definitions.R
@@ -24,12 +24,17 @@ build_crop_definitions <- function(definition_file = NULL){
     planting_dates <- split_list(values$Var2)
     planting_length <- split_list(values$Var3)
   }
+
   # Loop through variables and add to the list if defined
   for (variable in variables_list) {
-    if (exists(variable) && !is.na(get(variable))) {
-      data_list[["crops_success"]][[variable]] <- get(variable)
+    if (!is.null(definition_file)){
+      if (exists(variable) && !is.na(get(variable))) {
+        data_list[["crops_success"]][[variable]] <- get(variable)
+      } else {
+        data_list[["crops_success"]][[variable]] <- NA
+      }
     } else {
-      data_list[["crops_success"]][[variable]] <- NA
+        data_list[["crops_success"]][[variable]] <- NA
     }
   }
   return(data_list)

diff --git a/R/get_definitions_data.R b/R/get_definitions_data.R
@@ -1,83 +1,105 @@
-#' Get Daily Definitions Data
+#' Get Definitions Data
 #'
-#' This function retrieves definitions data for weather stations from a Google Cloud Storage (GCS) bucket. It includes timestamp handling to ensure that the most recent definitions file is imported.
+#' This function retrieves definitions data for weather stations or specific definitions IDs from a Google Cloud Storage (GCS) bucket.
+#' It includes logic to handle station-based retrieval or fetch data directly using a specified `definition_id`. It also handles 
+#' timestamp management to ensure the most recent definitions file is imported.
 #'
-#' @param country A character vector specifying the country or countries from which to get the definitions data. Options are any defined in `get_bucket_name()`. Common options are `"mz"` and `"zm"`.
-#' @param station_id A character string specifying the ID of the station for which to get the definitions data.
-#' @param definitions_id A character string specifying the ID of the definitions for which to get the definitions data. If `NULL` this is found from the metadata.
-#' @param file Default `NULL` meaning that the most recent definitions file will be found and imported. Otherwise specify as a string the file to import. In format: "STATIONNAME.TIMESTAMP" e.g. "1.20240311152831"
+#' @param country A character vector of length 1 specifying the country from which to get the definitions data.
+#'                Options depend on the implementation of `get_bucket_name()`, with common options being `"mz"` (Mozambique)
+#'                and `"zm"` (Zambia).
+#' @param station_id A character vector specifying the ID(s) of the station(s) for which to get the definitions data. 
+#'                   If `NULL`, data is fetched using the `definition_id`. Defaults to `NULL`.
+#' @param definition_id A character string specifying the ID of the definitions to retrieve. If `NULL` and `station_id` is provided,
+#'                       the most recent definitions ID is determined from metadata. Defaults to `NULL`.
+#' @param file A character string specifying the name of a specific file to import, in the format `"STATIONNAME.TIMESTAMP"`. 
+#'             If `NULL`, the most recent definitions file is fetched automatically. Defaults to `NULL`.
 #'
-#' @return A data frame containing daily data based on the station ID.
+#' @return A data frame or list containing the definitions data:
+#' - If `station_id` is provided, returns data specific to the station(s).
+#' - If `station_id` is `NULL`, returns data specific to the `definition_id`.
 #'
-#' @importFrom googleCloudStorageR gcs_list_objects gcs_get_object
-#' @importFrom jsonlite fromJSON
-#'
-#' @seealso
-#' \code{update_definitions_data} for updating definitions files.
+#' @details
+#' - When `station_id` is provided, the function fetches the corresponding definitions data for each station.
+#' - If `station_id` is `NULL`, the function directly retrieves data based on the provided `definition_id`.
+#' - The function uses Google Cloud Storage to retrieve the files, ensuring that the most recent versions are accessed when `file` is `NULL`.
+#' - For multiple stations, the function returns a combined data frame.
 #'
 #' @export
-#'
-#' @examples # todo
-get_definitions_data <- function(country, station_id, definitions_id = NULL, file = NULL) {
+get_definitions_data <- function(country, station_id = NULL, definition_id = NULL, file = NULL) {
   if (length(country) > 1) stop("'country' must be of length 1")
-  station_id <- as.character(station_id)
-  dfs <- vector("list", length(station_id))
-  names(dfs) <- station_id
-
-  # for (i in seq_along(station_id)) {
-  #   f <- paste0(country, "/", "definitions", "/", station_id[i], ".json")
-  #   if (file.exists(f)) {
-  #     dfs[[i]] <- jsonlite::read_json(f)
-  #   } else {
-  #     f <- update_definitions_data(country, station_id[i])
-  #     dfs[[i]] <- f #jsonlite::write_json(f)
-  #   }
-  # }
-  # TODO: set up so if (is.null(definitions_id)) runs station_data, otherwise we cll it through from params
-  station_data <- station_metadata(country = country, station_id = station_id)
-  definitions_id_list <- lapply(station_data$definitions_id, function(x) x[length(x)])
-  names(definitions_id_list) <- station_id
-
-  if (is.null(file)){
+  if (is.null(station_id)){
+    # return the data for that definitions ID
     bucket_name <- get_bucket_name(country)
-    for (i in seq_along(station_id)) {
-      if (is.null(definitions_id)) definitions_id <- definitions_id_list[[i]]
-      # List all files in the "definitions" directory for the station
-      files <- googleCloudStorageR::gcs_list_objects(bucket = bucket_name,
-                                                     prefix = paste0("definitions/", definitions_id, "."),
-                                                     versions = TRUE)
-
-      if (nrow(files) == 0) { stop("No files found. Check country and station_id")}
-      # Filter files with the ".json" extension
-      files <- files %>% dplyr::filter(grepl("\\.json$", name))
-      json_files <- files$name
-
-      # Check if multiple json files found. If so, take hte most recent one.
-      if (length(json_files) >= 1){
-        # Extract timestamps from file names
-        definitions_id[i] <- extract_most_recent_json(json_files)
-      }
-      f <- paste0("definitions/", definitions_id[i], ".json")
-      if (file.exists(f)) {
-        dfs[[i]] <- jsonlite::read_json(f)
-      } else {
-        f <- update_definitions_data(country, definitions_id[i])
-        dfs[[i]] <- f #jsonlite::write_json(f)
-      }
-    }
-    if (length(station_id) > 1) {
-      station_data <- dplyr::bind_rows(dfs)
+
+    files <- googleCloudStorageR::gcs_list_objects(bucket = bucket_name,
+                                                   prefix = paste0("definitions/", definition_id, "."),
+                                                   versions = TRUE)
+
+    if (nrow(files) == 0) { stop("No files found. Check country and station_id")}
+    # Filter files with the ".json" extension
+    files <- files %>% dplyr::filter(grepl("\\.json$", name))
+    json_files <- files$name
+
+    # Check if multiple json files found. If so, take hte most recent one.
+    if (length(json_files) >= 1) definition_id <- extract_most_recent_json(json_files)
+    f <- paste0("definitions/", definition_id, ".json")
+    if (file.exists(f)) {
+      definitions_data <- jsonlite::read_json(f)
     } else {
-      station_data <- dfs[[1]]
+      f <- update_definitions_data(country, definition_id)
+      definitions_data <- f #jsonlite::write_json(f)
     }
+    return(definitions_data)
   } else {
-    f <- paste0("definitions/", file, ".json")
-    if (file.exists(f)) {
-      station_data <- jsonlite::read_json(f)
+    station_id <- as.character(station_id)
+    dfs <- vector("list", length(station_id))
+    names(dfs) <- station_id
+
+    station_data <- station_metadata(country = country, station_id = station_id)
+    definition_id_list <- lapply(station_data$definition_id, function(x) x[length(x)])
+    names(definition_id_list) <- station_id
+
+    if (is.null(file)){
+      bucket_name <- get_bucket_name(country)
+      for (i in seq_along(station_id)) {
+        if (is.null(definition_id)) definition_id <- definition_id_list[[i]]
+        # List all files in the "definitions" directory for the station
+        files <- googleCloudStorageR::gcs_list_objects(bucket = bucket_name,
+                                                       prefix = paste0("definitions/", definition_id, "."),
+                                                       versions = TRUE)
+
+        if (nrow(files) == 0) { stop("No files found. Check country and station_id")}
+        # Filter files with the ".json" extension
+        files <- files %>% dplyr::filter(grepl("\\.json$", name))
+        json_files <- files$name
+
+        # Check if multiple json files found. If so, take hte most recent one.
+        if (length(json_files) >= 1){
+          # Extract timestamps from file names
+          definition_id[i] <- extract_most_recent_json(json_files)
+        }
+        f <- paste0("definitions/", definition_id[i], ".json")
+        if (file.exists(f)) {
+          dfs[[i]] <- jsonlite::read_json(f)
+        } else {
+          f <- update_definitions_data(country, definition_id[i])
+          dfs[[i]] <- f #jsonlite::write_json(f)
+        }
+      }
+      if (length(station_id) > 1) {
+        station_data <- dplyr::bind_rows(dfs)
+      } else {
+        station_data <- dfs[[1]]
+      }
     } else {
-      f <- update_definitions_data(country, file)
-      station_data <- f
+      f <- paste0("definitions/", file, ".json")
+      if (file.exists(f)) {
+        station_data <- jsonlite::read_json(f)
+      } else {
+        f <- update_definitions_data(country, file)
+        station_data <- f
+      }
     }
+    return(station_data)
   }
-  return(station_data)
 }
diff --git a/R/update_crop_success_probabilities_from_definition.R b/R/update_crop_success_probabilities_from_definition.R
@@ -0,0 +1,74 @@
+update_crop_success_probabilities_from_definition <- function(country, station_id = NULL, definitions_id = NULL, daily_data, summary_rains_data = NULL) {
+  if (!is.null(station_id) & !is.null(definition_id)) warning("Both station_id and definition_id are given. Defaulting to station_id.")
+  # Retrieve the most recent definition data for the specified country and station
+  if (!is.null(station_id)){
+    definitions_data <- get_definitions_data(country = country, station_id = station_id)
+  } else {
+    definitions_data <- get_definitions_data(country = country, definition_id = definition_id)
+  }
+
+  # If start-of-rains data is not provided, compute it using daily rainfall data
+  if (is.null(summary_rains_data)) {
+    summaries <- "start_rains"
+    if (!is.null(definitions_data$end_rains$start_day)){
+      summaries <- c(summaries, "end_rains")
+      end_doy <- "end_rains_doy"
+    }
+    if (!is.null(definitions_data$end_season$start_day)){
+      summaries <- c(summaries, "end_season")
+      end_doy <- "end_season_doy"
+    }   
+
+    summary_rains_data <- update_rainfall_summaries_from_definition(
+      country = country, 
+      station_id = station_id, 
+      daily_data = daily_data, 
+      summaries = summaries
+    )
+
+    # for end_rains - do we do end_rains by default then otherwise do end_season?
+    # what if we have both end reains and end season?
+  }
+
+  # Extract the column names for the start-of-rains data
+  data_names <- data_definitions(names(daily_data), FALSE, FALSE)
+
+  # Retrieve the specified days for calculating season start probabilities
+  water_requirements <- as.integer(definitions_data$crops_success$water_requirements)
+  planting_dates <- as.integer(definitions_data$crops_success$planting_dates)
+  planting_length <- as.integer(definitions_data$crops_success$planting_length)
+  summary_data <- NULL
+  if (length(water_requirements) == 0){
+    warning("No specified days given for water requirements. No updates required.")
+    return(summary_data)
+  }
+  if (length(planting_dates) == 0){
+    warning("No specified days given for planting dates No updates required.")
+    return(summary_data)
+  }
+  if (length(planting_length) == 0){
+    warning("No specified days given for planting length No updates required.")
+    return(summary_data)
+  }
+
+  start_before_season <- definitions_data$crops_success$start_check
+  if (is.null(start_before_season)) start_before_season <- FALSE
+
+  # Calculate season start probabilities
+  daily_data[[data_names$year]] <- as.factor(as.character(daily_data[[data_names$year]]))
+  summary_rains_data[[data_names$year]] <- as.factor(as.character(summary_rains_data[[data_names$year]]))
+  summary_data <- rpicsa::crops_definitions(data = daily_data,
+                                            date_time  = data_names$date,
+                                            station = data_names$station,
+                                            year = data_names$year,
+                                            rain = data_names$rain,
+                                            water_requirements = water_requirements,
+                                            planting_dates = planting_dates,
+                                            planting_length = planting_length,
+                                            start_check = start_before_season,
+                                            season_data = summary_rains_data,
+                                            start_day = "start_rains_doy",
+                                            end_day = end_doy)
+
+  return(summary_data)
+}
diff --git a/R/update_season_start_probabilities_from_definition.R b/R/update_season_start_probabilities_from_definition.R
@@ -4,7 +4,8 @@
 #' for a given country and station. It uses rainfall data or start-of-rains summaries to compute the probabilities.
 #'
 #' @param country Character. The name of the country for which definitions and observations are retrieved.
-#' @param station_id Character. The station ID for which data is retrieved. Defaults to "Lundazi Met".
+#' @param station_id Character. The station ID(s) for which data is retrieved. Can be `NULL` if `definition_id` is specified. Defaults to `NULL`.
+#' @param definition_id Character. The ID of the definitions to use for generating summaries. Only used if `station_id` is `NULL`. Defaults to `NULL`.
 #' @param daily_data Data frame. (Optional) Daily rainfall data, required if `start_rains_data` is not provided. Defaults to `NULL`.
 #' @param start_rains_data Data frame. (Optional) Precomputed start-of-rains data. If not provided, it is generated using `daily_data`. Defaults to `NULL`.
 #'
@@ -16,9 +17,14 @@
 #' - If `start_rains_data` is not provided, it computes the start-of-rains summaries using `daily_data`.
 #' - The function calculates probabilities for the specified days using the `rpicsa::probability_season_start` function.
 #'
-update_season_start_probabilities_from_definition <- function(country, station_id, daily_data = NULL, start_rains_data = NULL) {
+update_season_start_probabilities_from_definition <- function(country, station_id = NULL, definition_id = NULL, daily_data = NULL, start_rains_data = NULL) {
+  if (!is.null(station_id) & !is.null(definition_id)) warning("Both station_id and definition_id are given. Defaulting to station_id.")
   # Retrieve the most recent definition data for the specified country and station
-  definitions_data <- epicsawrap::get_definitions_data(country = country, station_id = station_id)
+  if (!is.null(station_id)){
+    definitions_data <- get_definitions_data(country = country, station_id = station_id)
+  } else {
+    definitions_data <- get_definitions_data(country = country, definition_id = definition_id)
+  }
 
   # If start-of-rains data is not provided, compute it using daily rainfall data
   if (is.null(start_rains_data)) {

diff --git a/R/update_summaries_from_definition.R b/R/update_summaries_from_definition.R
@@ -1,23 +1,40 @@
 #' Update Rainfall Summaries from Definitions
 #'
-#' This function generates summary data based on the most recent definitions and observed rainfall data
-#' for a specified country and weather station. It calculates various metrics, such as start and end
-#' of rains, seasonal length, seasonal rainfall, and annual rainfall, depending on the definitions provided.
+#' This function generates rainfall summary data based on the most recent definitions and observed rainfall data
+#' for a specified country. Summaries include metrics such as start and end of rains, seasonal length, seasonal rainfall, 
+#' and annual rainfall, depending on the definitions and specified summary types.
 #'
-#' @param country Character. The name of the country for which definitions and observations are retrieved. Defaults to "zm_workshops".
-#' @param station_id Character. The station ID for which data is retrieved. Defaults to "Lundazi Met".
-#' @param daily_data Data frame. The daily data to update.
-#' @param summaries `character` The names of the summaries to produce.
+#' @param country Character. The name of the country for which definitions and observations are retrieved. Defaults to `"zm_workshops"`.
+#' @param station_id Character. The station ID(s) for which data is retrieved. Can be `NULL` if `definition_id` is specified. Defaults to `NULL`.
+#' @param definition_id Character. The ID of the definitions to use for generating summaries. Only used if `station_id` is `NULL`. Defaults to `NULL`.
+#' @param daily_data Data frame. The daily rainfall data used to generate summaries.
+#' @param summaries Character vector. The names of the summaries to produce. Options include:
+#'   - `"annual_rain"`: Calculates total annual rainfall.
+#'   - `"start_rains"`: Identifies the start of the rainy season.
+#'   - `"end_rains"`: Identifies the end of the rainy season.
+#'   - `"end_season"`: Identifies the end of the season.
+#'   - `"seasonal_rain"`: Calculates total seasonal rainfall.
+#'   - `"seasonal_length"`: Calculates the length of the rainy season.
+#'   Defaults to all available summaries.
 #' 
-#' @return A data frame containing summarized rainfall data for the specified station and definitions.
+#' @return A data frame containing the requested rainfall summaries for the specified station(s) or definitions.
+#'
+#' @details
+#' - If `station_id` is provided, the function retrieves the corresponding definitions data for the station.
+#' - If `station_id` is `NULL`, the function fetches definitions data directly using the `definition_id`.
+#' - The function only calculates the summaries specified in the `summaries` argument.
+#' - Summary calculations depend on definitions being available for the requested metrics.
+#'
 #' @export
-#' @examples
-#' #update_rainfall_summaries_from_definition(country = "zm_workshops", station_id = "Lundazi Met")
-
-update_rainfall_summaries_from_definition <- function(country = "zm_workshops", station_id = "Lundazi Met", daily_data,
+update_rainfall_summaries_from_definition <- function(country = "zm_workshops", station_id = NULL, definition_id = NULL, daily_data,
                                                       summaries = c("annual_rain", "start_rains", "end_rains", "end_season", "seasonal_rain", "seasonal_length")) {
+  if (!is.null(station_id) & !is.null(definition_id)) warning("Both station_id and definition_id are given. Defaulting to station_id.")
   # Retrieve the most recent definition data for the specified country and station
-  definitions_data <- epicsawrap::get_definitions_data(country = country, station_id = station_id)
+  if (!is.null(station_id)){
+    definitions_data <- get_definitions_data(country = country, station_id = station_id)
+  } else {
+    definitions_data <- get_definitions_data(country = country, definition_id = definition_id)
+  }
 
   # Initialize variables for storing summary data and summaries
   summary_data <- NULL