Skip to content

Commit

Permalink
Updating to have definitions_id as an option as well as station_id
Browse files Browse the repository at this point in the history
  • Loading branch information
lilyclements committed Dec 7, 2024
1 parent af44d1e commit 9fc0376
Show file tree
Hide file tree
Showing 13 changed files with 298 additions and 123 deletions.
2 changes: 0 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,8 @@ importFrom(dplyr,filter)
importFrom(dplyr,full_join)
importFrom(dplyr,mutate)
importFrom(googleCloudStorageR,gcs_auth)
importFrom(googleCloudStorageR,gcs_get_object)
importFrom(googleCloudStorageR,gcs_list_objects)
importFrom(googleCloudStorageR,gcs_upload)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,read_json)
importFrom(magrittr,"%>%")
importFrom(purrr,list_rbind)
Expand Down
11 changes: 8 additions & 3 deletions R/build_crop_definitions.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,17 @@ build_crop_definitions <- function(definition_file = NULL){
planting_dates <- split_list(values$Var2)
planting_length <- split_list(values$Var3)
}

# Loop through variables and add to the list if defined
for (variable in variables_list) {
if (exists(variable) && !is.na(get(variable))) {
data_list[["crops_success"]][[variable]] <- get(variable)
if (!is.null(definition_file)){
if (exists(variable) && !is.na(get(variable))) {
data_list[["crops_success"]][[variable]] <- get(variable)
} else {
data_list[["crops_success"]][[variable]] <- NA
}
} else {
data_list[["crops_success"]][[variable]] <- NA
data_list[["crops_success"]][[variable]] <- NA
}
}
return(data_list)
Expand Down
158 changes: 90 additions & 68 deletions R/get_definitions_data.R
Original file line number Diff line number Diff line change
@@ -1,83 +1,105 @@
#' Get Daily Definitions Data
#' Get Definitions Data
#'
#' This function retrieves definitions data for weather stations from a Google Cloud Storage (GCS) bucket. It includes timestamp handling to ensure that the most recent definitions file is imported.
#' This function retrieves definitions data for weather stations or specific definitions IDs from a Google Cloud Storage (GCS) bucket.
#' It includes logic to handle station-based retrieval or fetch data directly using a specified `definition_id`. It also handles
#' timestamp management to ensure the most recent definitions file is imported.
#'
#' @param country A character vector specifying the country or countries from which to get the definitions data. Options are any defined in `get_bucket_name()`. Common options are `"mz"` and `"zm"`.
#' @param station_id A character string specifying the ID of the station for which to get the definitions data.
#' @param definitions_id A character string specifying the ID of the definitions for which to get the definitions data. If `NULL` this is found from the metadata.
#' @param file Default `NULL` meaning that the most recent definitions file will be found and imported. Otherwise specify as a string the file to import. In format: "STATIONNAME.TIMESTAMP" e.g. "1.20240311152831"
#' @param country A character vector of length 1 specifying the country from which to get the definitions data.
#' Options depend on the implementation of `get_bucket_name()`, with common options being `"mz"` (Mozambique)
#' and `"zm"` (Zambia).
#' @param station_id A character vector specifying the ID(s) of the station(s) for which to get the definitions data.
#' If `NULL`, data is fetched using the `definition_id`. Defaults to `NULL`.
#' @param definition_id A character string specifying the ID of the definitions to retrieve. If `NULL` and `station_id` is provided,
#' the most recent definitions ID is determined from metadata. Defaults to `NULL`.
#' @param file A character string specifying the name of a specific file to import, in the format `"STATIONNAME.TIMESTAMP"`.
#' If `NULL`, the most recent definitions file is fetched automatically. Defaults to `NULL`.
#'
#' @return A data frame containing daily data based on the station ID.
#' @return A data frame or list containing the definitions data:
#' - If `station_id` is provided, returns data specific to the station(s).
#' - If `station_id` is `NULL`, returns data specific to the `definition_id`.
#'
#' @importFrom googleCloudStorageR gcs_list_objects gcs_get_object
#' @importFrom jsonlite fromJSON
#'
#' @seealso
#' \code{update_definitions_data} for updating definitions files.
#' @details
#' - When `station_id` is provided, the function fetches the corresponding definitions data for each station.
#' - If `station_id` is `NULL`, the function directly retrieves data based on the provided `definition_id`.
#' - The function uses Google Cloud Storage to retrieve the files, ensuring that the most recent versions are accessed when `file` is `NULL`.
#' - For multiple stations, the function returns a combined data frame.
#'
#' @export
#'
#' @examples # todo
get_definitions_data <- function(country, station_id, definitions_id = NULL, file = NULL) {
get_definitions_data <- function(country, station_id = NULL, definition_id = NULL, file = NULL) {
if (length(country) > 1) stop("'country' must be of length 1")
station_id <- as.character(station_id)
dfs <- vector("list", length(station_id))
names(dfs) <- station_id

# for (i in seq_along(station_id)) {
# f <- paste0(country, "/", "definitions", "/", station_id[i], ".json")
# if (file.exists(f)) {
# dfs[[i]] <- jsonlite::read_json(f)
# } else {
# f <- update_definitions_data(country, station_id[i])
# dfs[[i]] <- f #jsonlite::write_json(f)
# }
# }
# TODO: set up so if (is.null(definitions_id)) runs station_data, otherwise we cll it through from params
station_data <- station_metadata(country = country, station_id = station_id)
definitions_id_list <- lapply(station_data$definitions_id, function(x) x[length(x)])
names(definitions_id_list) <- station_id

if (is.null(file)){
if (is.null(station_id)){
# return the data for that definitions ID
bucket_name <- get_bucket_name(country)
for (i in seq_along(station_id)) {
if (is.null(definitions_id)) definitions_id <- definitions_id_list[[i]]
# List all files in the "definitions" directory for the station
files <- googleCloudStorageR::gcs_list_objects(bucket = bucket_name,
prefix = paste0("definitions/", definitions_id, "."),
versions = TRUE)

if (nrow(files) == 0) { stop("No files found. Check country and station_id")}
# Filter files with the ".json" extension
files <- files %>% dplyr::filter(grepl("\\.json$", name))
json_files <- files$name

# Check if multiple json files found. If so, take hte most recent one.
if (length(json_files) >= 1){
# Extract timestamps from file names
definitions_id[i] <- extract_most_recent_json(json_files)
}
f <- paste0("definitions/", definitions_id[i], ".json")
if (file.exists(f)) {
dfs[[i]] <- jsonlite::read_json(f)
} else {
f <- update_definitions_data(country, definitions_id[i])
dfs[[i]] <- f #jsonlite::write_json(f)
}
}
if (length(station_id) > 1) {
station_data <- dplyr::bind_rows(dfs)

files <- googleCloudStorageR::gcs_list_objects(bucket = bucket_name,
prefix = paste0("definitions/", definition_id, "."),
versions = TRUE)

if (nrow(files) == 0) { stop("No files found. Check country and station_id")}
# Filter files with the ".json" extension
files <- files %>% dplyr::filter(grepl("\\.json$", name))
json_files <- files$name

# Check if multiple json files found. If so, take hte most recent one.
if (length(json_files) >= 1) definition_id <- extract_most_recent_json(json_files)
f <- paste0("definitions/", definition_id, ".json")
if (file.exists(f)) {
definitions_data <- jsonlite::read_json(f)
} else {
station_data <- dfs[[1]]
f <- update_definitions_data(country, definition_id)
definitions_data <- f #jsonlite::write_json(f)
}
return(definitions_data)
} else {
f <- paste0("definitions/", file, ".json")
if (file.exists(f)) {
station_data <- jsonlite::read_json(f)
station_id <- as.character(station_id)
dfs <- vector("list", length(station_id))
names(dfs) <- station_id

station_data <- station_metadata(country = country, station_id = station_id)
definition_id_list <- lapply(station_data$definition_id, function(x) x[length(x)])
names(definition_id_list) <- station_id

if (is.null(file)){
bucket_name <- get_bucket_name(country)
for (i in seq_along(station_id)) {
if (is.null(definition_id)) definition_id <- definition_id_list[[i]]
# List all files in the "definitions" directory for the station
files <- googleCloudStorageR::gcs_list_objects(bucket = bucket_name,
prefix = paste0("definitions/", definition_id, "."),
versions = TRUE)

if (nrow(files) == 0) { stop("No files found. Check country and station_id")}
# Filter files with the ".json" extension
files <- files %>% dplyr::filter(grepl("\\.json$", name))
json_files <- files$name

# Check if multiple json files found. If so, take hte most recent one.
if (length(json_files) >= 1){
# Extract timestamps from file names
definition_id[i] <- extract_most_recent_json(json_files)
}
f <- paste0("definitions/", definition_id[i], ".json")
if (file.exists(f)) {
dfs[[i]] <- jsonlite::read_json(f)
} else {
f <- update_definitions_data(country, definition_id[i])
dfs[[i]] <- f #jsonlite::write_json(f)
}
}
if (length(station_id) > 1) {
station_data <- dplyr::bind_rows(dfs)
} else {
station_data <- dfs[[1]]
}
} else {
f <- update_definitions_data(country, file)
station_data <- f
f <- paste0("definitions/", file, ".json")
if (file.exists(f)) {
station_data <- jsonlite::read_json(f)
} else {
f <- update_definitions_data(country, file)
station_data <- f
}
}
return(station_data)
}
return(station_data)
}
74 changes: 74 additions & 0 deletions R/update_crop_success_probabilities_from_definition.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
update_crop_success_probabilities_from_definition <- function(country, station_id = NULL, definitions_id = NULL, daily_data, summary_rains_data = NULL) {
if (!is.null(station_id) & !is.null(definition_id)) warning("Both station_id and definition_id are given. Defaulting to station_id.")
# Retrieve the most recent definition data for the specified country and station
if (!is.null(station_id)){
definitions_data <- get_definitions_data(country = country, station_id = station_id)
} else {
definitions_data <- get_definitions_data(country = country, definition_id = definition_id)
}

# If start-of-rains data is not provided, compute it using daily rainfall data
if (is.null(summary_rains_data)) {
summaries <- "start_rains"
if (!is.null(definitions_data$end_rains$start_day)){
summaries <- c(summaries, "end_rains")
end_doy <- "end_rains_doy"
}
if (!is.null(definitions_data$end_season$start_day)){
summaries <- c(summaries, "end_season")
end_doy <- "end_season_doy"
}

summary_rains_data <- update_rainfall_summaries_from_definition(
country = country,
station_id = station_id,
daily_data = daily_data,
summaries = summaries
)

# for end_rains - do we do end_rains by default then otherwise do end_season?
# what if we have both end reains and end season?
}

# Extract the column names for the start-of-rains data
data_names <- data_definitions(names(daily_data), FALSE, FALSE)

# Retrieve the specified days for calculating season start probabilities
water_requirements <- as.integer(definitions_data$crops_success$water_requirements)
planting_dates <- as.integer(definitions_data$crops_success$planting_dates)
planting_length <- as.integer(definitions_data$crops_success$planting_length)
summary_data <- NULL
if (length(water_requirements) == 0){
warning("No specified days given for water requirements. No updates required.")
return(summary_data)
}
if (length(planting_dates) == 0){
warning("No specified days given for planting dates No updates required.")
return(summary_data)
}
if (length(planting_length) == 0){
warning("No specified days given for planting length No updates required.")
return(summary_data)
}

start_before_season <- definitions_data$crops_success$start_check
if (is.null(start_before_season)) start_before_season <- FALSE

# Calculate season start probabilities
daily_data[[data_names$year]] <- as.factor(as.character(daily_data[[data_names$year]]))
summary_rains_data[[data_names$year]] <- as.factor(as.character(summary_rains_data[[data_names$year]]))
summary_data <- rpicsa::crops_definitions(data = daily_data,
date_time = data_names$date,
station = data_names$station,
year = data_names$year,
rain = data_names$rain,
water_requirements = water_requirements,
planting_dates = planting_dates,
planting_length = planting_length,
start_check = start_before_season,
season_data = summary_rains_data,
start_day = "start_rains_doy",
end_day = end_doy)

return(summary_data)
}
12 changes: 9 additions & 3 deletions R/update_season_start_probabilities_from_definition.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
#' for a given country and station. It uses rainfall data or start-of-rains summaries to compute the probabilities.
#'
#' @param country Character. The name of the country for which definitions and observations are retrieved.
#' @param station_id Character. The station ID for which data is retrieved. Defaults to "Lundazi Met".
#' @param station_id Character. The station ID(s) for which data is retrieved. Can be `NULL` if `definition_id` is specified. Defaults to `NULL`.
#' @param definition_id Character. The ID of the definitions to use for generating summaries. Only used if `station_id` is `NULL`. Defaults to `NULL`.
#' @param daily_data Data frame. (Optional) Daily rainfall data, required if `start_rains_data` is not provided. Defaults to `NULL`.
#' @param start_rains_data Data frame. (Optional) Precomputed start-of-rains data. If not provided, it is generated using `daily_data`. Defaults to `NULL`.
#'
Expand All @@ -16,9 +17,14 @@
#' - If `start_rains_data` is not provided, it computes the start-of-rains summaries using `daily_data`.
#' - The function calculates probabilities for the specified days using the `rpicsa::probability_season_start` function.
#'
update_season_start_probabilities_from_definition <- function(country, station_id, daily_data = NULL, start_rains_data = NULL) {
update_season_start_probabilities_from_definition <- function(country, station_id = NULL, definition_id = NULL, daily_data = NULL, start_rains_data = NULL) {
if (!is.null(station_id) & !is.null(definition_id)) warning("Both station_id and definition_id are given. Defaulting to station_id.")
# Retrieve the most recent definition data for the specified country and station
definitions_data <- epicsawrap::get_definitions_data(country = country, station_id = station_id)
if (!is.null(station_id)){
definitions_data <- get_definitions_data(country = country, station_id = station_id)
} else {
definitions_data <- get_definitions_data(country = country, definition_id = definition_id)
}

# If start-of-rains data is not provided, compute it using daily rainfall data
if (is.null(start_rains_data)) {
Expand Down
43 changes: 30 additions & 13 deletions R/update_summaries_from_definition.R
Original file line number Diff line number Diff line change
@@ -1,23 +1,40 @@
#' Update Rainfall Summaries from Definitions
#'
#' This function generates summary data based on the most recent definitions and observed rainfall data
#' for a specified country and weather station. It calculates various metrics, such as start and end
#' of rains, seasonal length, seasonal rainfall, and annual rainfall, depending on the definitions provided.
#' This function generates rainfall summary data based on the most recent definitions and observed rainfall data
#' for a specified country. Summaries include metrics such as start and end of rains, seasonal length, seasonal rainfall,
#' and annual rainfall, depending on the definitions and specified summary types.
#'
#' @param country Character. The name of the country for which definitions and observations are retrieved. Defaults to "zm_workshops".
#' @param station_id Character. The station ID for which data is retrieved. Defaults to "Lundazi Met".
#' @param daily_data Data frame. The daily data to update.
#' @param summaries `character` The names of the summaries to produce.
#' @param country Character. The name of the country for which definitions and observations are retrieved. Defaults to `"zm_workshops"`.
#' @param station_id Character. The station ID(s) for which data is retrieved. Can be `NULL` if `definition_id` is specified. Defaults to `NULL`.
#' @param definition_id Character. The ID of the definitions to use for generating summaries. Only used if `station_id` is `NULL`. Defaults to `NULL`.
#' @param daily_data Data frame. The daily rainfall data used to generate summaries.
#' @param summaries Character vector. The names of the summaries to produce. Options include:
#' - `"annual_rain"`: Calculates total annual rainfall.
#' - `"start_rains"`: Identifies the start of the rainy season.
#' - `"end_rains"`: Identifies the end of the rainy season.
#' - `"end_season"`: Identifies the end of the season.
#' - `"seasonal_rain"`: Calculates total seasonal rainfall.
#' - `"seasonal_length"`: Calculates the length of the rainy season.
#' Defaults to all available summaries.
#'
#' @return A data frame containing summarized rainfall data for the specified station and definitions.
#' @return A data frame containing the requested rainfall summaries for the specified station(s) or definitions.
#'
#' @details
#' - If `station_id` is provided, the function retrieves the corresponding definitions data for the station.
#' - If `station_id` is `NULL`, the function fetches definitions data directly using the `definition_id`.
#' - The function only calculates the summaries specified in the `summaries` argument.
#' - Summary calculations depend on definitions being available for the requested metrics.
#'
#' @export
#' @examples
#' #update_rainfall_summaries_from_definition(country = "zm_workshops", station_id = "Lundazi Met")

update_rainfall_summaries_from_definition <- function(country = "zm_workshops", station_id = "Lundazi Met", daily_data,
update_rainfall_summaries_from_definition <- function(country = "zm_workshops", station_id = NULL, definition_id = NULL, daily_data,
summaries = c("annual_rain", "start_rains", "end_rains", "end_season", "seasonal_rain", "seasonal_length")) {
if (!is.null(station_id) & !is.null(definition_id)) warning("Both station_id and definition_id are given. Defaulting to station_id.")
# Retrieve the most recent definition data for the specified country and station
definitions_data <- epicsawrap::get_definitions_data(country = country, station_id = station_id)
if (!is.null(station_id)){
definitions_data <- get_definitions_data(country = country, station_id = station_id)
} else {
definitions_data <- get_definitions_data(country = country, definition_id = definition_id)
}

# Initialize variables for storing summary data and summaries
summary_data <- NULL
Expand Down
Loading

0 comments on commit 9fc0376

Please sign in to comment.