-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Updating to have definitions_id as an option as well as station_id
- Loading branch information
1 parent
af44d1e
commit 9fc0376
Showing
13 changed files
with
298 additions
and
123 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,83 +1,105 @@ | ||
#' Get Daily Definitions Data | ||
#' Get Definitions Data | ||
#' | ||
#' This function retrieves definitions data for weather stations from a Google Cloud Storage (GCS) bucket. It includes timestamp handling to ensure that the most recent definitions file is imported. | ||
#' This function retrieves definitions data for weather stations or specific definitions IDs from a Google Cloud Storage (GCS) bucket. | ||
#' It includes logic to handle station-based retrieval or fetch data directly using a specified `definition_id`. It also handles | ||
#' timestamp management to ensure the most recent definitions file is imported. | ||
#' | ||
#' @param country A character vector specifying the country or countries from which to get the definitions data. Options are any defined in `get_bucket_name()`. Common options are `"mz"` and `"zm"`. | ||
#' @param station_id A character string specifying the ID of the station for which to get the definitions data. | ||
#' @param definitions_id A character string specifying the ID of the definitions for which to get the definitions data. If `NULL` this is found from the metadata. | ||
#' @param file Default `NULL` meaning that the most recent definitions file will be found and imported. Otherwise specify as a string the file to import. In format: "STATIONNAME.TIMESTAMP" e.g. "1.20240311152831" | ||
#' @param country A character vector of length 1 specifying the country from which to get the definitions data. | ||
#' Options depend on the implementation of `get_bucket_name()`, with common options being `"mz"` (Mozambique) | ||
#' and `"zm"` (Zambia). | ||
#' @param station_id A character vector specifying the ID(s) of the station(s) for which to get the definitions data. | ||
#' If `NULL`, data is fetched using the `definition_id`. Defaults to `NULL`. | ||
#' @param definition_id A character string specifying the ID of the definitions to retrieve. If `NULL` and `station_id` is provided, | ||
#' the most recent definitions ID is determined from metadata. Defaults to `NULL`. | ||
#' @param file A character string specifying the name of a specific file to import, in the format `"STATIONNAME.TIMESTAMP"`. | ||
#' If `NULL`, the most recent definitions file is fetched automatically. Defaults to `NULL`. | ||
#' | ||
#' @return A data frame containing daily data based on the station ID. | ||
#' @return A data frame or list containing the definitions data: | ||
#' - If `station_id` is provided, returns data specific to the station(s). | ||
#' - If `station_id` is `NULL`, returns data specific to the `definition_id`. | ||
#' | ||
#' @importFrom googleCloudStorageR gcs_list_objects gcs_get_object | ||
#' @importFrom jsonlite fromJSON | ||
#' | ||
#' @seealso | ||
#' \code{update_definitions_data} for updating definitions files. | ||
#' @details | ||
#' - When `station_id` is provided, the function fetches the corresponding definitions data for each station. | ||
#' - If `station_id` is `NULL`, the function directly retrieves data based on the provided `definition_id`. | ||
#' - The function uses Google Cloud Storage to retrieve the files, ensuring that the most recent versions are accessed when `file` is `NULL`. | ||
#' - For multiple stations, the function returns a combined data frame. | ||
#' | ||
#' @export | ||
#' | ||
#' @examples # todo | ||
get_definitions_data <- function(country, station_id, definitions_id = NULL, file = NULL) { | ||
get_definitions_data <- function(country, station_id = NULL, definition_id = NULL, file = NULL) { | ||
if (length(country) > 1) stop("'country' must be of length 1") | ||
station_id <- as.character(station_id) | ||
dfs <- vector("list", length(station_id)) | ||
names(dfs) <- station_id | ||
|
||
# for (i in seq_along(station_id)) { | ||
# f <- paste0(country, "/", "definitions", "/", station_id[i], ".json") | ||
# if (file.exists(f)) { | ||
# dfs[[i]] <- jsonlite::read_json(f) | ||
# } else { | ||
# f <- update_definitions_data(country, station_id[i]) | ||
# dfs[[i]] <- f #jsonlite::write_json(f) | ||
# } | ||
# } | ||
# TODO: set up so if (is.null(definitions_id)) runs station_data, otherwise we cll it through from params | ||
station_data <- station_metadata(country = country, station_id = station_id) | ||
definitions_id_list <- lapply(station_data$definitions_id, function(x) x[length(x)]) | ||
names(definitions_id_list) <- station_id | ||
|
||
if (is.null(file)){ | ||
if (is.null(station_id)){ | ||
# return the data for that definitions ID | ||
bucket_name <- get_bucket_name(country) | ||
for (i in seq_along(station_id)) { | ||
if (is.null(definitions_id)) definitions_id <- definitions_id_list[[i]] | ||
# List all files in the "definitions" directory for the station | ||
files <- googleCloudStorageR::gcs_list_objects(bucket = bucket_name, | ||
prefix = paste0("definitions/", definitions_id, "."), | ||
versions = TRUE) | ||
|
||
if (nrow(files) == 0) { stop("No files found. Check country and station_id")} | ||
# Filter files with the ".json" extension | ||
files <- files %>% dplyr::filter(grepl("\\.json$", name)) | ||
json_files <- files$name | ||
|
||
# Check if multiple json files found. If so, take hte most recent one. | ||
if (length(json_files) >= 1){ | ||
# Extract timestamps from file names | ||
definitions_id[i] <- extract_most_recent_json(json_files) | ||
} | ||
f <- paste0("definitions/", definitions_id[i], ".json") | ||
if (file.exists(f)) { | ||
dfs[[i]] <- jsonlite::read_json(f) | ||
} else { | ||
f <- update_definitions_data(country, definitions_id[i]) | ||
dfs[[i]] <- f #jsonlite::write_json(f) | ||
} | ||
} | ||
if (length(station_id) > 1) { | ||
station_data <- dplyr::bind_rows(dfs) | ||
|
||
files <- googleCloudStorageR::gcs_list_objects(bucket = bucket_name, | ||
prefix = paste0("definitions/", definition_id, "."), | ||
versions = TRUE) | ||
|
||
if (nrow(files) == 0) { stop("No files found. Check country and station_id")} | ||
# Filter files with the ".json" extension | ||
files <- files %>% dplyr::filter(grepl("\\.json$", name)) | ||
json_files <- files$name | ||
|
||
# Check if multiple json files found. If so, take hte most recent one. | ||
if (length(json_files) >= 1) definition_id <- extract_most_recent_json(json_files) | ||
f <- paste0("definitions/", definition_id, ".json") | ||
if (file.exists(f)) { | ||
definitions_data <- jsonlite::read_json(f) | ||
} else { | ||
station_data <- dfs[[1]] | ||
f <- update_definitions_data(country, definition_id) | ||
definitions_data <- f #jsonlite::write_json(f) | ||
} | ||
return(definitions_data) | ||
} else { | ||
f <- paste0("definitions/", file, ".json") | ||
if (file.exists(f)) { | ||
station_data <- jsonlite::read_json(f) | ||
station_id <- as.character(station_id) | ||
dfs <- vector("list", length(station_id)) | ||
names(dfs) <- station_id | ||
|
||
station_data <- station_metadata(country = country, station_id = station_id) | ||
definition_id_list <- lapply(station_data$definition_id, function(x) x[length(x)]) | ||
names(definition_id_list) <- station_id | ||
|
||
if (is.null(file)){ | ||
bucket_name <- get_bucket_name(country) | ||
for (i in seq_along(station_id)) { | ||
if (is.null(definition_id)) definition_id <- definition_id_list[[i]] | ||
# List all files in the "definitions" directory for the station | ||
files <- googleCloudStorageR::gcs_list_objects(bucket = bucket_name, | ||
prefix = paste0("definitions/", definition_id, "."), | ||
versions = TRUE) | ||
|
||
if (nrow(files) == 0) { stop("No files found. Check country and station_id")} | ||
# Filter files with the ".json" extension | ||
files <- files %>% dplyr::filter(grepl("\\.json$", name)) | ||
json_files <- files$name | ||
|
||
# Check if multiple json files found. If so, take hte most recent one. | ||
if (length(json_files) >= 1){ | ||
# Extract timestamps from file names | ||
definition_id[i] <- extract_most_recent_json(json_files) | ||
} | ||
f <- paste0("definitions/", definition_id[i], ".json") | ||
if (file.exists(f)) { | ||
dfs[[i]] <- jsonlite::read_json(f) | ||
} else { | ||
f <- update_definitions_data(country, definition_id[i]) | ||
dfs[[i]] <- f #jsonlite::write_json(f) | ||
} | ||
} | ||
if (length(station_id) > 1) { | ||
station_data <- dplyr::bind_rows(dfs) | ||
} else { | ||
station_data <- dfs[[1]] | ||
} | ||
} else { | ||
f <- update_definitions_data(country, file) | ||
station_data <- f | ||
f <- paste0("definitions/", file, ".json") | ||
if (file.exists(f)) { | ||
station_data <- jsonlite::read_json(f) | ||
} else { | ||
f <- update_definitions_data(country, file) | ||
station_data <- f | ||
} | ||
} | ||
return(station_data) | ||
} | ||
return(station_data) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
update_crop_success_probabilities_from_definition <- function(country, station_id = NULL, definitions_id = NULL, daily_data, summary_rains_data = NULL) { | ||
if (!is.null(station_id) & !is.null(definition_id)) warning("Both station_id and definition_id are given. Defaulting to station_id.") | ||
# Retrieve the most recent definition data for the specified country and station | ||
if (!is.null(station_id)){ | ||
definitions_data <- get_definitions_data(country = country, station_id = station_id) | ||
} else { | ||
definitions_data <- get_definitions_data(country = country, definition_id = definition_id) | ||
} | ||
|
||
# If start-of-rains data is not provided, compute it using daily rainfall data | ||
if (is.null(summary_rains_data)) { | ||
summaries <- "start_rains" | ||
if (!is.null(definitions_data$end_rains$start_day)){ | ||
summaries <- c(summaries, "end_rains") | ||
end_doy <- "end_rains_doy" | ||
} | ||
if (!is.null(definitions_data$end_season$start_day)){ | ||
summaries <- c(summaries, "end_season") | ||
end_doy <- "end_season_doy" | ||
} | ||
|
||
summary_rains_data <- update_rainfall_summaries_from_definition( | ||
country = country, | ||
station_id = station_id, | ||
daily_data = daily_data, | ||
summaries = summaries | ||
) | ||
|
||
# for end_rains - do we do end_rains by default then otherwise do end_season? | ||
# what if we have both end reains and end season? | ||
} | ||
|
||
# Extract the column names for the start-of-rains data | ||
data_names <- data_definitions(names(daily_data), FALSE, FALSE) | ||
|
||
# Retrieve the specified days for calculating season start probabilities | ||
water_requirements <- as.integer(definitions_data$crops_success$water_requirements) | ||
planting_dates <- as.integer(definitions_data$crops_success$planting_dates) | ||
planting_length <- as.integer(definitions_data$crops_success$planting_length) | ||
summary_data <- NULL | ||
if (length(water_requirements) == 0){ | ||
warning("No specified days given for water requirements. No updates required.") | ||
return(summary_data) | ||
} | ||
if (length(planting_dates) == 0){ | ||
warning("No specified days given for planting dates No updates required.") | ||
return(summary_data) | ||
} | ||
if (length(planting_length) == 0){ | ||
warning("No specified days given for planting length No updates required.") | ||
return(summary_data) | ||
} | ||
|
||
start_before_season <- definitions_data$crops_success$start_check | ||
if (is.null(start_before_season)) start_before_season <- FALSE | ||
|
||
# Calculate season start probabilities | ||
daily_data[[data_names$year]] <- as.factor(as.character(daily_data[[data_names$year]])) | ||
summary_rains_data[[data_names$year]] <- as.factor(as.character(summary_rains_data[[data_names$year]])) | ||
summary_data <- rpicsa::crops_definitions(data = daily_data, | ||
date_time = data_names$date, | ||
station = data_names$station, | ||
year = data_names$year, | ||
rain = data_names$rain, | ||
water_requirements = water_requirements, | ||
planting_dates = planting_dates, | ||
planting_length = planting_length, | ||
start_check = start_before_season, | ||
season_data = summary_rains_data, | ||
start_day = "start_rains_doy", | ||
end_day = end_doy) | ||
|
||
return(summary_data) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.