diff --git a/data-raw/prep_data.R b/data-raw/prep_data.R
index c2af50a..276a3a5 100644
--- a/data-raw/prep_data.R
+++ b/data-raw/prep_data.R
@@ -1,27 +1,63 @@
 # Load required libraries
 library(tidyverse)
 library(here)
+library(yaml)
 
 # Define constants
 RAW_DATA_DIR <- here("data-raw")
 OUTPUT_DIR <- here("data")
 
+config_path <- file.path(RAW_DATA_DIR, "config.yml")
+all_config <- yaml.load_file(config_path)
+
 # Function to process data for a single ecoregion
-process_ecoregion_data <- function(ecoregion) {
+#
+# ecoregion: name of a subdirectory of RAW_DATA_DIR, also used as the key for
+#   this ecoregion's entry in `config`.
+# config: configuration for all ecoregions, keyed by ecoregion name. The
+#   `files` element of an entry maps a name to a CSV file name.
+# Returns a named list of data frames, one per configured file found on disk.
+#   Missing files raise a warning and are skipped; a missing config entry is
+#   an error.
+process_ecoregion_data <- function(ecoregion, config) {
   # Read raw data files for the ecoregion
-  annex_data <- read.csv(file = paste0(RAW_DATA_DIR, "/", ecoregion, "/annex_table.csv"))
+
+  # Look up this ecoregion in the `config` argument rather than the global
+  # `all_config`, so the function only depends on what it is passed.
+  region_config <- config[[ecoregion]]
+  if (is.null(region_config)) {
+    stop("Configuration for ecoregion ", ecoregion, " not found", call. = FALSE)
+  }
+
+  # Read raw data files for the ecoregion based on config
+  files <- region_config[["files"]]
+  data_list <- lapply(names(files), function(file_key) {
+    file_path <- file.path(RAW_DATA_DIR, ecoregion, files[[file_key]])
+    if (!file.exists(file_path)) {
+      warning("File not found: ", file_path, call. = FALSE)
+      return(NULL)
+    }
+    read.csv(file_path)
+  })
+  names(data_list) <- names(files)
+
+  # Remove any NULL entries (files that weren't found). vapply() always returns
+  # a logical vector, whereas sapply() returns list() for an empty input and
+  # `!list()` is an error.
+  data_list <- data_list[!vapply(data_list, is.null, logical(1))]
+
   # Process the data
   # Return a list of topic data frames
-  list(stock_annex_table = annex_data)
+  data_list
 }
 
 # Get list of ecoregions (assuming each subdirectory in RAW_DATA_DIR is an ecoregion)
 ecoregions <- list.dirs(RAW_DATA_DIR, full.names = FALSE, recursive = FALSE)
 
 # Process data for each ecoregion
-all_data <- map(ecoregions, process_ecoregion_data)
+all_data <- map(ecoregions, process_ecoregion_data, config = all_config)
 names(all_data) <- ecoregions
 
 # Save processed data