Commit 5c9ed81: new function to visualise database content

EhrmannS committed Aug 11, 2024
1 parent 077dd32 commit 5c9ed81
Showing 4 changed files with 287 additions and 109 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -10,6 +10,7 @@ export(adb_reset)
export(adb_restore)
export(adb_schemas)
export(adb_translations)
export(adb_visualise)
export(edit_matches)
export(matchOntology)
export(normGeometry)
@@ -124,6 +125,7 @@ importFrom(sf,st_is_valid)
importFrom(sf,st_join)
importFrom(sf,st_layers)
importFrom(sf,st_make_valid)
importFrom(sf,st_read)
importFrom(sf,st_sf)
importFrom(sf,st_transform)
importFrom(sf,st_write)
219 changes: 110 additions & 109 deletions R/adb_archive.R
@@ -1,109 +1,110 @@
#' Archive the data from an areal database
#'
#' @param pattern [\code{character(1)}]\cr a regular expression used to filter
#' files to load.
#' @param variables [\code{character(.)}]\cr columns, typically observed
#' variables, to select.
#' @param compress [\code{logical(1)}]\cr whether or not the database should be
#' compressed into a \emph{7z} archive; this deletes the uncompressed
#' database folder in \code{outPath}.
#' @param outPath [\code{character(1)}]\cr directory, where the archive should
#' be stored.
#' @details This function prepares and packages the data into an archivable
#' form. The archive contains geopackage files for the geometries and csv
#' files for all tables, such as inventory, matching and thematic data tables.
#' @return no return value, called for the side-effect of creating a database
#' archive.
#' @importFrom checkmate assertCharacter assertLogical assertDirectoryExists
#' testDirectoryExists
#' @importFrom purrr map
#' @importFrom stringr str_split
#' @importFrom readr write_csv write_lines
#' @importFrom progress progress_bar
#' @importFrom archive archive_write_dir
#' @importFrom utils capture.output sessionInfo
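#' @examples
#' \dontrun{
#' # a minimal sketch, assuming an areal database has been built at the path
#' # registered in getOption("adb_path"); "~/archives/" is a hypothetical
#' # output directory that must already exist and be writable
#' adb_archive(compress = TRUE, outPath = "~/archives/")
#' }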
#' @export

adb_archive <- function(pattern = NULL, variables = NULL, compress = FALSE,
outPath = NULL){

assertCharacter(x = pattern, len = 1, null.ok = TRUE)
assertCharacter(x = variables, any.missing = FALSE, null.ok = TRUE)
assertLogical(x = compress, len = 1, any.missing = FALSE)
assertDirectoryExists(x = outPath, access = "rw")

# set internal paths
intPaths <- paste0(getOption(x = "adb_path"))

# derive current version
load(paste0(intPaths, "/db_info.RData"))
version <- paste0(db_info$version, "_", format(Sys.Date(), "%Y%m%d"))

# derive path
archivePath <- paste0(outPath, "arealDB_", version, "/")
message("\n-> creating archive '", archivePath, "'")
dir.create(path = archivePath)
dir.create(path = paste0(archivePath, "geometries"))
dir.create(path = paste0(archivePath, "tables"))
dir.create(path = paste0(archivePath, "meta"))

message("-> archiving tables")
stage3tables_full <- list.files(path = paste0(intPaths, "/tables/stage3"), full.names = TRUE)
if(length(stage3tables_full) != 0){

stage3tables <- map(.x = list.files(path = paste0(intPaths, "/tables/stage3")), .f = function(ix){
temp <- str_split(ix, "[.]")[[1]][1]
}) |>
unlist()

pb <- progress_bar$new(format = "[:bar] :current/:total (:percent)", total = length(stage3tables))
for(i in seq_along(stage3tables)){
temp <- readRDS(file = stage3tables_full[i])
write_csv(x = temp, file = paste0(archivePath, "tables/", stage3tables[i], ".csv"), na = "")
pb$tick()
}

}

message("-> archiving geometries")
stage3geometries_full <- list.files(path = paste0(intPaths, "/geometries/stage3"), full.names = TRUE)
if(length(stage3geometries_full) != 0){

stage3geometries <- list.files(path = paste0(intPaths, "/geometries/stage3"))
file.copy(from = stage3geometries_full,
to = paste0(archivePath, "geometries/", stage3geometries),
overwrite = TRUE)

}

message("-> archiving inventory tables")
adb_inventory(type = "dataseries") |>
write_csv(file = paste0(archivePath, "meta/inv_dataseries.csv"), na = "")
adb_inventory(type = "geometries") |>
write_csv(file = paste0(archivePath, "meta/inv_geometries.csv"), na = "")
adb_inventory(type = "tables") |>
write_csv(file = paste0(archivePath, "meta/inv_tables.csv"), na = "")
adb_ontology(type = "ontology") |>
write_csv(file = paste0(archivePath, "meta/ontology.csv"), na = "")
adb_ontology(type = "gazetteer") |>
write_csv(file = paste0(archivePath, "meta/gazetteer.csv"), na = "")

message("-> archiving metadata")
sI <- sessionInfo()
save(sI, file = paste0(archivePath, "R_sessionInfo.RData"))
write_lines(x = capture.output(sI), file = paste0(archivePath, "R_sessionInfo.txt"))
save(db_info, file = paste0(archivePath, "dbInfo.RData"))
db_desc_lines <- paste0("version:\n", db_info$version, "\n\n",
"authors:\n", paste0("creator: ", paste0(db_info$author$cre, collapse = ", "), "\nauthor: ", paste0(db_info$author$aut, collapse = ", "), "\ncontributor: ", paste0(db_info$author$ctb, collapse = ", ")), "\n\n",
"licence:\n", db_info$licence, "\n\n",
"gazetteer:\n", db_info$gazetteer, "\n\n", # these two should presumably be replaced with a version label as well
"ontology:\n", unique(db_info$ontology), "\n\n", # these two should presumably be replaced with a version label as well
"variables:\n", paste0(db_info$variables, collapse = ", "), "\n\n")
write_lines(x = db_desc_lines, file = paste0(archivePath, "dbInfo.txt"))

if(compress){
message("-> compressing database archive")
archive_write_dir(archive = paste0(outPath, "arealDB_", version, ".7z"),
dir = archivePath, format = "7zip")
unlink(archivePath, recursive = TRUE)
}

}
142 changes: 142 additions & 0 deletions R/adb_visualise.R
@@ -0,0 +1,142 @@
#' Visualise database contents
#'
#' @param ... combination of column name in the ontology and value to filter
#' that column by to build a tree of the concepts nested into it; see
#' \code{\link[ontologics]{make_tree}}.
#' @param territory [\code{list(.)}]\cr named list to select the territory to
#' visualise, where the name is a class of the gazetteer and the value a
#' label of that class, for example \code{list(al1 = "Brazil")}.
#' @param level [\code{character(1)}]\cr the territorial level at which the
#' data shall be visualised; must be one of the levels registered for the
#' selected tables.
#' @param year [\code{integerish(.)}]\cr the year(s) for which the variables
#' shall be summarised.
#' @param diagnose [\code{logical(1)}]\cr whether or not to also compile
#' tables of concepts that could not be matched to the gazetteer or the
#' ontology.
#' @return so far, a list of diagnostic tables of unmatched concepts per
#' territory (empty when \code{diagnose = FALSE}); the visualisation itself
#' is still under construction.
#' @importFrom checkmate assertList assertCharacter assertIntegerish
#' assertLogical assertSubset
#' @importFrom ontologics get_class make_tree get_concept
#' @importFrom dplyr mutate select filter pull distinct group_by summarise
#' n_distinct ungroup left_join as_tibble
#' @importFrom stringr str_detect
#' @importFrom tidyr pivot_wider
#' @importFrom sf st_read
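#' @examples
#' \dontrun{
#' # a minimal sketch, mirroring the development example in the code below;
#' # assumes a gazetteer with class 'al1' and a database that holds data for
#' # Brazil
#' adb_visualise(territory = list(al1 = "Brazil"), level = "al3",
#'               year = 2010, diagnose = TRUE)
#' }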
#' @export

adb_visualise <- function(..., territory = NULL, level = NULL, year = NULL,
diagnose = FALSE){

# territory <- list(al1 = "Brazil"); level = "al3"; year = 2010; diagnose = FALSE

assertList(x = territory, types = "character", any.missing = FALSE, null.ok = TRUE)
assertCharacter(x = level, len = 1, any.missing = FALSE, null.ok = TRUE)
assertIntegerish(x = year, min.len = 1, any.missing = FALSE, null.ok = TRUE)
assertLogical(x = diagnose, len = 1, any.missing = FALSE)

# copy the argument so that it is not masked by the 'year' column in filter()
targetYear <- year

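# the database root and the gazetteer set-up are taken from the global
# options that were registered when the database was initialised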
adb_path <- getOption(x = "adb_path")
inv <- readRDS(paste0(adb_path, "/_meta/inventory.rds"))

gazPath <- paste0(getOption(x = "gazetteer_path"))
gazClasses <- get_class(ontology = gazPath)
topClass <- paste0(getOption(x = "gazetteer_top"))

inv_tables <- inv$tables
inv_geoms <- inv$geometries
inv_series <- inv$dataseries

tables <- list.files(path = paste0(adb_path, "/tables/stage3"), full.names = TRUE)
geometries <- list.files(path = paste0(adb_path, "/geometries/stage3"), full.names = TRUE)

# first, get all items down until the "topClass"
allItems <- make_tree(class = topClass, reverse = TRUE, ontology = gazPath)
top <- get_concept(class = names(territory), label = territory[[1]], ontology = gazPath) |>
pull(id)

# then walk down the hierarchy level by level, collecting the ids of all
# concepts that are nested into the selected territory
fin <- NULL
outIDs <- top
while(is.null(fin)){
childID <- allItems %>%
filter(has_broader %in% top) %>%
pull(id)
if(length(childID) != 0){
top <- childID
outIDs <- c(outIDs, childID)
} else {
fin <- TRUE
}
}

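# thematic tables are stored per nation, so pick out the nation-level (al1)
# concepts among the collected ids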
tableNames <- allItems |>
filter(id %in% outIDs & class == "al1")

diagnostics <- list()
for(i in seq_along(tableNames$label)){

nation <- tableNames$label[i]

table <- readRDS(file = tables[str_detect(string = tables, pattern = paste0(nation, ".rds"))])

# build metadata
tabIDs <- table |>
distinct(tabID)

temp_inv_tables <- inv_tables |>
filter(tabID %in% tabIDs$tabID)

if(is.null(level)){
tempLvls <- temp_inv_tables |>
distinct(level) |>
pull()
} else {
assertSubset(x = level, choices = unique(temp_inv_tables$level))
tempLvls <- level
}

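# read the geometries at the deepest of the relevant levels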
geometry <- st_read(dsn = geometries[str_detect(string = geometries, pattern = paste0(nation, ".gpkg"))],
layer = max(tempLvls))

if(diagnose){

# collect the concepts that could not be matched to the gazetteer or the
# ontology, to diagnose missing data
diag <- list()
diag$gazID <- table |>
filter(is.na(gazID)) |>
distinct(gazMatch)
diag$ontoID <- table |>
filter(is.na(ontoID)) |>
distinct(ontoMatch)

diagnostics[[nation]] <- diag
}

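# per territory and geometry, summarise how many sources, animals and years
# are covered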
meta <- table |>
group_by(gazID, geoID) |>
summarise(sources = n_distinct(tabID),
animals = n_distinct(animal),
years = n_distinct(year),
min_year = min(year),
max_year = max(year)) |>
ungroup()

# summarise variables
var <- table |>
filter(!is.na(gazID)) |>
filter(year %in% targetYear) |>
filter(str_detect(ontoMatch, "close")) |>
pivot_wider(id_cols = c(gazID), names_from = animal, values_from = number_heads, values_fn = mean)

# put it together
full <- geometry |>
filter(geoID %in% temp_inv_tables$geoID) |>
left_join(meta, by = c("gazID", "geoID")) |>
left_join(var, by = "gazID")

thisGeoID <- full |>
as_tibble() |>
distinct(geoID)

temp_inv_geoms <- inv_geoms |>
filter(geoID %in% thisGeoID$geoID)
temp_inv_series <- inv_series |>
filter(datID %in% temp_inv_geoms$datID)
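
# NOTE: 'full' and these inventory subsets are assembled but not yet used;
# the visualisation step itself is presumably still to come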

}

# return the per-territory diagnostics of unmatched concepts
return(diagnostics)

}