Skip to content

Commit

Permalink
Merge updates from develop
Browse files Browse the repository at this point in the history
  • Loading branch information
hillarymarler committed Nov 7, 2024
1 parent a2ebe60 commit 6ea8b11
Show file tree
Hide file tree
Showing 2 changed files with 180 additions and 79 deletions.
159 changes: 80 additions & 79 deletions R/Utilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -807,20 +807,20 @@ TADA_FormatDelimitedString <- function(delimited_string, delimiter = ",") {
#'
#' @export
#'
TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random", org_hierarchy = "none") {

TADA_FindNearbySites <- function(.data, dist_buffer = 100) {
# check .data is data.frame
TADA_CheckType(.data, "data.frame", "Input object")

# .data required columns
required_cols <- c("TADA.MonitoringLocationIdentifier", "TADA.MonitoringLocationName",
"TADA.LongitudeMeasure", "TADA.LatitudeMeasure",
"TADA.MonitoringLocationTypeName")
required_cols <- c("MonitoringLocationIdentifier", "TADA.LongitudeMeasure", "TADA.LatitudeMeasure")
# check .data has required columns
TADA_CheckColumns(.data, required_cols)

# create spatial dataset based on sites
data_sf <- unique(.data[, c("TADA.MonitoringLocationIdentifier", "TADA.LongitudeMeasure", "TADA.LatitudeMeasure")])
data_sf <- .data %>%
dplyr::select("MonitoringLocationIdentifier", "TADA.LongitudeMeasure", "TADA.LatitudeMeasure") %>%
unique()

# convert to sf object
data_sf <- sf::st_as_sf(data_sf,
coords = c("TADA.LongitudeMeasure", "TADA.LatitudeMeasure"),
Expand All @@ -830,11 +830,8 @@ TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random
# create a distance matrix in meters
dist.mat <- data.frame(sf::st_distance(data_sf)) # Great Circle distance since in lat/lon

# need to deal with duplicate row names (if this function has already been run)
# maybe append the original monitoring location identifier and remove it later in the function
# before using to name anything? (HRM Note: 9/6/24)
row.names(dist.mat) <- data_sf$TADA.MonitoringLocationIdentifier
colnames(dist.mat) <- data_sf$TADA.MonitoringLocationIdentifier
row.names(dist.mat) <- data_sf$MonitoringLocationIdentifier
colnames(dist.mat) <- data_sf$MonitoringLocationIdentifier

# convert distances to those within buffer (1) and beyond buffer (0)
dist.mat1 <- apply(dist.mat, c(1, 2), function(x) {
Expand All @@ -845,77 +842,81 @@ TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random
}
})

# try purrr approach to extract site groups
library(igraph)

matrix_graph <- igraph::graph_from_adjacency_matrix(dist.mat1, mode = "undirected", diag = FALSE)

comps <- igraph::components(matrix_graph)
# create empty dataframe for groups
groups <- data.frame()

site_groups <- purrr::map(unique(comps$membership), function(x) {
sites <- which(comps$membership == x)
if(length(sites) > 1) {
data.frame(
TADA.MonitoringLocationIdentifier = .data$TADA.MonitoringLocationIdentifier[sites],
TADA.MonitoringLocationIdentifier.New = TADA_FormatDelimitedString(paste(.data$TADA.MonitoringLocationIdentifier[sites], collapse = ","))
)
} else {
NULL
# loop through distance matrix and extract site groups that are within the buffer distance from one another
for (i in 1:dim(dist.mat1)[1]) {
fsite <- rownames(dist.mat1)[i] # focal site
dat <- data.frame(Count = dist.mat1[i, ]) # get focal site count row as a column
dat$MonitoringLocationIdentifier <- colnames(dist.mat1) # give df site names along with counts
sites <- dat$MonitoringLocationIdentifier[dat$Count == 1] # filter to sites within buffer
sites1 <- sites[!sites %in% fsite] # get site list within buffer that does not include focal site
if (length(sites1) > 0) { # if this list is greater than 0, combine sites within buffer into data frame
df <- data.frame(MonitoringLocationIdentifier = sites, TADA.MonitoringLocationIdentifier = paste0(sites, collapse = ","))
df[c("TADA.MonitoringLocationIdentifier")] <- lapply(df[c("TADA.MonitoringLocationIdentifier")], TADA_FormatDelimitedString)
groups <- plyr::rbind.fill(groups, df)
}
})
}

group_df <- dplyr::bind_rows(site_groups) %>%
dplyr::distinct()
# get unique groups (since represented multiple times for each site looped through, above)
groups <- unique(groups)

# find any sites within multiple groups
summ_sites <- group_df %>%
dplyr::distinct() %>%
if (dim(groups)[1] > 0) {
# create group id numbers
group_ids <- groups %>%
dplyr::group_by(TADA.MonitoringLocationIdentifier) %>%
dplyr::mutate(GroupCount = 1:length(TADA.MonitoringLocationIdentifier))
dplyr::mutate(TADA.SiteGroup = dplyr::cur_group_id()) %>%
dplyr::ungroup() %>%
dplyr::group_by(MonitoringLocationIdentifier) %>%
dplyr::mutate(TADA.MonitoringLocationIdentifier = paste(TADA.MonitoringLocationIdentifier, collapse = ","),
TADA.SiteGroup = paste(TADA.SiteGroup, collapse = ",")) %>%
dplyr::distinct() %>%
dplyr::ungroup()

# find any sites within multiple groups
summ_sites <- group_ids %>%
dplyr::group_by(MonitoringLocationIdentifier) %>%
dplyr::mutate(GroupCount = 1:length(MonitoringLocationIdentifier))

# pivot wider if a site belongs to multiple groups
groups_wide <- merge(group_df, summ_sites, all.x = TRUE)
groups_wide <- tidyr::pivot_wider(groups_wide, id_cols = "TADA.MonitoringLocationIdentifier", names_from = "GroupCount", names_prefix = "TADA.MonitoringLocationIdentifier.New", values_from = "TADA.MonitoringLocationIdentifier.New")
groups_prep <- merge(group_ids, summ_sites, all.x = TRUE)
groups_wide <- tidyr::pivot_wider(groups_prep, id_cols = "MonitoringLocationIdentifier", names_from = "GroupCount", names_prefix = "TADA.MonitoringLocationIdentifier", values_from = "TADA.MonitoringLocationIdentifier")
ids_wide <- tidyr::pivot_wider(groups_prep, id_cols = "MonitoringLocationIdentifier", names_from = "GroupCount", names_prefix = "TADA.SiteGroup", values_from = "TADA.SiteGroup")
# merge data to site groupings
.data <- merge(.data, groups_wide, all.x = TRUE)
.data <- merge(.data, ids_wide, all.x = TRUE)

# concatenate and move site id cols to right place
grpcols <- names(.data)[grepl("TADA.MonitoringLocationIdentifier.New1", names(.data))]
grpcols <- names(.data)[grepl("TADA.MonitoringLocationIdentifier", names(.data))]
idcols <- names (.data)[grepl("TADA.SiteGroup", names(.data))]

.data <- .data %>%
tidyr::unite(col = TADA.MonitoringLocationIdentifier.New, dplyr::all_of(grpcols), sep = ", ", na.rm = TRUE) %>%
dplyr::mutate(TADA.MonitoringLocationIdentifier = ifelse(!is.na(TADA.MonitoringLocationIdentifier.New),
TADA.MonitoringLocationIdentifier.New, TADA.MonitoringLocationIdentifier))

.data <- .data %>%
tidyr::unite(col = TADA.MonitoringLocationIdentifier.New1, dplyr::all_of(grpcols), sep = ", ", na.rm = TRUE)

# filter for grouped sites with new grouped TADA.MonitoringLocationIdentifiers (in TADA.MonitoringLocaitonIdentifier.New column)
grouped_sites <- .data %>%
dplyr::filter(!TADA.MonitoringLocationIdentifier.New1 == "")

# create list of grouped result identifiers
grouped_resultids <- grouped_sites$ResultIdentifier
tidyr::unite(col = TADA.SiteGroup, dplyr::all_of(idcols), sep = ", ", na.rm = TRUE) %>%
dplyr::mutate(TADA.SiteGroup = ifelse(TADA.SiteGroup == "", "No nearby sites", TADA.SiteGroup))
}

# filter by org hierarchy if user supplies one
if (!any(org_hierarchy == "none")) { # if there is an org hierarchy, use that to pick result with lowest rank in hierarchy
data_orgs <- unique(.data$OrganizationIdentifier)
if (any(!org_hierarchy %in% data_orgs)) {
print("One or more organizations in input hierarchy are not present in the input dataset.")
}

hierarchy_df <- data.frame("OrganizationIdentifier" = org_hierarchy, "rank" = 1:length(org_hierarchy))

grouped_sites <- grouped_sites %>%
dplyr::left_join(hierarchy_df, by = dplyr::join_by(OrganizationIdentifier)) %>%
dplyr::select(TADA.MonitoringLocationIdentifier.New1, TADA.MonitoringLocationName,
TADA.LatitudeMeasure, TADA.LongitudeMeasure, OrganizationIdentifier,
TADA.MonitoringLocationTypeName, rank) %>%
dplyr::distinct() %>%
dplyr::slice_min(rank)

if (dim(groups)[1] == 0) { # #if no groups, give a TADA.MonitoringLocationIdentifier column filled with NA
print("No nearby sites detected using input buffer distance.")
}

# order columns
if ("ResultIdentifier" %in% names(.data)) {
.data <- TADA_OrderCols(.data)
}

return(.data)
}

# select and assign metadata randomly for grouped sites when meta_select equals "random"

if(meta_select == "random") {

select_meta <- grouped_sites %>%
dplyr::select(TADA.MonitoringLocationIdentifier.New1, TADA.MonitoringLocationName,
TADA.LatitudeMeasure, TADA.LongitudeMeasure, TADA.MonitoringLocationTypeName) %>%
Expand All @@ -929,9 +930,9 @@ TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random
dplyr::mutate(TADA.NearbySites.Flag = "This monitoring location was grouped with other nearby site(s). Metadata were selected randomly")

}

if(meta_select == "oldest") {

select_meta <- grouped_sites %>%
dplyr::select(TADA.MonitoringLocationIdentifier.New1, TADA.MonitoringLocationName,
TADA.LatitudeMeasure, TADA.LongitudeMeasure, TADA.MonitoringLocationTypeName,
Expand All @@ -945,11 +946,11 @@ TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random
TADA.LongitudeMeasure.New = TADA.LongitudeMeasure,
TADA.MonitoringLocationTypeName.New = TADA.MonitoringLocationTypeName) %>%
dplyr::mutate(TADA.NearbySites.Flag = "This monitoring location was grouped with other nearby site(s). Metadata were selected from the oldest result available.")

}

if(meta_select == "newest") {

select_meta <- grouped_sites %>%
dplyr::select(TADA.MonitoringLocationIdentifier.New1, TADA.MonitoringLocationName,
TADA.LatitudeMeasure, TADA.LongitudeMeasure, TADA.MonitoringLocationTypeName,
Expand All @@ -963,16 +964,16 @@ TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random
TADA.LongitudeMeasure.New = TADA.LongitudeMeasure,
TADA.MonitoringLocationTypeName.New = TADA.MonitoringLocationTypeName) %>%
dplyr::mutate(TADA.NearbySites.Flag = "This monitoring location was grouped with other nearby site(s). Metadata were selected from the newest result available.")

}

if(meta_select == "count") {

select_meta <- grouped_sites %>%
dplyr::group_by(TADA.MonitoringLocationIdentifier.New1) %>%
dplyr::mutate(NCount = length(TADA.ResultMeasureValue)) %>%
dplyr::select(TADA.MonitoringLocationIdentifier, TADA.MonitoringLocationName,
TADA.LatitudeMeasure, TADA.LongitudeMeasure, TADA.MonitoringLocationTypeName,
TADA.LatitudeMeasure, TADA.LongitudeMeasure, TADA.MonitoringLocationTypeName,
NCount) %>%
dplyr::distinct() %>%
dplyr::group_by(TADA.MonitoringLocationIdentifier) %>%
Expand All @@ -985,13 +986,13 @@ TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random
dplyr::mutate(TADA.NearbySites.Flag = "This monitoring location was grouped with other nearby site(s). Metadata were selected from the newest result available.")

}

.data <- .data %>%
dplyr::full_join(select_meta, by = dplyr::join_by(TADA.MonitoringLocationIdentifier.New1))

.data <- .data %>%
dplyr::ungroup() %>%
dplyr::mutate(TADA.MonitoringLocationName = ifelse(!ResultIdentifier %in% grouped_resultids,
dplyr::mutate(TADA.MonitoringLocationName = ifelse(!ResultIdentifier %in% grouped_resultids,
TADA.MonitoringLocationName, TADA.MonitoringLocationName.New),
TADA.LatitudeMeasure = ifelse(!ResultIdentifier %in% grouped_resultids,
TADA.LatitudeMeasure, TADA.LatitudeMeasure.New),
Expand All @@ -1005,15 +1006,15 @@ TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random
-TADA.LatitudeMeasure.New, -TADA.LongitudeMeasure.New,
-TADA.MonitoringLocationTypeName.New)
}

if (dim(groups)[1] == 0) {

print("No nearby sites detected using input buffer distance.")

.data <- .data %>%
dplyr::mutate(TADA.NearbySites.Flag = "No nearby sites detected using input buffer distance.")
}

.data <- TADA_OrderCols(.data) %>%
dplyr::mutate(TADA.NearbySites.Flag = ifelse(is.na(TADA.NearbySites.Flag),
"No nearby sites detected using input buffer distance.",
Expand Down
100 changes: 100 additions & 0 deletions R/test.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
TADA_FindNearbySites <- function(.data, dist_buffer = 100) {
# check .data is data.frame
TADA_CheckType(.data, "data.frame", "Input object")

# .data required columns
required_cols <- c("MonitoringLocationIdentifier", "TADA.LongitudeMeasure", "TADA.LatitudeMeasure")
# check .data has required columns
TADA_CheckColumns(.data, required_cols)

# create spatial dataset based on sites
data_sf <- .data %>%
dplyr::select("MonitoringLocationIdentifier", "TADA.LongitudeMeasure", "TADA.LatitudeMeasure") %>%
unique()

# convert to sf object
data_sf <- sf::st_as_sf(data_sf,
coords = c("TADA.LongitudeMeasure", "TADA.LatitudeMeasure"),
# Change to your CRS
crs = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
)
# create a distance matrix in meters
dist.mat <- data.frame(sf::st_distance(data_sf)) # Great Circle distance since in lat/lon

row.names(dist.mat) <- data_sf$MonitoringLocationIdentifier
colnames(dist.mat) <- data_sf$MonitoringLocationIdentifier

# convert distances to those within buffer (1) and beyond buffer (0)
dist.mat1 <- apply(dist.mat, c(1, 2), function(x) {
if (x <= dist_buffer) {
x <- 1
} else {
x <- 0
}
})

# create empty dataframe for groups
groups <- data.frame()

# loop through distance matrix and extract site groups that are within the buffer distance from one another
for (i in 1:dim(dist.mat1)[1]) {
fsite <- rownames(dist.mat1)[i] # focal site
dat <- data.frame(Count = dist.mat1[i, ]) # get focal site count row as a column
dat$MonitoringLocationIdentifier <- colnames(dist.mat1) # give df site names along with counts
sites <- dat$MonitoringLocationIdentifier[dat$Count == 1] # filter to sites within buffer
sites1 <- sites[!sites %in% fsite] # get site list within buffer that does not include focal site
if (length(sites1) > 0) { # if this list is greater than 0, combine sites within buffer into data frame
df <- data.frame(MonitoringLocationIdentifier = sites, TADA.MonitoringLocationIdentifier = paste0(sites, collapse = ","))
df[c("TADA.MonitoringLocationIdentifier")] <- lapply(df[c("TADA.MonitoringLocationIdentifier")], TADA_FormatDelimitedString)
groups <- plyr::rbind.fill(groups, df)
}
}

# get unique groups (since represented multiple times for each site looped through, above)
groups <- unique(groups)

if (dim(groups)[1] > 0) {
# create group id numbers
group_ids <- groups %>%
dplyr::group_by(TADA.MonitoringLocationIdentifier) %>%
dplyr::mutate(TADA.SiteGroup = dplyr::cur_group_id()) %>%
dplyr::ungroup() %>%
dplyr::group_by(MonitoringLocationIdentifier) %>%
dplyr::mutate(TADA.MonitoringLocationIdentifier = paste(TADA.MonitoringLocationIdentifier, collapse = ","),
TADA.SiteGroup = paste(TADA.SiteGroup, collapse = ",")) %>%
dplyr::distinct() %>%
dplyr::ungroup()
}

# find any sites within multiple groups
summ_sites <- group_ids %>%
dplyr::group_by(MonitoringLocationIdentifier) %>%
dplyr::mutate(GroupCount = 1:length(MonitoringLocationIdentifier))

# pivot wider if a site belongs to multiple groups
groups_prep <- merge(group_ids, summ_sites, all.x = TRUE)
groups_wide <- tidyr::pivot_wider(groups_prep, id_cols = "MonitoringLocationIdentifier", names_from = "GroupCount", names_prefix = "TADA.MonitoringLocationIdentifier", values_from = "TADA.MonitoringLocationIdentifier")
ids_wide <- tidyr::pivot_wider(groups_prep, id_cols = "MonitoringLocationIdentifier", names_from = "GroupCount", names_prefix = "TADA.SiteGroup", values_from = "TADA.SiteGroup")
# merge data to site groupings
.data <- merge(.data, groups_wide, all.x = TRUE)
.data <- merge(.data, ids_wide, all.x = TRUE)

# concatenate and move site id cols to right place
grpcols <- names(.data)[grepl("TADA.MonitoringLocationIdentifier", names(.data))]
idcols <- names (.data)[grepl("TADA.SIteGroup", names(.data))]

.data <- .data %>% tidyr::unite(col = TADA.MonitoringLocationIdentifier.New, dplyr::all_of(grpcols), sep = ", ", na.rm = TRUE)
.data <- .data %>% tidyr::unite(col = TADA.SiteGroup, dplyr::all_of(idcols), sep = ", ", na.rm = TRUE)

if (dim(groups)[1] == 0) { # #if no groups, give a TADA.MonitoringLocationIdentifier column filled with NA
print("No nearby sites detected using input buffer distance.")
}

# order columns
if ("ResultIdentifier" %in% names(.data)) {
.data <- TADA_OrderCols(.data)
}

return(.data)
}

0 comments on commit 6ea8b11

Please sign in to comment.