diff --git a/R/Utilities.R b/R/Utilities.R index 676fa405..8f24bc2d 100644 --- a/R/Utilities.R +++ b/R/Utilities.R @@ -807,20 +807,20 @@ TADA_FormatDelimitedString <- function(delimited_string, delimiter = ",") { #' #' @export #' -TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random", org_hierarchy = "none") { - +TADA_FindNearbySites <- function(.data, dist_buffer = 100) { # check .data is data.frame TADA_CheckType(.data, "data.frame", "Input object") # .data required columns - required_cols <- c("TADA.MonitoringLocationIdentifier", "TADA.MonitoringLocationName", - "TADA.LongitudeMeasure", "TADA.LatitudeMeasure", - "TADA.MonitoringLocationTypeName") + required_cols <- c("MonitoringLocationIdentifier", "TADA.LongitudeMeasure", "TADA.LatitudeMeasure") # check .data has required columns TADA_CheckColumns(.data, required_cols) # create spatial dataset based on sites - data_sf <- unique(.data[, c("TADA.MonitoringLocationIdentifier", "TADA.LongitudeMeasure", "TADA.LatitudeMeasure")]) + data_sf <- .data %>% + dplyr::select("MonitoringLocationIdentifier", "TADA.LongitudeMeasure", "TADA.LatitudeMeasure") %>% + unique() + # convert to sf object data_sf <- sf::st_as_sf(data_sf, coords = c("TADA.LongitudeMeasure", "TADA.LatitudeMeasure"), @@ -830,11 +830,8 @@ TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random # create a distance matrix in meters dist.mat <- data.frame(sf::st_distance(data_sf)) # Great Circle distance since in lat/lon - # need to deal with duplicate row names (if this function has already been run) - # maybe append the original monitoring location identifier and remove it later in the function - # before using to name anything? (HRM Note: 9/6/24) - row.names(dist.mat) <- data_sf$TADA.MonitoringLocationIdentifier - colnames(dist.mat) <- data_sf$TADA.MonitoringLocationIdentifier + row.names(dist.mat) <- data_sf$MonitoringLocationIdentifier + colnames(dist.mat) <- data_sf$MonitoringLocationIdentifier # convert distances to those within buffer (1) and beyond buffer (0) dist.mat1 <- apply(dist.mat, c(1, 2), function(x) { @@ -845,77 +842,81 @@ TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random } }) - # try purrr approach to extract site groups - library(igraph) - - matrix_graph <- igraph::graph_from_adjacency_matrix(dist.mat1, mode = "undirected", diag = FALSE) - - comps <- igraph::components(matrix_graph) + # create empty dataframe for groups + groups <- data.frame() - site_groups <- purrr::map(unique(comps$membership), function(x) { - sites <- which(comps$membership == x) - if(length(sites) > 1) { - data.frame( - TADA.MonitoringLocationIdentifier = .data$TADA.MonitoringLocationIdentifier[sites], - TADA.MonitoringLocationIdentifier.New = TADA_FormatDelimitedString(paste(.data$TADA.MonitoringLocationIdentifier[sites], collapse = ",")) - ) - } else { - NULL + # loop through distance matrix and extract site groups that are within the buffer distance from one another + for (i in 1:dim(dist.mat1)[1]) { + fsite <- rownames(dist.mat1)[i] # focal site + dat <- data.frame(Count = dist.mat1[i, ]) # get focal site count row as a column + dat$MonitoringLocationIdentifier <- colnames(dist.mat1) # give df site names along with counts + sites <- dat$MonitoringLocationIdentifier[dat$Count == 1] # filter to sites within buffer + sites1 <- sites[!sites %in% fsite] # get site list within buffer that does not include focal site + if (length(sites1) > 0) { # if this list is greater than 0, combine sites within buffer into data frame + df <- data.frame(MonitoringLocationIdentifier = sites, TADA.MonitoringLocationIdentifier = paste0(sites, collapse = ",")) + df[c("TADA.MonitoringLocationIdentifier")] <- lapply(df[c("TADA.MonitoringLocationIdentifier")], TADA_FormatDelimitedString) + groups <- plyr::rbind.fill(groups, df) } - }) + } - group_df <- dplyr::bind_rows(site_groups) %>% - dplyr::distinct() + # get unique groups (since represented multiple times for each site looped through, above) + groups <- unique(groups) -# find any sites within multiple groups - summ_sites <- group_df %>% - dplyr::distinct() %>% + if (dim(groups)[1] > 0) { + # create group id numbers + group_ids <- groups %>% dplyr::group_by(TADA.MonitoringLocationIdentifier) %>% - dplyr::mutate(GroupCount = 1:length(TADA.MonitoringLocationIdentifier)) + dplyr::mutate(TADA.SiteGroup = dplyr::cur_group_id()) %>% + dplyr::ungroup() %>% + dplyr::group_by(MonitoringLocationIdentifier) %>% + dplyr::mutate(TADA.MonitoringLocationIdentifier = paste(TADA.MonitoringLocationIdentifier, collapse = ","), + TADA.SiteGroup = paste(TADA.SiteGroup, collapse = ",")) %>% + dplyr::distinct() %>% + dplyr::ungroup() + + # find any sites within multiple groups + summ_sites <- group_ids %>% + dplyr::group_by(MonitoringLocationIdentifier) %>% + dplyr::mutate(GroupCount = 1:length(MonitoringLocationIdentifier)) # pivot wider if a site belongs to multiple groups - groups_wide <- merge(group_df, summ_sites, all.x = TRUE) - groups_wide <- tidyr::pivot_wider(groups_wide, id_cols = "TADA.MonitoringLocationIdentifier", names_from = "GroupCount", names_prefix = "TADA.MonitoringLocationIdentifier.New", values_from = "TADA.MonitoringLocationIdentifier.New") + groups_prep <- merge(group_ids, summ_sites, all.x = TRUE) + groups_wide <- tidyr::pivot_wider(groups_prep, id_cols = "MonitoringLocationIdentifier", names_from = "GroupCount", names_prefix = "TADA.MonitoringLocationIdentifier", values_from = "TADA.MonitoringLocationIdentifier") + ids_wide <- tidyr::pivot_wider(groups_prep, id_cols = "MonitoringLocationIdentifier", names_from = "GroupCount", names_prefix = "TADA.SiteGroup", values_from = "TADA.SiteGroup") # merge data to site groupings .data <- merge(.data, groups_wide, all.x = TRUE) + .data <- merge(.data, ids_wide, all.x = TRUE) # concatenate and move site id cols to right place - grpcols <- names(.data)[grepl("TADA.MonitoringLocationIdentifier.New1", names(.data))] + grpcols <- names(.data)[grepl("TADA.MonitoringLocationIdentifier", names(.data))] + idcols <- names (.data)[grepl("TADA.SiteGroup", names(.data))] + .data <- .data %>% + tidyr::unite(col = TADA.MonitoringLocationIdentifier.New, dplyr::all_of(grpcols), sep = ", ", na.rm = TRUE) %>% + dplyr::mutate(TADA.MonitoringLocationIdentifier = ifelse(!is.na(TADA.MonitoringLocationIdentifier.New), + TADA.MonitoringLocationIdentifier.New, TADA.MonitoringLocationIdentifier)) + .data <- .data %>% - tidyr::unite(col = TADA.MonitoringLocationIdentifier.New1, dplyr::all_of(grpcols), sep = ", ", na.rm = TRUE) - - # filter for grouped sites with new grouped TADA.MonitoringLocationIdentifiers (in TADA.MonitoringLocaitonIdentifier.New column) - grouped_sites <- .data %>% - dplyr::filter(!TADA.MonitoringLocationIdentifier.New1 == "") - - # create list of grouped result identifiers - grouped_resultids <- grouped_sites$ResultIdentifier + tidyr::unite(col = TADA.SiteGroup, dplyr::all_of(idcols), sep = ", ", na.rm = TRUE) %>% + dplyr::mutate(TADA.SiteGroup = ifelse(TADA.SiteGroup == "", "No nearby sites", TADA.SiteGroup)) + } - # filter by org hierarchy if user supplies one - if (!any(org_hierarchy == "none")) { # if there is an org hierarchy, use that to pick result with lowest rank in hierarchy - data_orgs <- unique(.data$OrganizationIdentifier) - if (any(!org_hierarchy %in% data_orgs)) { - print("One or more organizations in input hierarchy are not present in the input dataset.") - } - - hierarchy_df <- data.frame("OrganizationIdentifier" = org_hierarchy, "rank" = 1:length(org_hierarchy)) - - grouped_sites <- grouped_sites %>% - dplyr::left_join(hierarchy_df, by = dplyr::join_by(OrganizationIdentifier)) %>% - dplyr::select(TADA.MonitoringLocationIdentifier.New1, TADA.MonitoringLocationName, - TADA.LatitudeMeasure, TADA.LongitudeMeasure, OrganizationIdentifier, - TADA.MonitoringLocationTypeName, rank) %>% - dplyr::distinct() %>% - dplyr::slice_min(rank) - + if (dim(groups)[1] == 0) { # #if no groups, give a TADA.MonitoringLocationIdentifier column filled with NA + print("No nearby sites detected using input buffer distance.") } + # order columns + if ("ResultIdentifier" %in% names(.data)) { + .data <- TADA_OrderCols(.data) + } + return(.data) +} + # select and assign metadata randomly for grouped sites when meta_select equals "random" - + if(meta_select == "random") { - + select_meta <- grouped_sites %>% dplyr::select(TADA.MonitoringLocationIdentifier.New1, TADA.MonitoringLocationName, TADA.LatitudeMeasure, TADA.LongitudeMeasure, TADA.MonitoringLocationTypeName) %>% @@ -929,9 +930,9 @@ TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random dplyr::mutate(TADA.NearbySites.Flag = "This monitoring location was grouped with other nearby site(s). Metadata were selected randomly") } - + if(meta_select == "oldest") { - + select_meta <- grouped_sites %>% dplyr::select(TADA.MonitoringLocationIdentifier.New1, TADA.MonitoringLocationName, TADA.LatitudeMeasure, TADA.LongitudeMeasure, TADA.MonitoringLocationTypeName, @@ -945,11 +946,11 @@ TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random TADA.LongitudeMeasure.New = TADA.LongitudeMeasure, TADA.MonitoringLocationTypeName.New = TADA.MonitoringLocationTypeName) %>% dplyr::mutate(TADA.NearbySites.Flag = "This monitoring location was grouped with other nearby site(s). Metadata were selected from the oldest result available.") - + } - + if(meta_select == "newest") { - + select_meta <- grouped_sites %>% dplyr::select(TADA.MonitoringLocationIdentifier.New1, TADA.MonitoringLocationName, TADA.LatitudeMeasure, TADA.LongitudeMeasure, TADA.MonitoringLocationTypeName, @@ -963,16 +964,16 @@ TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random TADA.LongitudeMeasure.New = TADA.LongitudeMeasure, TADA.MonitoringLocationTypeName.New = TADA.MonitoringLocationTypeName) %>% dplyr::mutate(TADA.NearbySites.Flag = "This monitoring location was grouped with other nearby site(s). Metadata were selected from the newest result available.") - + } - + if(meta_select == "count") { - + select_meta <- grouped_sites %>% dplyr::group_by(TADA.MonitoringLocationIdentifier.New1) %>% dplyr::mutate(NCount = length(TADA.ResultMeasureValue)) %>% dplyr::select(TADA.MonitoringLocationIdentifier, TADA.MonitoringLocationName, - TADA.LatitudeMeasure, TADA.LongitudeMeasure, TADA.MonitoringLocationTypeName, + TADA.LatitudeMeasure, TADA.LongitudeMeasure, TADA.MonitoringLocationTypeName, NCount) %>% dplyr::distinct() %>% dplyr::group_by(TADA.MonitoringLocationIdentifier) %>% @@ -985,13 +986,13 @@ TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random dplyr::mutate(TADA.NearbySites.Flag = "This monitoring location was grouped with other nearby site(s). Metadata were selected from the newest result available.") } - + .data <- .data %>% dplyr::full_join(select_meta, by = dplyr::join_by(TADA.MonitoringLocationIdentifier.New1)) - + .data <- .data %>% dplyr::ungroup() %>% - dplyr::mutate(TADA.MonitoringLocationName = ifelse(!ResultIdentifier %in% grouped_resultids, + dplyr::mutate(TADA.MonitoringLocationName = ifelse(!ResultIdentifier %in% grouped_resultids, TADA.MonitoringLocationName, TADA.MonitoringLocationName.New), TADA.LatitudeMeasure = ifelse(!ResultIdentifier %in% grouped_resultids, TADA.LatitudeMeasure, TADA.LatitudeMeasure.New), @@ -1005,15 +1006,15 @@ TADA_FindNearbySites <- function(.data, dist_buffer = 100, meta_select = "random -TADA.LatitudeMeasure.New, -TADA.LongitudeMeasure.New, -TADA.MonitoringLocationTypeName.New) } - + if (dim(groups)[1] == 0) { - + print("No nearby sites detected using input buffer distance.") - + .data <- .data %>% dplyr::mutate(TADA.NearbySites.Flag = "No nearby sites detected using input buffer distance.") } - + .data <- TADA_OrderCols(.data) %>% dplyr::mutate(TADA.NearbySites.Flag = ifelse(is.na(TADA.NearbySites.Flag), "No nearby sites detected using input buffer distance.", diff --git a/R/test.R b/R/test.R new file mode 100644 index 00000000..b57baa36 --- /dev/null +++ b/R/test.R @@ -0,0 +1,100 @@ +TADA_FindNearbySites <- function(.data, dist_buffer = 100) { + # check .data is data.frame + TADA_CheckType(.data, "data.frame", "Input object") + + # .data required columns + required_cols <- c("MonitoringLocationIdentifier", "TADA.LongitudeMeasure", "TADA.LatitudeMeasure") + # check .data has required columns + TADA_CheckColumns(.data, required_cols) + + # create spatial dataset based on sites + data_sf <- .data %>% + dplyr::select("MonitoringLocationIdentifier", "TADA.LongitudeMeasure", "TADA.LatitudeMeasure") %>% + unique() + + # convert to sf object + data_sf <- sf::st_as_sf(data_sf, + coords = c("TADA.LongitudeMeasure", "TADA.LatitudeMeasure"), + # Change to your CRS + crs = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs" + ) + # create a distance matrix in meters + dist.mat <- data.frame(sf::st_distance(data_sf)) # Great Circle distance since in lat/lon + + row.names(dist.mat) <- data_sf$MonitoringLocationIdentifier + colnames(dist.mat) <- data_sf$MonitoringLocationIdentifier + + # convert distances to those within buffer (1) and beyond buffer (0) + dist.mat1 <- apply(dist.mat, c(1, 2), function(x) { + if (x <= dist_buffer) { + x <- 1 + } else { + x <- 0 + } + }) + + # create empty dataframe for groups + groups <- data.frame() + + # loop through distance matrix and extract site groups that are within the buffer distance from one another + for (i in 1:dim(dist.mat1)[1]) { + fsite <- rownames(dist.mat1)[i] # focal site + dat <- data.frame(Count = dist.mat1[i, ]) # get focal site count row as a column + dat$MonitoringLocationIdentifier <- colnames(dist.mat1) # give df site names along with counts + sites <- dat$MonitoringLocationIdentifier[dat$Count == 1] # filter to sites within buffer + sites1 <- sites[!sites %in% fsite] # get site list within buffer that does not include focal site + if (length(sites1) > 0) { # if this list is greater than 0, combine sites within buffer into data frame + df <- data.frame(MonitoringLocationIdentifier = sites, TADA.MonitoringLocationIdentifier = paste0(sites, collapse = ",")) + df[c("TADA.MonitoringLocationIdentifier")] <- lapply(df[c("TADA.MonitoringLocationIdentifier")], TADA_FormatDelimitedString) + groups <- plyr::rbind.fill(groups, df) + } + } + + # get unique groups (since represented multiple times for each site looped through, above) + groups <- unique(groups) + + if (dim(groups)[1] > 0) { + # create group id numbers + group_ids <- groups %>% + dplyr::group_by(TADA.MonitoringLocationIdentifier) %>% + dplyr::mutate(TADA.SiteGroup = dplyr::cur_group_id()) %>% + dplyr::ungroup() %>% + dplyr::group_by(MonitoringLocationIdentifier) %>% + dplyr::mutate(TADA.MonitoringLocationIdentifier = paste(TADA.MonitoringLocationIdentifier, collapse = ","), + TADA.SiteGroup = paste(TADA.SiteGroup, collapse = ",")) %>% + dplyr::distinct() %>% + dplyr::ungroup() + } + + # find any sites within multiple groups + summ_sites <- group_ids %>% + dplyr::group_by(MonitoringLocationIdentifier) %>% + dplyr::mutate(GroupCount = 1:length(MonitoringLocationIdentifier)) + + # pivot wider if a site belongs to multiple groups + groups_prep <- merge(group_ids, summ_sites, all.x = TRUE) + groups_wide <- tidyr::pivot_wider(groups_prep, id_cols = "MonitoringLocationIdentifier", names_from = "GroupCount", names_prefix = "TADA.MonitoringLocationIdentifier", values_from = "TADA.MonitoringLocationIdentifier") + ids_wide <- tidyr::pivot_wider(groups_prep, id_cols = "MonitoringLocationIdentifier", names_from = "GroupCount", names_prefix = "TADA.SiteGroup", values_from = "TADA.SiteGroup") + # merge data to site groupings + .data <- merge(.data, groups_wide, all.x = TRUE) + .data <- merge(.data, ids_wide, all.x = TRUE) + + # concatenate and move site id cols to right place + grpcols <- names(.data)[grepl("TADA.MonitoringLocationIdentifier", names(.data))] + idcols <- names (.data)[grepl("TADA.SIteGroup", names(.data))] + + .data <- .data %>% tidyr::unite(col = TADA.MonitoringLocationIdentifier.New, dplyr::all_of(grpcols), sep = ", ", na.rm = TRUE) + .data <- .data %>% tidyr::unite(col = TADA.SiteGroup, dplyr::all_of(idcols), sep = ", ", na.rm = TRUE) + + if (dim(groups)[1] == 0) { # #if no groups, give a TADA.MonitoringLocationIdentifier column filled with NA + print("No nearby sites detected using input buffer distance.") + } + + # order columns + if ("ResultIdentifier" %in% names(.data)) { + .data <- TADA_OrderCols(.data) + } + + return(.data) +} +