From 2321e6dd32d079c0174467e82ec2fd330abbefcf Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sat, 10 Feb 2024 17:40:39 +0100 Subject: [PATCH 1/4] try with collapse --- R/jaccard_join_core.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/jaccard_join_core.R b/R/jaccard_join_core.R index ec7e7cb..5ac4b2a 100644 --- a/R/jaccard_join_core.R +++ b/R/jaccard_join_core.R @@ -152,8 +152,9 @@ jaccard_join <- function (a, b, mode, by, salt_by, n_gram_width, n_bands, return(matches) } - not_matched_a <- ! seq(nrow(a)) %in% match_table[,1] - not_matched_b <- ! seq(nrow(b)) %in% match_table[,2] + browser() + not_matched_a <- collapse::`%!iin%`(seq(nrow(a)), match_table[,1]) + not_matched_b <- collapse::`%!iin%`(seq(nrow(b)), match_table[,2]) if (mode == "left") { matches <- dplyr::bind_rows(matches,a[not_matched_a,]) From 518edd31b1acf8e03d4a62b71651b509b9c3b013 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Mon, 12 Feb 2024 15:08:28 +0100 Subject: [PATCH 2/4] more --- R/jaccard_join_core.R | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/R/jaccard_join_core.R b/R/jaccard_join_core.R index 5ac4b2a..954e7e4 100644 --- a/R/jaccard_join_core.R +++ b/R/jaccard_join_core.R @@ -152,21 +152,25 @@ jaccard_join <- function (a, b, mode, by, salt_by, n_gram_width, n_bands, return(matches) } - browser() - not_matched_a <- collapse::`%!iin%`(seq(nrow(a)), match_table[,1]) - not_matched_b <- collapse::`%!iin%`(seq(nrow(b)), match_table[,2]) - - if (mode == "left") { - matches <- dplyr::bind_rows(matches,a[not_matched_a,]) - } else if (mode == "right") { - matches <- dplyr::bind_rows(matches,b[not_matched_b,]) - } else if (mode == "full") { - matches <- dplyr::bind_rows(matches,a[not_matched_a,],b[not_matched_b,]) - } else if (mode == "anti") { - matches <- dplyr::bind_rows(a[not_matched_a,], b[not_matched_b,]) - } else { - stop("Invalid Mode Selected!") - } - - return(matches) + switch( + mode, + "left" = { + not_matched_a <- ! seq(nrow(a)) %in% match_table[,1] + matches <- dplyr::bind_rows(matches, a[not_matched_a,]) + }, + "right" = { + not_matched_b <- ! seq(nrow(b)) %in% match_table[,2] + matches <- dplyr::bind_rows(matches, b[not_matched_b,]) + }, + "full" = { + not_matched_a <- ! seq(nrow(a)) %in% match_table[,1] + not_matched_b <- ! seq(nrow(b)) %in% match_table[,2] + matches <- dplyr::bind_rows(matches, a[not_matched_a,], b[not_matched_b,]) + }, + "anti" = { + not_matched_a <- ! seq(nrow(a)) %in% match_table[,1] + not_matched_b <- ! seq(nrow(b)) %in% match_table[,2] + matches <- dplyr::bind_rows(a[not_matched_a,], b[not_matched_b,]) + } + ) } From 525310ffdb703cd1c025080b1063b258efc93d84 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Mon, 12 Feb 2024 15:14:53 +0100 Subject: [PATCH 3/4] bring collapse back --- DESCRIPTION | 1 + R/jaccard_join_core.R | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e6b5bba..94adfa4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,6 +15,7 @@ Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.1 SystemRequirements: Cargo (>= 1.56) (Rust's package manager), rustc Imports: + collapse, dplyr, tibble, tidyr diff --git a/R/jaccard_join_core.R b/R/jaccard_join_core.R index 954e7e4..a7a51df 100644 --- a/R/jaccard_join_core.R +++ b/R/jaccard_join_core.R @@ -155,21 +155,21 @@ jaccard_join <- function (a, b, mode, by, salt_by, n_gram_width, n_bands, switch( mode, "left" = { - not_matched_a <- ! seq(nrow(a)) %in% match_table[,1] + not_matched_a <- collapse::`%!iin%`(seq(nrow(a)), match_table[, 1]) matches <- dplyr::bind_rows(matches, a[not_matched_a,]) }, "right" = { - not_matched_b <- ! seq(nrow(b)) %in% match_table[,2] + not_matched_b <- collapse::`%!iin%`(seq(nrow(b)), match_table[, 2]) matches <- dplyr::bind_rows(matches, b[not_matched_b,]) }, "full" = { - not_matched_a <- ! seq(nrow(a)) %in% match_table[,1] - not_matched_b <- ! seq(nrow(b)) %in% match_table[,2] + not_matched_a <- collapse::`%!iin%`(seq(nrow(a)), match_table[, 1]) + not_matched_b <- collapse::`%!iin%`(seq(nrow(b)), match_table[, 2]) matches <- dplyr::bind_rows(matches, a[not_matched_a,], b[not_matched_b,]) }, "anti" = { - not_matched_a <- ! seq(nrow(a)) %in% match_table[,1] - not_matched_b <- ! seq(nrow(b)) %in% match_table[,2] + not_matched_a <- collapse::`%!iin%`(seq(nrow(a)), match_table[, 1]) + not_matched_b <- collapse::`%!iin%`(seq(nrow(b)), match_table[, 2]) matches <- dplyr::bind_rows(a[not_matched_a,], b[not_matched_b,]) } ) From 4b79275d16eec002aa5b0aa33de962f21f3f5f86 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Mon, 12 Feb 2024 15:21:08 +0100 Subject: [PATCH 4/4] bump news --- DESCRIPTION | 2 +- NEWS.md | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 94adfa4..45a06a1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: zoomerjoin Title: Superlatively Fast Fuzzy Joins -Version: 0.1.4 +Version: 0.1.2.9000 Authors@R: c( person("Beniamino", "Green", , "beniamino.green@yale.edu", role = c("aut", "cre", "cph")), person("Etienne", "Bacher", email = "etienne.bacher@protonmail.com", role = "ctb"), diff --git a/NEWS.md b/NEWS.md index a58cd49..d809562 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# zoomerjoin (development version) + +* Several performance improvements (#101, #104). + # zoomerjoin 0.1.2 * Submitted Package to CRAN