Merge pull request #95 from etiennebacher/support-join-by

Support for `dplyr::join_by()`
beniaminogreen · Jan 16, 2024 · 9e4270b · 9e4270b
2 parents 20ed57c + fc56002
commit 9e4270b
Show file tree

Hide file tree

Showing 17 changed files with 161 additions and 257 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,14 +1,15 @@
 Package: zoomerjoin
 Title: Insanely fast fuzzy joins
 Version: 0.1.1
-Authors@R:
-    person("Beniamino", "Green", , "[email protected]", role = c("aut", "cre")
-           )
+Authors@R: c(
+    person("Beniamino", "Green", , "[email protected]", role = c("aut", "cre")),
+    person("Etienne", "Bacher", email = "[email protected]", role = "ctb")
+    )
 Description: Zoomerjoin empowers users to fuzzily-merge dataframes with millions or tens of millions of rows in minutes with low memory usage.  The package uses two locality sensitive hashing algorithms to avoid having to compare every pair of records in each dataset, resulting in fuzzy-merges that finish in linear time.
 License: GPL (>= 3)
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.0
 SystemRequirements: Cargo (>= 1.56) (Rust's package manager), rustc
 Imports:
     dplyr,

diff --git a/R/euclidean_join_core.R b/R/euclidean_join_core.R
@@ -1,4 +1,14 @@
 multi_by_validate <- function(a,b, by) {
+    # first pass to handle dplyr::join_by() call
+    if (inherits(by, "dplyr_join_by")) {
+        if (any(by$condition != "==")) {
+            stop("Inequality joins are not supported.")
+        }
+        new_by <- by$y
+        names(new_by) <- by$x
+        by <- new_by
+    }
+
     if (is.null(by)) {
         by_a <- intersect(names(a), names(b))
         by_b <- intersect(names(a), names(b))

diff --git a/R/euclidean_logical_joins.R b/R/euclidean_logical_joins.R
@@ -1,11 +1,12 @@
 #' Spatial Anti Join Using LSH
 #'
 #' @param a the first dataframe you wish to join.
-#' @param b the second dataframe
-#' you wish to join.
+#' @param b the second dataframe you wish to join.
 #'
 #' @param by a named vector indicating which columns to join on. Format should
-#' be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}.
+#' be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but
+#' two columns must be specified in each dataset (x column and y column). Specifications
+#' made with `dplyr::join_by()` are also accepted.
 #'
 #'
 #' @param threshold the distance threshold below which units should be considered a match
@@ -89,17 +90,7 @@ euclidean_inner_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, b
 
 #' Spatial Left Join Using LSH
 #'
-#' @param a the first dataframe you wish to join.
-#' @param b the second dataframe
-#' you wish to join.
-#'
-#' @param by a named vector indicating which columns to join on. Format should
-#' be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but
-#' two columns must be specified in each dataset (x column and y column).
-#'
-#' @param threshold the distance threshold below which units should be considered a match
-#'
-#' @param progress set to `TRUE` to print progress
+#' @inheritParams euclidean_anti_join
 #'
 #' @return a tibble fuzzily-joined on the basis of the variables in `by`. Tries
 #' to adhere to the same standards as the dplyr-joins, and uses the same
@@ -132,17 +123,7 @@ euclidean_left_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, ba
 
 #' Spatial Right Join Using LSH
 #'
-#' @param a the first dataframe you wish to join.
-#' @param b the second dataframe
-#' you wish to join.
-#'
-#' @param by a named vector indicating which columns to join on. Format should
-#' be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but
-#' two columns must be specified in each dataset (x column and y column).
-#'
-#' @param threshold the distance threshold below which units should be considered a match
-#'
-#' @param progress set to `TRUE` to print progress
+#' @inheritParams euclidean_anti_join
 #'
 #' @return a tibble fuzzily-joined on the basis of the variables in `by`. Tries
 #' to adhere to the same standards as the dplyr-joins, and uses the same
@@ -174,17 +155,7 @@ euclidean_right_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, b
 
 #' Spatial Full Join Using LSH
 #'
-#' @param a the first dataframe you wish to join.
-#' @param b the second dataframe
-#' you wish to join.
-#'
-#' @param by a named vector indicating which columns to join on. Format should
-#' be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but
-#' two columns must be specified in each dataset (x column and y column).
-#'
-#' @param threshold the distance threshold below which units should be considered a match
-#'
-#' @param progress set to `TRUE` to print progress
+#' @inheritParams euclidean_anti_join
 #'
 #' @return a tibble fuzzily-joined on the basis of the variables in `by`. Tries
 #' to adhere to the same standards as the dplyr-joins, and uses the same

diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R
@@ -6,10 +6,10 @@
 # This file was created with the following call:
 #   .Call("wrap__make_zoomerjoin_wrappers", use_symbols = TRUE, package_name = "zoomerjoin")
 
-#' @docType package
 #' @usage NULL
 #' @useDynLib zoomerjoin, .registration = TRUE
-NULL
+#' @keywords internal
+"_PACKAGE"
 
 rust_jaccard_join <- function(left_string_r, right_string_r, ngram_width, n_bands, band_size, threshold, progress, seed) .Call(wrap__rust_jaccard_join, left_string_r, right_string_r, ngram_width, n_bands, band_size, threshold, progress, seed)
 

diff --git a/R/jaccard_join_core.R b/R/jaccard_join_core.R
@@ -1,4 +1,14 @@
 simple_by_validate <- function(a,b, by) {
+    # first pass to handle dplyr::join_by() call
+    if (inherits(by, "dplyr_join_by")) {
+        if (any(by$condition != "==")) {
+            stop("Inequality joins are not supported.")
+        }
+        new_by <- by$y
+        names(new_by) <- by$x
+        by <- new_by
+    }
+
     if (is.null(by)) {
         by_a <- intersect(names(a), names(b))
         by_b <- intersect(names(a), names(b))

diff --git a/R/jaccard_logical_joins.R b/R/jaccard_logical_joins.R
@@ -5,8 +5,9 @@
 #' @param b the second dataframe you wish to join.
 #'
 #' @param by a named vector indicating which columns to join on. Format should
-#' be the same as dplyr: \code{by = c("column_name_in_df_a" =
-#' "column_name_in_df_b")}
+#' be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but
+#' two columns must be specified in each dataset (x column and y column). Specifications
+#' made with `dplyr::join_by()` are also accepted.
 #'
 #' @param block_by a named vector indicating which column to block on, such that
 #' rows that disagree on this field cannot be considered a match. Format should
@@ -101,52 +102,7 @@ jaccard_inner_join <- function(a, b,
 
 #' Fuzzy anti-join using minhashing
 #'
-#' @param a the first dataframe you wish to join.
-#' @param b the second dataframe you wish to join.
-#'
-#' @param by a named vector indicating which columns to join on. Format
-#' should be the same as dplyr: \code{by = c("column_name_in_df_a" =
-#' "column_name_in_df_b")}
-#'
-#' @param block_by a named vector indicating which column to block on, such that
-#' rows that disagree on this field cannot be considered a match. Format should
-#' be the same as dplyr: \code{by = c("column_name_in_df_a" =
-#' "column_name_in_df_b")}
-#'
-#' @param n_gram_width the length of the n_grams used in calculating the
-#' jaccard similarity. For best performance, I set this large enough that the
-#' chance any string has a specific n_gram is low (i.e. \code{n_gram_width} = 2
-#' or 3 when matching on first names, 5 or 6 when matching on entire
-#' sentences).
-#'
-#' @param n_bands the number of bands used in the minihash algorithm (default
-#' is 40). Use this in conjunction with the \code{band_width} to determine the
-#' performance of the hashing. The default settings are for a
-#' (.2,.8,.001,.999)-sensitive hash i.e. that pairs with a similarity of less
-#' than .2 have a >.1% chance of being compared, while pairs with a similarity
-#' of greater than .8 have a >99.9% chance of being compared.
-#'
-#' @param band_width the length of each band used in the minihashing algorithm
-#' (default is 8) Use this in conjunction with the \code{n_bands} to determine
-#' the performance of the hashing. The default settings are for a
-#' (.2,.8,.001,.999)-sensitive hash i.e. that pairs with a similarity of less
-#' than .2 have a >.1% chance of being compared, while pairs with a similarity
-#' of greater than .8 have a >99.9% chance of being compared.
-#'
-#' @param threshold the jaccard similarity threshold above which two strings
-#' should be considered a match (default is .95). The similarity is euqal to 1
-#' - the jaccard distance between the two strings, so 1 implies the strings are
-#' identical, while a similarity of zero implies the strings are completely
-#' dissimilar.
-#'
-#' @param clean should the strings that you fuzzy join on be cleaned (coerced
-#' to lower-case, stripped of punctuation and spaces)? Default is FALSE
-#'
-#' @param progress set to `TRUE` to print progress
-#'
-#' @param similarity_column an optional character vector. If provided, the data
-#' frame will contain a column with this name giving the jaccard similarity
-#' between the two fields. Extra column will not be present if anti-joining.
+#' @inheritParams jaccard_inner_join
 #'
 #' @return a tibble fuzzily-joined on the basis of the variables in `by`. Tries
 #' to adhere to the same standards as the dplyr-joins, and uses the same
@@ -197,52 +153,7 @@ jaccard_anti_join <- function(a, b,
 
 #' Fuzzy left-join using minhashing
 #'
-#' @param a the first dataframe you wish to join.
-#' @param b the second dataframe you wish to join.
-#'
-#' @param by a named vector indicating which columns to join on. Format
-#' should be the same as dplyr: \code{by = c("column_name_in_df_a" =
-#' "column_name_in_df_b")}
-#'
-#' @param block_by a named vector indicating which column to block on, such that
-#' rows that disagree on this field cannot be considered a match. Format should
-#' be the same as dplyr: \code{by = c("column_name_in_df_a" =
-#' "column_name_in_df_b")}
-#'
-#' @param n_gram_width the length of the n_grams used in calculating the
-#' jaccard similarity. For best performance, I set this large enough that the
-#' chance any string has a specific n_gram is low (i.e. \code{n_gram_width} = 2
-#' or 3 when matching on first names, 5 or 6 when matching on entire
-#' sentences).
-#'
-#' @param n_bands the number of bands used in the minihash algorithm (default
-#' is 40). Use this in conjunction with the \code{band_width} to determine the
-#' performance of the hashing. The default settings are for a
-#' (.2,.8,.001,.999)-sensitive hash i.e. that pairs with a similarity of less
-#' than .2 have a >.1% chance of being compared, while pairs with a similarity
-#' of greater than .8 have a >99.9% chance of being compared.
-#'
-#' @param band_width the length of each band used in the minihashing algorithm
-#' (default is 8) Use this in conjunction with the \code{n_bands} to determine
-#' the performance of the hashing. The default settings are for a
-#' (.2,.8,.001,.999)-sensitive hash i.e. that pairs with a similarity of less
-#' than .2 have a >.1% chance of being compared, while pairs with a similarity
-#' of greater than .8 have a >99.9% chance of being compared.
-#'
-#' @param threshold the jaccard similarity threshold above which two strings
-#' should be considered a match (default is .95). The similarity is euqal to 1
-#' - the jaccard distance between the two strings, so 1 implies the strings are
-#' identical, while a similarity of zero implies the strings are completely
-#' dissimilar.
-#' '
-#' @param clean should the strings that you fuzzy join on be cleaned (coerced
-#' to lower-case, stripped of punctuation and spaces)? Default is FALSE
-#'
-#' @param progress set to `TRUE` to print progress
-#'
-#' @param similarity_column an optional character vector. If provided, the data
-#' frame will contain a column with this name giving the jaccard similarity
-#' between the two fields. Extra column will not be present if anti-joining.
+#' @inheritParams jaccard_inner_join
 #'
 #' @return a tibble fuzzily-joined on the basis of the variables in `by`. Tries
 #' to adhere to the same standards as the dplyr-joins, and uses the same
@@ -297,52 +208,7 @@ jaccard_left_join <- function(a, b,
 
 #' Fuzzy right-join using minhashing
 #'
-#' @param a the first dataframe you wish to join.
-#' @param b the second dataframe you wish to join.
-#'
-#' @param by a named vector indicating which columns to join on. Format
-#' should be the same as dplyr: \code{by = c("column_name_in_df_a" =
-#' "column_name_in_df_b")}
-#'
-#' @param block_by a named vector indicating which column to block on, such that
-#' rows that disagree on this field cannot be considered a match. Format should
-#' be the same as dplyr: \code{by = c("column_name_in_df_a" =
-#' "column_name_in_df_b")}
-#'
-#' @param n_gram_width the length of the n_grams used in calculating the
-#' jaccard similarity. For best performance, I set this large enough that the
-#' chance any string has a specific n_gram is low (i.e. \code{n_gram_width} = 2
-#' or 3 when matching on first names, 5 or 6 when matching on entire
-#' sentences).
-#'
-#' @param n_bands the number of bands used in the minihash algorithm (default
-#' is 40). Use this in conjunction with the \code{band_width} to determine the
-#' performance of the hashing. The default settings are for a
-#' (.2,.8,.001,.999)-sensitive hash i.e. that pairs with a similarity of less
-#' than .2 have a >.1% chance of being compared, while pairs with a similarity
-#' of greater than .8 have a >99.9% chance of being compared.
-#'
-#' @param band_width the length of each band used in the minihashing algorithm
-#' (default is 8) Use this in conjunction with the \code{n_bands} to determine
-#' the performance of the hashing. The default settings are for a
-#' (.2,.8,.001,.999)-sensitive hash i.e. that pairs with a similarity of less
-#' than .2 have a >.1% chance of being compared, while pairs with a similarity
-#' of greater than .8 have a >99.9% chance of being compared.
-#'
-#' @param threshold the jaccard similarity threshold above which two strings
-#' should be considered a match (default is .95). The similarity is euqal to 1
-#' - the jaccard distance between the two strings, so 1 implies the strings are
-#' identical, while a similarity of zero implies the strings are completely
-#' dissimilar.
-#' '
-#' @param clean should the strings that you fuzzy join on be cleaned (coerced
-#' to lower-case, stripped of punctuation and spaces)? Default is FALSE
-#'
-#' @param progress set to `TRUE` to print progress
-#'
-#' @param similarity_column an optional character vector. If provided, the data
-#' frame will contain a column with this name giving the jaccard similarity
-#' between the two fields. Extra column will not be present if anti-joining.
+#' @inheritParams jaccard_inner_join
 #'
 #' @return a tibble fuzzily-joined on the basis of the variables in `by`. Tries
 #' to adhere to the same standards as the dplyr-joins, and uses the same
@@ -397,52 +263,7 @@ jaccard_right_join <- function(a, b,
 
 #' Fuzzy full-join using minhashing
 #'
-#' @param a the first dataframe you wish to join.
-#' @param b the second dataframe you wish to join.
-#'
-#' @param by a named vector indicating which columns to join on. Format
-#' should be the same as dplyr: \code{by = c("column_name_in_df_a" =
-#' "column_name_in_df_b")}
-#'
-#' @param block_by a named vector indicating which column to block on, such that
-#' rows that disagree on this field cannot be considered a match. Format should
-#' be the same as dplyr: \code{by = c("column_name_in_df_a" =
-#' "column_name_in_df_b")}
-#'
-#' @param n_gram_width the length of the n_grams used in calculating the
-#' jaccard similarity. For best performance, I set this large enough that the
-#' chance any string has a specific n_gram is low (i.e. \code{n_gram_width} = 2
-#' or 3 when matching on first names, 5 or 6 when matching on entire
-#' sentences).
-#'
-#' @param n_bands the number of bands used in the minihash algorithm (default
-#' is 40). Use this in conjunction with the \code{band_width} to determine the
-#' performance of the hashing. The default settings are for a
-#' (.2,.8,.001,.999)-sensitive hash i.e. that pairs with a similarity of less
-#' than .2 have a >.1% chance of being compared, while pairs with a similarity
-#' of greater than .8 have a >99.9% chance of being compared.
-#'
-#' @param band_width the length of each band used in the minihashing algorithm
-#' (default is 8) Use this in conjunction with the \code{n_bands} to determine
-#' the performance of the hashing. The default settings are for a
-#' (.2,.8,.001,.999)-sensitive hash i.e. that pairs with a similarity of less
-#' than .2 have a >.1% chance of being compared, while pairs with a similarity
-#' of greater than .8 have a >99.9% chance of being compared.
-#'
-#' @param threshold the jaccard similarity threshold above which two strings
-#' should be considered a match (default is .95). The similarity is euqal to 1
-#' - the jaccard distance between the two strings, so 1 implies the strings are
-#' identical, while a similarity of zero implies the strings are completely
-#' dissimilar.
-#' '
-#' @param clean should the strings that you fuzzy join on be cleaned (coerced
-#' to lower-case, stripped of punctuation and spaces)? Default is FALSE
-#'
-#' @param progress set to `TRUE` to print progress
-#'
-#' @param similarity_column an optional character vector. If provided, the data
-#' frame will contain a column with this name giving the jaccard similarity
-#' between the two fields. Extra column will not be present if anti-joining.
+#' @inheritParams jaccard_inner_join
 #'
 #' @return a tibble fuzzily-joined on the basis of the variables in `by`. Tries
 #' to adhere to the same standards as the dplyr-joins, and uses the same

diff --git a/man/euclidean_anti_join.Rd b/man/euclidean_anti_join.Rd