From fa700035b95a29c96f9b9bc8921bf1ad13660f55 Mon Sep 17 00:00:00 2001 From: Beniamino Green Date: Wed, 14 Feb 2024 11:28:52 -0500 Subject: [PATCH 1/4] updated documentation and NEWS.md --- NEWS.md | 3 +- R/hamming_logical_joins.R | 39 ++----------- man/em_link.Rd | 26 ++++----- ...hamming_inner_join.Rd => hamming-joins.Rd} | 54 +++++++++++++++++- man/hamming_anti_join.Rd | 57 ------------------- man/hamming_full_join.Rd | 57 ------------------- man/hamming_left_join.Rd | 57 ------------------- man/hamming_right_join.Rd | 57 ------------------- man/jaccard_curve.Rd | 2 +- man/jaccard_hyper_grid_search.Rd | 2 +- man/jaccard_similarity.Rd | 6 +- man/jaccard_string_group.Rd | 8 ++- 12 files changed, 85 insertions(+), 283 deletions(-) rename man/{hamming_inner_join.Rd => hamming-joins.Rd} (73%) delete mode 100644 man/hamming_anti_join.Rd delete mode 100644 man/hamming_full_join.Rd delete mode 100644 man/hamming_left_join.Rd delete mode 100644 man/hamming_right_join.Rd diff --git a/NEWS.md b/NEWS.md index 826dc94..411c44b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,10 +3,11 @@ ## New features * Several performance improvements (#101, #104). +* Added support for joining based on hamming distance (#100). ## Bug fixes -* When `clean = TRUE`, strings were not coerced to lower case. This is now the +* When `clean = TRUE`, strings were not coerced to lower case. This is now the case (#105). * Fix argument `progress`, which didn't print anything when it was `TRUE` (#107). diff --git a/R/hamming_logical_joins.R b/R/hamming_logical_joins.R index ec1bb7a..fb2271a 100644 --- a/R/hamming_logical_joins.R +++ b/R/hamming_logical_joins.R @@ -1,4 +1,4 @@ -#' Fuzzy inner-join using minihashing +#' Fuzzy inner-join using Locality Sensitive Hashing #' #' Find similar rows between two tables using the hamming distance. The hamming #' distance is equal to the number characters two strings differ by, or is @@ -39,6 +39,7 @@ #' to adhere to the same standards as the dplyr-joins, and uses the same #' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). #' +#' @rdname hamming-joins #' @export hamming_inner_join <- function(a, b, by = NULL, @@ -58,14 +59,7 @@ hamming_inner_join <- function(a, b, clean=clean) } -#' Fuzzy anti-join using minihashing -#' -#' @inheritParams hamming_inner_join -#' -#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries -#' to adhere to the same standards as the dplyr-joins, and uses the same -#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -#' +#' @rdname hamming-joins #' @export hamming_anti_join <- function(a, b, by = NULL, @@ -85,14 +79,7 @@ hamming_anti_join <- function(a, b, clean=clean) } -#' Fuzzy left-join using minihashing -#' -#' @inheritParams hamming_inner_join -#' -#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries -#' to adhere to the same standards as the dplyr-joins, and uses the same -#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -#' +#' @rdname hamming-joins #' @export hamming_left_join <- function(a, b, by = NULL, @@ -112,14 +99,7 @@ hamming_left_join <- function(a, b, clean=clean) } -#' Fuzzy left-join using minihashing -#' -#' @inheritParams hamming_inner_join -#' -#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries -#' to adhere to the same standards as the dplyr-joins, and uses the same -#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -#' +#' @rdname hamming-joins #' @export hamming_right_join <- function(a, b, by = NULL, @@ -140,14 +120,7 @@ hamming_right_join <- function(a, b, } -#' Fuzzy full-join using minihashing -#' -#' @inheritParams hamming_inner_join -#' -#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries -#' to adhere to the same standards as the dplyr-joins, and uses the same -#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -#' +#' @rdname hamming-joins #' @export hamming_full_join <- function(a, b, by = NULL, diff --git a/man/em_link.Rd b/man/em_link.Rd index 5789d5e..298b131 100644 --- a/man/em_link.Rd +++ b/man/em_link.Rd @@ -43,29 +43,29 @@ algorithm until an optima is reached. for more details, see } \examples{ -inv_logit <- function (x) { - exp(x)/(1+exp(x)) +inv_logit <- function(x) { + exp(x) / (1 + exp(x)) } n <- 10^6 d <- 1:n \%\% 5 == 0 X <- cbind( - as.integer(ifelse(d, runif(n)<.8, runif(n)<.2)), - as.integer(ifelse(d, runif(n)<.9, runif(n)<.2)), - as.integer(ifelse(d, runif(n)<.7, runif(n)<.2)), - as.integer(ifelse(d, runif(n)<.6, runif(n)<.2)), - as.integer(ifelse(d, runif(n)<.5, runif(n)<.2)), - as.integer(ifelse(d, runif(n)<.1, runif(n)<.9)), - as.integer(ifelse(d, runif(n)<.1, runif(n)<.9)), - as.integer(ifelse(d, runif(n)<.8, runif(n)<.01)) - ) + as.integer(ifelse(d, runif(n) < .8, runif(n) < .2)), + as.integer(ifelse(d, runif(n) < .9, runif(n) < .2)), + as.integer(ifelse(d, runif(n) < .7, runif(n) < .2)), + as.integer(ifelse(d, runif(n) < .6, runif(n) < .2)), + as.integer(ifelse(d, runif(n) < .5, runif(n) < .2)), + as.integer(ifelse(d, runif(n) < .1, runif(n) < .9)), + as.integer(ifelse(d, runif(n) < .1, runif(n) < .9)), + as.integer(ifelse(d, runif(n) < .8, runif(n) < .01)) +) # inital guess at class assignments based on # a hypothetical logistic # regression. Should be based on domain knowledge, or a handful of hand-coded # observations. x_sum <- rowSums(X) -g <- inv_logit((x_sum - mean(x_sum))/sd(x_sum)) +g <- inv_logit((x_sum - mean(x_sum)) / sd(x_sum)) -out <- em_link(X, g,tol=.0001, max_iter = 100) +out <- em_link(X, g, tol = .0001, max_iter = 100) } diff --git a/man/hamming_inner_join.Rd b/man/hamming-joins.Rd similarity index 73% rename from man/hamming_inner_join.Rd rename to man/hamming-joins.Rd index e1d8a1b..b054e2d 100644 --- a/man/hamming_inner_join.Rd +++ b/man/hamming-joins.Rd @@ -2,7 +2,11 @@ % Please edit documentation in R/hamming_logical_joins.R \name{hamming_inner_join} \alias{hamming_inner_join} -\title{Fuzzy inner-join using minihashing} +\alias{hamming_anti_join} +\alias{hamming_left_join} +\alias{hamming_right_join} +\alias{hamming_full_join} +\title{Fuzzy inner-join using Locality Sensitive Hashing} \usage{ hamming_inner_join( a, @@ -15,6 +19,54 @@ hamming_inner_join( clean = FALSE, similarity_column = NULL ) + +hamming_anti_join( + a, + b, + by = NULL, + n_bands = 100, + band_width = 100, + threshold = 2, + progress = FALSE, + clean = FALSE, + similarity_column = NULL +) + +hamming_left_join( + a, + b, + by = NULL, + n_bands = 100, + band_width = 100, + threshold = 2, + progress = FALSE, + clean = FALSE, + similarity_column = NULL +) + +hamming_right_join( + a, + b, + by = NULL, + n_bands = 100, + band_width = 100, + threshold = 2, + progress = FALSE, + clean = FALSE, + similarity_column = NULL +) + +hamming_full_join( + a, + b, + by = NULL, + n_bands = 100, + band_width = 100, + threshold = 2, + progress = FALSE, + clean = FALSE, + similarity_column = NULL +) } \arguments{ \item{a}{the first dataframe you wish to join.} diff --git a/man/hamming_anti_join.Rd b/man/hamming_anti_join.Rd deleted file mode 100644 index beb537a..0000000 --- a/man/hamming_anti_join.Rd +++ /dev/null @@ -1,57 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hamming_logical_joins.R -\name{hamming_anti_join} -\alias{hamming_anti_join} -\title{Fuzzy anti-join using minihashing} -\usage{ -hamming_anti_join( - a, - b, - by = NULL, - n_bands = 100, - band_width = 100, - threshold = 2, - progress = FALSE, - clean = FALSE, - similarity_column = NULL -) -} -\arguments{ -\item{a}{the first dataframe you wish to join.} - -\item{b}{the second dataframe you wish to join.} - -\item{by}{a named vector indicating which columns to join on. Format should -be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but -two columns must be specified in each dataset (x column and y column). Specification -made with \code{dplyr::join_by()} are also accepted.} - -\item{n_bands}{the number of bands used in the locality sensitive hashing -algorithm (default is 100). Use this in conjunction with the -\code{band_width} to determine the performance of the hashing. Generally -speaking, a higher number of bands leads to greater recall at the cost of -higher runtime.} - -\item{band_width}{the length of each band used in the minihashing algorithm -(default is 8) Use this in conjunction with the \code{n_bands} to determine -the performance of the hashing. Generally speaking a wider number of bands -decreases the number of false positives, decreasing runtime at the cost of -lower sensitivity (true matches are less likely to be found).} - -\item{progress}{set to \code{TRUE} to print progress} - -\item{clean}{should the strings that you fuzzy join on be cleaned (coerced -to lower-case, stripped of punctuation and spaces)? Default is FALSE} - -\item{similarity_column}{an optional character vector. If provided, the data -frame will contain a column with this name giving the jaccard similarity -between the two fields. Extra column will not be present if anti-joining.} -} -\value{ -a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries -to adhere to the same standards as the dplyr-joins, and uses the same -logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -} -\description{ -Fuzzy anti-join using minihashing -} diff --git a/man/hamming_full_join.Rd b/man/hamming_full_join.Rd deleted file mode 100644 index 3d226ff..0000000 --- a/man/hamming_full_join.Rd +++ /dev/null @@ -1,57 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hamming_logical_joins.R -\name{hamming_full_join} -\alias{hamming_full_join} -\title{Fuzzy full-join using minihashing} -\usage{ -hamming_full_join( - a, - b, - by = NULL, - n_bands = 100, - band_width = 100, - threshold = 2, - progress = FALSE, - clean = FALSE, - similarity_column = NULL -) -} -\arguments{ -\item{a}{the first dataframe you wish to join.} - -\item{b}{the second dataframe you wish to join.} - -\item{by}{a named vector indicating which columns to join on. Format should -be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but -two columns must be specified in each dataset (x column and y column). Specification -made with \code{dplyr::join_by()} are also accepted.} - -\item{n_bands}{the number of bands used in the locality sensitive hashing -algorithm (default is 100). Use this in conjunction with the -\code{band_width} to determine the performance of the hashing. Generally -speaking, a higher number of bands leads to greater recall at the cost of -higher runtime.} - -\item{band_width}{the length of each band used in the minihashing algorithm -(default is 8) Use this in conjunction with the \code{n_bands} to determine -the performance of the hashing. Generally speaking a wider number of bands -decreases the number of false positives, decreasing runtime at the cost of -lower sensitivity (true matches are less likely to be found).} - -\item{progress}{set to \code{TRUE} to print progress} - -\item{clean}{should the strings that you fuzzy join on be cleaned (coerced -to lower-case, stripped of punctuation and spaces)? Default is FALSE} - -\item{similarity_column}{an optional character vector. If provided, the data -frame will contain a column with this name giving the jaccard similarity -between the two fields. Extra column will not be present if anti-joining.} -} -\value{ -a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries -to adhere to the same standards as the dplyr-joins, and uses the same -logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -} -\description{ -Fuzzy full-join using minihashing -} diff --git a/man/hamming_left_join.Rd b/man/hamming_left_join.Rd deleted file mode 100644 index a5efc08..0000000 --- a/man/hamming_left_join.Rd +++ /dev/null @@ -1,57 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hamming_logical_joins.R -\name{hamming_left_join} -\alias{hamming_left_join} -\title{Fuzzy left-join using minihashing} -\usage{ -hamming_left_join( - a, - b, - by = NULL, - n_bands = 100, - band_width = 100, - threshold = 2, - progress = FALSE, - clean = FALSE, - similarity_column = NULL -) -} -\arguments{ -\item{a}{the first dataframe you wish to join.} - -\item{b}{the second dataframe you wish to join.} - -\item{by}{a named vector indicating which columns to join on. Format should -be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but -two columns must be specified in each dataset (x column and y column). Specification -made with \code{dplyr::join_by()} are also accepted.} - -\item{n_bands}{the number of bands used in the locality sensitive hashing -algorithm (default is 100). Use this in conjunction with the -\code{band_width} to determine the performance of the hashing. Generally -speaking, a higher number of bands leads to greater recall at the cost of -higher runtime.} - -\item{band_width}{the length of each band used in the minihashing algorithm -(default is 8) Use this in conjunction with the \code{n_bands} to determine -the performance of the hashing. Generally speaking a wider number of bands -decreases the number of false positives, decreasing runtime at the cost of -lower sensitivity (true matches are less likely to be found).} - -\item{progress}{set to \code{TRUE} to print progress} - -\item{clean}{should the strings that you fuzzy join on be cleaned (coerced -to lower-case, stripped of punctuation and spaces)? Default is FALSE} - -\item{similarity_column}{an optional character vector. If provided, the data -frame will contain a column with this name giving the jaccard similarity -between the two fields. Extra column will not be present if anti-joining.} -} -\value{ -a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries -to adhere to the same standards as the dplyr-joins, and uses the same -logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -} -\description{ -Fuzzy left-join using minihashing -} diff --git a/man/hamming_right_join.Rd b/man/hamming_right_join.Rd deleted file mode 100644 index 5bbc828..0000000 --- a/man/hamming_right_join.Rd +++ /dev/null @@ -1,57 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hamming_logical_joins.R -\name{hamming_right_join} -\alias{hamming_right_join} -\title{Fuzzy left-join using minihashing} -\usage{ -hamming_right_join( - a, - b, - by = NULL, - n_bands = 100, - band_width = 100, - threshold = 2, - progress = FALSE, - clean = FALSE, - similarity_column = NULL -) -} -\arguments{ -\item{a}{the first dataframe you wish to join.} - -\item{b}{the second dataframe you wish to join.} - -\item{by}{a named vector indicating which columns to join on. Format should -be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but -two columns must be specified in each dataset (x column and y column). Specification -made with \code{dplyr::join_by()} are also accepted.} - -\item{n_bands}{the number of bands used in the locality sensitive hashing -algorithm (default is 100). Use this in conjunction with the -\code{band_width} to determine the performance of the hashing. Generally -speaking, a higher number of bands leads to greater recall at the cost of -higher runtime.} - -\item{band_width}{the length of each band used in the minihashing algorithm -(default is 8) Use this in conjunction with the \code{n_bands} to determine -the performance of the hashing. Generally speaking a wider number of bands -decreases the number of false positives, decreasing runtime at the cost of -lower sensitivity (true matches are less likely to be found).} - -\item{progress}{set to \code{TRUE} to print progress} - -\item{clean}{should the strings that you fuzzy join on be cleaned (coerced -to lower-case, stripped of punctuation and spaces)? Default is FALSE} - -\item{similarity_column}{an optional character vector. If provided, the data -frame will contain a column with this name giving the jaccard similarity -between the two fields. Extra column will not be present if anti-joining.} -} -\value{ -a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries -to adhere to the same standards as the dplyr-joins, and uses the same -logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -} -\description{ -Fuzzy left-join using minihashing -} diff --git a/man/jaccard_curve.Rd b/man/jaccard_curve.Rd index 68e3cd0..6c9e140 100644 --- a/man/jaccard_curve.Rd +++ b/man/jaccard_curve.Rd @@ -21,6 +21,6 @@ Plot S-Curve for a LSH with given hyperparameters \examples{ # Plot the probability two pairs will be matched as a function of their # jaccard similarity, given the hyperparameters n_bands and band_width. -jaccard_curve(40,6) +jaccard_curve(40, 6) } diff --git a/man/jaccard_hyper_grid_search.Rd b/man/jaccard_hyper_grid_search.Rd index f650492..a98d4a0 100644 --- a/man/jaccard_hyper_grid_search.Rd +++ b/man/jaccard_hyper_grid_search.Rd @@ -33,6 +33,6 @@ strings with .7 similarity have a 99.9\% chance of being compared. # Help me find the parameters that will minimize runtime while ensuring that # two strings with similarity .1 will be compared less than .1\% of the time, # strings with .8 similaity will have a 99.95\% chance of being compared: -jaccard_hyper_grid_search(.1,.9,.001,.995) +jaccard_hyper_grid_search(.1, .9, .001, .995) } diff --git a/man/jaccard_similarity.Rd b/man/jaccard_similarity.Rd index 4bb330e..6e2f370 100644 --- a/man/jaccard_similarity.Rd +++ b/man/jaccard_similarity.Rd @@ -21,7 +21,9 @@ a vector of jaccard similarities of the strings Calculate Jaccard Similarity of two character vectors } \examples{ -jaccard_similarity(c("the quick brown fox","jumped over the lazy dog"), - c("the quck bron fx","jumped over hte lazy dog")) +jaccard_similarity( + c("the quick brown fox", "jumped over the lazy dog"), + c("the quck bron fx", "jumped over hte lazy dog") +) } diff --git a/man/jaccard_string_group.Rd b/man/jaccard_string_group.Rd index 032c44e..2fd8071 100644 --- a/man/jaccard_string_group.Rd +++ b/man/jaccard_string_group.Rd @@ -59,8 +59,10 @@ in order to use this function. } \examples{ -string <- c("beniamino", "jack", "benjamin", "beniamin", - "jacky", "giacomo", "gaicomo") -jaccard_string_group(string, threshold = .2, n_bands=90, n_gram_width=1) +string <- c( + "beniamino", "jack", "benjamin", "beniamin", + "jacky", "giacomo", "gaicomo" +) +jaccard_string_group(string, threshold = .2, n_bands = 90, n_gram_width = 1) } From 4ceeb6114dfb1a8a562c8c9ec67de83a1c63811e Mon Sep 17 00:00:00 2001 From: Beniamino Green Date: Wed, 14 Feb 2024 11:33:08 -0500 Subject: [PATCH 2/4] adding example for hamming joins --- R/hamming_logical_joins.R | 31 +++++++++++++++++++++++++++++++ man/hamming-joins.Rd | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/R/hamming_logical_joins.R b/R/hamming_logical_joins.R index fb2271a..b55aa2f 100644 --- a/R/hamming_logical_joins.R +++ b/R/hamming_logical_joins.R @@ -41,6 +41,37 @@ #' #' @rdname hamming-joins #' @export +#' @examples +#' # load baby names data +#' # install.packages("babynames") +#' library(babynames) +#' +#' baby_names <- data.frame(name = tolower(unique(babynames$name))[1:500]) +#' baby_names_mispelled <- data.frame( +#' name_mispelled = gsub("[aeiouy]", "x", baby_names$name) +#' ) +#' +#' # Run the join and only keep rows that have a match: +#' hamming_inner_join( +#' baby_names, +#' baby_names_mispelled, +#' by = c("name" = "name_mispelled"), +#' threshold = 4, +#' n_bands = 100, +#' band_width = 10, +#' clean = FALSE # default +#' ) +#' +#' # Run the join and keep all rows from the first dataset, regardless of whether +#' # they have a match: +#' jaccard_left_join( +#' baby_names, +#' baby_names_mispelled, +#' by = c("name" = "name_mispelled"), +#' threshold = 4, +#' n_bands = 100, +#' band_width = 10, +#' ) hamming_inner_join <- function(a, b, by = NULL, n_bands = 100, diff --git a/man/hamming-joins.Rd b/man/hamming-joins.Rd index b054e2d..ee5319d 100644 --- a/man/hamming-joins.Rd +++ b/man/hamming-joins.Rd @@ -109,3 +109,35 @@ Find similar rows between two tables using the hamming distance. The hamming distance is equal to the number characters two strings differ by, or is equal to infinity if two strings are of different lengths } +\examples{ +# load baby names data +# install.packages("babynames") +library(babynames) + +baby_names <- data.frame(name = tolower(unique(babynames$name))[1:500]) +baby_names_mispelled <- data.frame( + name_mispelled = gsub("[aeiouy]", "x", baby_names$name) +) + +# Run the join and only keep rows that have a match: +hamming_inner_join( + baby_names, + baby_names_mispelled, + by = c("name" = "name_mispelled"), + threshold = 4, + n_bands = 100, + band_width = 10, + clean = FALSE # default +) + +# Run the join and keep all rows from the first dataset, regardless of whether +# they have a match: +jaccard_left_join( + baby_names, + baby_names_mispelled, + by = c("name" = "name_mispelled"), + threshold = 4, + n_bands = 100, + band_width = 10, +) +} From b5dcb9b09342d2c5d9105a62e52feaf29527cc23 Mon Sep 17 00:00:00 2001 From: Beniamino Green Date: Wed, 14 Feb 2024 11:35:28 -0500 Subject: [PATCH 3/4] capitalization --- R/euclidean_logical_joins.R | 2 +- R/hamming_logical_joins.R | 2 +- R/jaccard_logical_joins.R | 2 +- man/euclidean-joins.Rd | 4 ++-- man/hamming-joins.Rd | 2 +- man/jaccard-joins.Rd | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/euclidean_logical_joins.R b/R/euclidean_logical_joins.R index 1015bae..21500cb 100644 --- a/R/euclidean_logical_joins.R +++ b/R/euclidean_logical_joins.R @@ -1,4 +1,4 @@ -#' Spatial joins Using LSH +#' Fuzzy joins for Euclidean distance using Locality Sensitive Hashing #' #' @inheritParams jaccard_left_join #' @param threshold The distance threshold below which units should be diff --git a/R/hamming_logical_joins.R b/R/hamming_logical_joins.R index b55aa2f..97b18fd 100644 --- a/R/hamming_logical_joins.R +++ b/R/hamming_logical_joins.R @@ -1,4 +1,4 @@ -#' Fuzzy inner-join using Locality Sensitive Hashing +#' Fuzzy joins for Hamming distance using Locality Sensitive Hashing #' #' Find similar rows between two tables using the hamming distance. The hamming #' distance is equal to the number characters two strings differ by, or is diff --git a/R/jaccard_logical_joins.R b/R/jaccard_logical_joins.R index 67f7156..4d147d9 100644 --- a/R/jaccard_logical_joins.R +++ b/R/jaccard_logical_joins.R @@ -1,4 +1,4 @@ -#' Fuzzy joins using minihashing +#' Fuzzy joins for Jaccard distance using MinHash #' #' @param a,b The two dataframes to join. #' diff --git a/man/euclidean-joins.Rd b/man/euclidean-joins.Rd index a068b2c..a87b285 100644 --- a/man/euclidean-joins.Rd +++ b/man/euclidean-joins.Rd @@ -6,7 +6,7 @@ \alias{euclidean_left_join} \alias{euclidean_right_join} \alias{euclidean_full_join} -\title{Spatial joins Using LSH} +\title{Fuzzy joins for Euclidean distance using Locality Sensitive Hashing} \usage{ euclidean_anti_join( a, @@ -106,7 +106,7 @@ logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). } \description{ -Spatial joins Using LSH +Fuzzy joins for Euclidean distance using Locality Sensitive Hashing } \examples{ n <- 10 diff --git a/man/hamming-joins.Rd b/man/hamming-joins.Rd index ee5319d..8f32722 100644 --- a/man/hamming-joins.Rd +++ b/man/hamming-joins.Rd @@ -6,7 +6,7 @@ \alias{hamming_left_join} \alias{hamming_right_join} \alias{hamming_full_join} -\title{Fuzzy inner-join using Locality Sensitive Hashing} +\title{Fuzzy joins for Hamming distance using Locality Sensitive Hashing} \usage{ hamming_inner_join( a, diff --git a/man/jaccard-joins.Rd b/man/jaccard-joins.Rd index 6378ed7..2c71d35 100644 --- a/man/jaccard-joins.Rd +++ b/man/jaccard-joins.Rd @@ -6,7 +6,7 @@ \alias{jaccard_left_join} \alias{jaccard_right_join} \alias{jaccard_full_join} -\title{Fuzzy joins using minihashing} +\title{Fuzzy joins for Jaccard distance using MinHash} \usage{ jaccard_inner_join( a, @@ -131,7 +131,7 @@ logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). } \description{ -Fuzzy joins using minihashing +Fuzzy joins for Jaccard distance using MinHash } \examples{ # load baby names data From 55651ad928a39cadebdd2f8142ea1878aaf10cc8 Mon Sep 17 00:00:00 2001 From: Beniamino Green Date: Wed, 14 Feb 2024 11:42:26 -0500 Subject: [PATCH 4/4] tweaks to remove R CMD NOTES --- NAMESPACE | 1 + R/hamming_logical_joins.R | 10 +++++----- R/string_group.R | 2 +- man/hamming-joins.Rd | 10 +++++----- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 14a4b6a..43500fa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -29,4 +29,5 @@ importFrom(dplyr,pull) importFrom(stats,pnorm) importFrom(stats,runif) importFrom(utils,installed.packages) +importFrom(utils,packageVersion) useDynLib(zoomerjoin, .registration = TRUE) diff --git a/R/hamming_logical_joins.R b/R/hamming_logical_joins.R index 97b18fd..3ec6694 100644 --- a/R/hamming_logical_joins.R +++ b/R/hamming_logical_joins.R @@ -56,20 +56,20 @@ #' baby_names, #' baby_names_mispelled, #' by = c("name" = "name_mispelled"), -#' threshold = 4, -#' n_bands = 100, +#' threshold = 3, +#' n_bands = 150, #' band_width = 10, #' clean = FALSE # default #' ) #' #' # Run the join and keep all rows from the first dataset, regardless of whether #' # they have a match: -#' jaccard_left_join( +#' hamming_left_join( #' baby_names, #' baby_names_mispelled, #' by = c("name" = "name_mispelled"), -#' threshold = 4, -#' n_bands = 100, +#' threshold = 3, +#' n_bands = 150, #' band_width = 10, #' ) hamming_inner_join <- function(a, b, diff --git a/R/string_group.R b/R/string_group.R index 7c5f84f..0a93f37 100644 --- a/R/string_group.R +++ b/R/string_group.R @@ -50,7 +50,7 @@ #' #' @export #' @importFrom stats runif -#' @importFrom utils installed.packages +#' @importFrom utils installed.packages packageVersion jaccard_string_group <- function(string, n_gram_width = 2, n_bands = 45, band_width = 8, threshold = .7, progress = FALSE) { if (system.file(package = "igraph") == "") { stop("library 'igraph' must be installed to run this function") diff --git a/man/hamming-joins.Rd b/man/hamming-joins.Rd index 8f32722..6a42251 100644 --- a/man/hamming-joins.Rd +++ b/man/hamming-joins.Rd @@ -124,20 +124,20 @@ hamming_inner_join( baby_names, baby_names_mispelled, by = c("name" = "name_mispelled"), - threshold = 4, - n_bands = 100, + threshold = 3, + n_bands = 150, band_width = 10, clean = FALSE # default ) # Run the join and keep all rows from the first dataset, regardless of whether # they have a match: -jaccard_left_join( +hamming_left_join( baby_names, baby_names_mispelled, by = c("name" = "name_mispelled"), - threshold = 4, - n_bands = 100, + threshold = 3, + n_bands = 150, band_width = 10, ) }