diff --git a/NAMESPACE b/NAMESPACE index 14a4b6a..43500fa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -29,4 +29,5 @@ importFrom(dplyr,pull) importFrom(stats,pnorm) importFrom(stats,runif) importFrom(utils,installed.packages) +importFrom(utils,packageVersion) useDynLib(zoomerjoin, .registration = TRUE) diff --git a/NEWS.md b/NEWS.md index 826dc94..411c44b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,10 +3,11 @@ ## New features * Several performance improvements (#101, #104). +* Added support for joining based on Hamming distance (#100). ## Bug fixes -* When `clean = TRUE`, strings were not coerced to lower case. This is now the +* When `clean = TRUE`, strings were not coerced to lower case. This is now the case (#105). * Fix argument `progress`, which didn't print anything when it was `TRUE` (#107). diff --git a/R/euclidean_logical_joins.R b/R/euclidean_logical_joins.R index 1015bae..21500cb 100644 --- a/R/euclidean_logical_joins.R +++ b/R/euclidean_logical_joins.R @@ -1,4 +1,4 @@ -#' Spatial joins Using LSH +#' Fuzzy joins for Euclidean distance using Locality Sensitive Hashing #' #' @inheritParams jaccard_left_join #' @param threshold The distance threshold below which units should be diff --git a/R/hamming_logical_joins.R b/R/hamming_logical_joins.R index ec1bb7a..3ec6694 100644 --- a/R/hamming_logical_joins.R +++ b/R/hamming_logical_joins.R @@ -1,4 +1,4 @@ -#' Fuzzy inner-join using minihashing +#' Fuzzy joins for Hamming distance using Locality Sensitive Hashing #' #' Find similar rows between two tables using the hamming distance. The hamming #' distance is equal to the number characters two strings differ by, or is @@ -39,7 +39,39 @@ #' to adhere to the same standards as the dplyr-joins, and uses the same #' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). 
#' +#' @rdname hamming-joins #' @export +#' @examples +#' # load baby names data +#' # install.packages("babynames") +#' library(babynames) +#' +#' baby_names <- data.frame(name = tolower(unique(babynames$name))[1:500]) +#' baby_names_mispelled <- data.frame( +#' name_mispelled = gsub("[aeiouy]", "x", baby_names$name) +#' ) +#' +#' # Run the join and only keep rows that have a match: +#' hamming_inner_join( +#' baby_names, +#' baby_names_mispelled, +#' by = c("name" = "name_mispelled"), +#' threshold = 3, +#' n_bands = 150, +#' band_width = 10, +#' clean = FALSE # default +#' ) +#' +#' # Run the join and keep all rows from the first dataset, regardless of whether +#' # they have a match: +#' hamming_left_join( +#' baby_names, +#' baby_names_mispelled, +#' by = c("name" = "name_mispelled"), +#' threshold = 3, +#' n_bands = 150, +#' band_width = 10, +#' ) hamming_inner_join <- function(a, b, by = NULL, n_bands = 100, @@ -58,14 +90,7 @@ hamming_inner_join <- function(a, b, clean=clean) } -#' Fuzzy anti-join using minihashing -#' -#' @inheritParams hamming_inner_join -#' -#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries -#' to adhere to the same standards as the dplyr-joins, and uses the same -#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -#' +#' @rdname hamming-joins #' @export hamming_anti_join <- function(a, b, by = NULL, @@ -85,14 +110,7 @@ hamming_anti_join <- function(a, b, clean=clean) } -#' Fuzzy left-join using minihashing -#' -#' @inheritParams hamming_inner_join -#' -#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries -#' to adhere to the same standards as the dplyr-joins, and uses the same -#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). 
-#' +#' @rdname hamming-joins #' @export hamming_left_join <- function(a, b, by = NULL, @@ -112,14 +130,7 @@ hamming_left_join <- function(a, b, clean=clean) } -#' Fuzzy left-join using minihashing -#' -#' @inheritParams hamming_inner_join -#' -#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries -#' to adhere to the same standards as the dplyr-joins, and uses the same -#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -#' +#' @rdname hamming-joins #' @export hamming_right_join <- function(a, b, by = NULL, @@ -140,14 +151,7 @@ hamming_right_join <- function(a, b, } -#' Fuzzy full-join using minihashing -#' -#' @inheritParams hamming_inner_join -#' -#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries -#' to adhere to the same standards as the dplyr-joins, and uses the same -#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -#' +#' @rdname hamming-joins #' @export hamming_full_join <- function(a, b, by = NULL, diff --git a/R/jaccard_logical_joins.R b/R/jaccard_logical_joins.R index 67f7156..4d147d9 100644 --- a/R/jaccard_logical_joins.R +++ b/R/jaccard_logical_joins.R @@ -1,4 +1,4 @@ -#' Fuzzy joins using minihashing +#' Fuzzy joins for Jaccard distance using MinHash #' #' @param a,b The two dataframes to join. 
#' diff --git a/R/string_group.R b/R/string_group.R index 7c5f84f..0a93f37 100644 --- a/R/string_group.R +++ b/R/string_group.R @@ -50,7 +50,7 @@ #' #' @export #' @importFrom stats runif -#' @importFrom utils installed.packages +#' @importFrom utils installed.packages packageVersion jaccard_string_group <- function(string, n_gram_width = 2, n_bands = 45, band_width = 8, threshold = .7, progress = FALSE) { if (system.file(package = "igraph") == "") { stop("library 'igraph' must be installed to run this function") diff --git a/man/em_link.Rd b/man/em_link.Rd index 5789d5e..298b131 100644 --- a/man/em_link.Rd +++ b/man/em_link.Rd @@ -43,29 +43,29 @@ algorithm until an optima is reached. for more details, see } \examples{ -inv_logit <- function (x) { - exp(x)/(1+exp(x)) +inv_logit <- function(x) { + exp(x) / (1 + exp(x)) } n <- 10^6 d <- 1:n \%\% 5 == 0 X <- cbind( - as.integer(ifelse(d, runif(n)<.8, runif(n)<.2)), - as.integer(ifelse(d, runif(n)<.9, runif(n)<.2)), - as.integer(ifelse(d, runif(n)<.7, runif(n)<.2)), - as.integer(ifelse(d, runif(n)<.6, runif(n)<.2)), - as.integer(ifelse(d, runif(n)<.5, runif(n)<.2)), - as.integer(ifelse(d, runif(n)<.1, runif(n)<.9)), - as.integer(ifelse(d, runif(n)<.1, runif(n)<.9)), - as.integer(ifelse(d, runif(n)<.8, runif(n)<.01)) - ) + as.integer(ifelse(d, runif(n) < .8, runif(n) < .2)), + as.integer(ifelse(d, runif(n) < .9, runif(n) < .2)), + as.integer(ifelse(d, runif(n) < .7, runif(n) < .2)), + as.integer(ifelse(d, runif(n) < .6, runif(n) < .2)), + as.integer(ifelse(d, runif(n) < .5, runif(n) < .2)), + as.integer(ifelse(d, runif(n) < .1, runif(n) < .9)), + as.integer(ifelse(d, runif(n) < .1, runif(n) < .9)), + as.integer(ifelse(d, runif(n) < .8, runif(n) < .01)) +) # inital guess at class assignments based on # a hypothetical logistic # regression. Should be based on domain knowledge, or a handful of hand-coded # observations. 
x_sum <- rowSums(X) -g <- inv_logit((x_sum - mean(x_sum))/sd(x_sum)) +g <- inv_logit((x_sum - mean(x_sum)) / sd(x_sum)) -out <- em_link(X, g,tol=.0001, max_iter = 100) +out <- em_link(X, g, tol = .0001, max_iter = 100) } diff --git a/man/euclidean-joins.Rd b/man/euclidean-joins.Rd index a068b2c..a87b285 100644 --- a/man/euclidean-joins.Rd +++ b/man/euclidean-joins.Rd @@ -6,7 +6,7 @@ \alias{euclidean_left_join} \alias{euclidean_right_join} \alias{euclidean_full_join} -\title{Spatial joins Using LSH} +\title{Fuzzy joins for Euclidean distance using Locality Sensitive Hashing} \usage{ euclidean_anti_join( a, @@ -106,7 +106,7 @@ logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). } \description{ -Spatial joins Using LSH +Fuzzy joins for Euclidean distance using Locality Sensitive Hashing } \examples{ n <- 10 diff --git a/man/hamming_inner_join.Rd b/man/hamming-joins.Rd similarity index 59% rename from man/hamming_inner_join.Rd rename to man/hamming-joins.Rd index e1d8a1b..6a42251 100644 --- a/man/hamming_inner_join.Rd +++ b/man/hamming-joins.Rd @@ -2,7 +2,11 @@ % Please edit documentation in R/hamming_logical_joins.R \name{hamming_inner_join} \alias{hamming_inner_join} -\title{Fuzzy inner-join using minihashing} +\alias{hamming_anti_join} +\alias{hamming_left_join} +\alias{hamming_right_join} +\alias{hamming_full_join} +\title{Fuzzy joins for Hamming distance using Locality Sensitive Hashing} \usage{ hamming_inner_join( a, @@ -15,6 +19,54 @@ hamming_inner_join( clean = FALSE, similarity_column = NULL ) + +hamming_anti_join( + a, + b, + by = NULL, + n_bands = 100, + band_width = 100, + threshold = 2, + progress = FALSE, + clean = FALSE, + similarity_column = NULL +) + +hamming_left_join( + a, + b, + by = NULL, + n_bands = 100, + band_width = 100, + threshold = 2, + progress = FALSE, + clean = FALSE, + similarity_column = NULL +) + +hamming_right_join( + a, + b, + by = NULL, + n_bands = 100, + band_width = 100, + threshold = 2, 
+ progress = FALSE, + clean = FALSE, + similarity_column = NULL +) + +hamming_full_join( + a, + b, + by = NULL, + n_bands = 100, + band_width = 100, + threshold = 2, + progress = FALSE, + clean = FALSE, + similarity_column = NULL +) } \arguments{ \item{a}{the first dataframe you wish to join.} @@ -57,3 +109,35 @@ Find similar rows between two tables using the hamming distance. The hamming distance is equal to the number characters two strings differ by, or is equal to infinity if two strings are of different lengths } +\examples{ +# load baby names data +# install.packages("babynames") +library(babynames) + +baby_names <- data.frame(name = tolower(unique(babynames$name))[1:500]) +baby_names_mispelled <- data.frame( + name_mispelled = gsub("[aeiouy]", "x", baby_names$name) +) + +# Run the join and only keep rows that have a match: +hamming_inner_join( + baby_names, + baby_names_mispelled, + by = c("name" = "name_mispelled"), + threshold = 3, + n_bands = 150, + band_width = 10, + clean = FALSE # default +) + +# Run the join and keep all rows from the first dataset, regardless of whether +# they have a match: +hamming_left_join( + baby_names, + baby_names_mispelled, + by = c("name" = "name_mispelled"), + threshold = 3, + n_bands = 150, + band_width = 10, +) +} diff --git a/man/hamming_anti_join.Rd b/man/hamming_anti_join.Rd deleted file mode 100644 index beb537a..0000000 --- a/man/hamming_anti_join.Rd +++ /dev/null @@ -1,57 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hamming_logical_joins.R -\name{hamming_anti_join} -\alias{hamming_anti_join} -\title{Fuzzy anti-join using minihashing} -\usage{ -hamming_anti_join( - a, - b, - by = NULL, - n_bands = 100, - band_width = 100, - threshold = 2, - progress = FALSE, - clean = FALSE, - similarity_column = NULL -) -} -\arguments{ -\item{a}{the first dataframe you wish to join.} - -\item{b}{the second dataframe you wish to join.} - -\item{by}{a named vector indicating which columns to 
join on. Format should -be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but -two columns must be specified in each dataset (x column and y column). Specification -made with \code{dplyr::join_by()} are also accepted.} - -\item{n_bands}{the number of bands used in the locality sensitive hashing -algorithm (default is 100). Use this in conjunction with the -\code{band_width} to determine the performance of the hashing. Generally -speaking, a higher number of bands leads to greater recall at the cost of -higher runtime.} - -\item{band_width}{the length of each band used in the minihashing algorithm -(default is 8) Use this in conjunction with the \code{n_bands} to determine -the performance of the hashing. Generally speaking a wider number of bands -decreases the number of false positives, decreasing runtime at the cost of -lower sensitivity (true matches are less likely to be found).} - -\item{progress}{set to \code{TRUE} to print progress} - -\item{clean}{should the strings that you fuzzy join on be cleaned (coerced -to lower-case, stripped of punctuation and spaces)? Default is FALSE} - -\item{similarity_column}{an optional character vector. If provided, the data -frame will contain a column with this name giving the jaccard similarity -between the two fields. Extra column will not be present if anti-joining.} -} -\value{ -a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries -to adhere to the same standards as the dplyr-joins, and uses the same -logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). 
-} -\description{ -Fuzzy anti-join using minihashing -} diff --git a/man/hamming_full_join.Rd b/man/hamming_full_join.Rd deleted file mode 100644 index 3d226ff..0000000 --- a/man/hamming_full_join.Rd +++ /dev/null @@ -1,57 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hamming_logical_joins.R -\name{hamming_full_join} -\alias{hamming_full_join} -\title{Fuzzy full-join using minihashing} -\usage{ -hamming_full_join( - a, - b, - by = NULL, - n_bands = 100, - band_width = 100, - threshold = 2, - progress = FALSE, - clean = FALSE, - similarity_column = NULL -) -} -\arguments{ -\item{a}{the first dataframe you wish to join.} - -\item{b}{the second dataframe you wish to join.} - -\item{by}{a named vector indicating which columns to join on. Format should -be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but -two columns must be specified in each dataset (x column and y column). Specification -made with \code{dplyr::join_by()} are also accepted.} - -\item{n_bands}{the number of bands used in the locality sensitive hashing -algorithm (default is 100). Use this in conjunction with the -\code{band_width} to determine the performance of the hashing. Generally -speaking, a higher number of bands leads to greater recall at the cost of -higher runtime.} - -\item{band_width}{the length of each band used in the minihashing algorithm -(default is 8) Use this in conjunction with the \code{n_bands} to determine -the performance of the hashing. Generally speaking a wider number of bands -decreases the number of false positives, decreasing runtime at the cost of -lower sensitivity (true matches are less likely to be found).} - -\item{progress}{set to \code{TRUE} to print progress} - -\item{clean}{should the strings that you fuzzy join on be cleaned (coerced -to lower-case, stripped of punctuation and spaces)? Default is FALSE} - -\item{similarity_column}{an optional character vector. 
If provided, the data -frame will contain a column with this name giving the jaccard similarity -between the two fields. Extra column will not be present if anti-joining.} -} -\value{ -a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries -to adhere to the same standards as the dplyr-joins, and uses the same -logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -} -\description{ -Fuzzy full-join using minihashing -} diff --git a/man/hamming_left_join.Rd b/man/hamming_left_join.Rd deleted file mode 100644 index a5efc08..0000000 --- a/man/hamming_left_join.Rd +++ /dev/null @@ -1,57 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hamming_logical_joins.R -\name{hamming_left_join} -\alias{hamming_left_join} -\title{Fuzzy left-join using minihashing} -\usage{ -hamming_left_join( - a, - b, - by = NULL, - n_bands = 100, - band_width = 100, - threshold = 2, - progress = FALSE, - clean = FALSE, - similarity_column = NULL -) -} -\arguments{ -\item{a}{the first dataframe you wish to join.} - -\item{b}{the second dataframe you wish to join.} - -\item{by}{a named vector indicating which columns to join on. Format should -be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but -two columns must be specified in each dataset (x column and y column). Specification -made with \code{dplyr::join_by()} are also accepted.} - -\item{n_bands}{the number of bands used in the locality sensitive hashing -algorithm (default is 100). Use this in conjunction with the -\code{band_width} to determine the performance of the hashing. Generally -speaking, a higher number of bands leads to greater recall at the cost of -higher runtime.} - -\item{band_width}{the length of each band used in the minihashing algorithm -(default is 8) Use this in conjunction with the \code{n_bands} to determine -the performance of the hashing. 
Generally speaking a wider number of bands -decreases the number of false positives, decreasing runtime at the cost of -lower sensitivity (true matches are less likely to be found).} - -\item{progress}{set to \code{TRUE} to print progress} - -\item{clean}{should the strings that you fuzzy join on be cleaned (coerced -to lower-case, stripped of punctuation and spaces)? Default is FALSE} - -\item{similarity_column}{an optional character vector. If provided, the data -frame will contain a column with this name giving the jaccard similarity -between the two fields. Extra column will not be present if anti-joining.} -} -\value{ -a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries -to adhere to the same standards as the dplyr-joins, and uses the same -logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -} -\description{ -Fuzzy left-join using minihashing -} diff --git a/man/hamming_right_join.Rd b/man/hamming_right_join.Rd deleted file mode 100644 index 5bbc828..0000000 --- a/man/hamming_right_join.Rd +++ /dev/null @@ -1,57 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hamming_logical_joins.R -\name{hamming_right_join} -\alias{hamming_right_join} -\title{Fuzzy left-join using minihashing} -\usage{ -hamming_right_join( - a, - b, - by = NULL, - n_bands = 100, - band_width = 100, - threshold = 2, - progress = FALSE, - clean = FALSE, - similarity_column = NULL -) -} -\arguments{ -\item{a}{the first dataframe you wish to join.} - -\item{b}{the second dataframe you wish to join.} - -\item{by}{a named vector indicating which columns to join on. Format should -be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but -two columns must be specified in each dataset (x column and y column). 
Specification -made with \code{dplyr::join_by()} are also accepted.} - -\item{n_bands}{the number of bands used in the locality sensitive hashing -algorithm (default is 100). Use this in conjunction with the -\code{band_width} to determine the performance of the hashing. Generally -speaking, a higher number of bands leads to greater recall at the cost of -higher runtime.} - -\item{band_width}{the length of each band used in the minihashing algorithm -(default is 8) Use this in conjunction with the \code{n_bands} to determine -the performance of the hashing. Generally speaking a wider number of bands -decreases the number of false positives, decreasing runtime at the cost of -lower sensitivity (true matches are less likely to be found).} - -\item{progress}{set to \code{TRUE} to print progress} - -\item{clean}{should the strings that you fuzzy join on be cleaned (coerced -to lower-case, stripped of punctuation and spaces)? Default is FALSE} - -\item{similarity_column}{an optional character vector. If provided, the data -frame will contain a column with this name giving the jaccard similarity -between the two fields. Extra column will not be present if anti-joining.} -} -\value{ -a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries -to adhere to the same standards as the dplyr-joins, and uses the same -logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -} -\description{ -Fuzzy left-join using minihashing -} diff --git a/man/jaccard-joins.Rd b/man/jaccard-joins.Rd index 6378ed7..2c71d35 100644 --- a/man/jaccard-joins.Rd +++ b/man/jaccard-joins.Rd @@ -6,7 +6,7 @@ \alias{jaccard_left_join} \alias{jaccard_right_join} \alias{jaccard_full_join} -\title{Fuzzy joins using minihashing} +\title{Fuzzy joins for Jaccard distance using MinHash} \usage{ jaccard_inner_join( a, @@ -131,7 +131,7 @@ logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). 
} \description{ -Fuzzy joins using minihashing +Fuzzy joins for Jaccard distance using MinHash } \examples{ # load baby names data diff --git a/man/jaccard_curve.Rd b/man/jaccard_curve.Rd index 68e3cd0..6c9e140 100644 --- a/man/jaccard_curve.Rd +++ b/man/jaccard_curve.Rd @@ -21,6 +21,6 @@ Plot S-Curve for a LSH with given hyperparameters \examples{ # Plot the probability two pairs will be matched as a function of their # jaccard similarity, given the hyperparameters n_bands and band_width. -jaccard_curve(40,6) +jaccard_curve(40, 6) } diff --git a/man/jaccard_hyper_grid_search.Rd b/man/jaccard_hyper_grid_search.Rd index f650492..a98d4a0 100644 --- a/man/jaccard_hyper_grid_search.Rd +++ b/man/jaccard_hyper_grid_search.Rd @@ -33,6 +33,6 @@ strings with .7 similarity have a 99.9\% chance of being compared. # Help me find the parameters that will minimize runtime while ensuring that # two strings with similarity .1 will be compared less than .1\% of the time, # strings with .8 similaity will have a 99.95\% chance of being compared: -jaccard_hyper_grid_search(.1,.9,.001,.995) +jaccard_hyper_grid_search(.1, .9, .001, .995) } diff --git a/man/jaccard_similarity.Rd b/man/jaccard_similarity.Rd index 4bb330e..6e2f370 100644 --- a/man/jaccard_similarity.Rd +++ b/man/jaccard_similarity.Rd @@ -21,7 +21,9 @@ a vector of jaccard similarities of the strings Calculate Jaccard Similarity of two character vectors } \examples{ -jaccard_similarity(c("the quick brown fox","jumped over the lazy dog"), - c("the quck bron fx","jumped over hte lazy dog")) +jaccard_similarity( + c("the quick brown fox", "jumped over the lazy dog"), + c("the quck bron fx", "jumped over hte lazy dog") +) } diff --git a/man/jaccard_string_group.Rd b/man/jaccard_string_group.Rd index 032c44e..2fd8071 100644 --- a/man/jaccard_string_group.Rd +++ b/man/jaccard_string_group.Rd @@ -59,8 +59,10 @@ in order to use this function. 
} \examples{ -string <- c("beniamino", "jack", "benjamin", "beniamin", - "jacky", "giacomo", "gaicomo") -jaccard_string_group(string, threshold = .2, n_bands=90, n_gram_width=1) +string <- c( + "beniamino", "jack", "benjamin", "beniamin", + "jacky", "giacomo", "gaicomo" +) +jaccard_string_group(string, threshold = .2, n_bands = 90, n_gram_width = 1) }