Skip to content

Commit

Permalink
Fix some lints and re-style
Browse files Browse the repository at this point in the history
  • Loading branch information
etiennebacher committed Feb 14, 2024
1 parent fe41c2b commit 3b607a4
Show file tree
Hide file tree
Showing 9 changed files with 123 additions and 116 deletions.
12 changes: 5 additions & 7 deletions R/hamming_join_core.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,21 @@ hamming_join <- function(a, b, mode, by, n_bands, band_width,
stopifnot("'by' vectors must have length 1" = length(by_a) == 1)
stopifnot("'by' vectors must have length 1" = length(by_b) == 1)

stopifnot("There should be no NA's in by_a" = !any(is.na(dplyr::pull(a, by_a))))
stopifnot("There should be no NA's in by_b" = !any(is.na(dplyr::pull(b, by_b))))
stopifnot("There should be no NA's in by_a" = !anyNA(a[[by_a]]))
stopifnot("There should be no NA's in by_b" = !anyNA(b[[by_b]]))

# Clean strings that are matched on
if (clean) {
a_col <- gsub("[[:punct:] ]", "", dplyr::pull(a, by_a))
b_col <- gsub("[[:punct:] ]", "", dplyr::pull(b, by_b))
a_col <- tolower(gsub("[[:punct:] ]", "", dplyr::pull(a, by_a)))
b_col <- tolower(gsub("[[:punct:] ]", "", dplyr::pull(b, by_b)))
} else {
a_col <- dplyr::pull(a, by_a)
b_col <- dplyr::pull(b, by_b)
}

max_chars <- max(c(nchar(a_col), nchar(b_col)))

thresh_prob <- hamming_probability(threshold, max_chars, n_bands, band_width)

if (thresh_prob < .95) {
str <- paste0(
"A pair of records at the threshold (", threshold,
Expand All @@ -44,8 +44,6 @@ hamming_join <- function(a, b, mode, by, n_bands, band_width,
warning(str)
}



match_table <- rust_hamming_join(
a_col, b_col,
band_width, n_bands, threshold,
Expand Down
160 changes: 85 additions & 75 deletions R/hamming_logical_joins.R
Original file line number Diff line number Diff line change
Expand Up @@ -73,100 +73,110 @@
#' band_width = 10,
#' )
hamming_inner_join <- function(a, b,
by = NULL,
n_bands = 100,
band_width = 100,
threshold = 2,
progress = FALSE,
clean = FALSE,
similarity_column=NULL) {
hamming_join(a, b, mode = "inner",
by = by,
n_bands = n_bands,
band_width = band_width,
threshold = threshold,
progress = progress,
similarity_column = similarity_column,
clean=clean)
by = NULL,
n_bands = 100,
band_width = 100,
threshold = 2,
progress = FALSE,
clean = FALSE,
similarity_column = NULL) {
hamming_join(a, b,
mode = "inner",
by = by,
n_bands = n_bands,
band_width = band_width,
threshold = threshold,
progress = progress,
similarity_column = similarity_column,
clean = clean
)
}

#' @rdname hamming-joins
#' @export
hamming_anti_join <- function(a, b,
by = NULL,
n_bands = 100,
band_width = 100,
threshold = 2,
progress = FALSE,
clean = FALSE,
similarity_column=NULL) {
hamming_join(a, b, mode = "anti",
by = by,
n_bands = n_bands,
band_width = band_width,
threshold = threshold,
progress = progress,
similarity_column = similarity_column,
clean=clean)
by = NULL,
n_bands = 100,
band_width = 100,
threshold = 2,
progress = FALSE,
clean = FALSE,
similarity_column = NULL) {
hamming_join(a, b,
mode = "anti",
by = by,
n_bands = n_bands,
band_width = band_width,
threshold = threshold,
progress = progress,
similarity_column = similarity_column,
clean = clean
)
}

#' @rdname hamming-joins
#' @export
hamming_left_join <- function(a, b,
by = NULL,
n_bands = 100,
band_width = 100,
threshold = 2,
progress = FALSE,
clean = FALSE,
similarity_column=NULL) {
hamming_join(a, b, mode = "left",
by = by,
n_bands = n_bands,
band_width = band_width,
threshold = threshold,
progress = progress,
similarity_column = similarity_column,
clean=clean)
by = NULL,
n_bands = 100,
band_width = 100,
threshold = 2,
progress = FALSE,
clean = FALSE,
similarity_column = NULL) {
hamming_join(a, b,
mode = "left",
by = by,
n_bands = n_bands,
band_width = band_width,
threshold = threshold,
progress = progress,
similarity_column = similarity_column,
clean = clean
)
}

#' @rdname hamming-joins
#' @export
hamming_right_join <- function(a, b,
by = NULL,
n_bands = 100,
band_width = 100,
threshold = 2,
progress = FALSE,
clean = FALSE,
similarity_column=NULL) {
hamming_join(a, b, mode = "right",
by = by,
n_bands = n_bands,
band_width = band_width,
threshold = threshold,
progress = progress,
similarity_column = similarity_column,
clean=clean)
by = NULL,
n_bands = 100,
band_width = 100,
threshold = 2,
progress = FALSE,
clean = FALSE,
similarity_column = NULL) {
hamming_join(a, b,
mode = "right",
by = by,
n_bands = n_bands,
band_width = band_width,
threshold = threshold,
progress = progress,
similarity_column = similarity_column,
clean = clean
)
}


#' @rdname hamming-joins
#' @export
hamming_full_join <- function(a, b,
by = NULL,
n_bands = 100,
band_width = 100,
threshold = 2,
progress = FALSE,
clean = FALSE,
similarity_column=NULL) {
hamming_join(a, b, mode = "full",
by = by,
n_bands = n_bands,
band_width = band_width,
threshold = threshold,
progress = progress,
similarity_column = similarity_column,
clean=clean)
by = NULL,
n_bands = 100,
band_width = 100,
threshold = 2,
progress = FALSE,
clean = FALSE,
similarity_column = NULL) {
hamming_join(a, b,
mode = "full",
by = by,
n_bands = n_bands,
band_width = band_width,
threshold = threshold,
progress = progress,
similarity_column = similarity_column,
clean = clean
)
}
12 changes: 7 additions & 5 deletions R/jaccard_similarity.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@ jaccard_similarity <- function(a, b, ngram_width = 2) {
#' @return a vector of Hamming distances between the strings
#'
#' @examples
#' hamming_distance(c("the quick brown fox","jumped over the lazy dog"),
#' c("the quck bron fx","jumped over hte lazy dog"))
#' hamming_distance(
#' c("the quick brown fox", "jumped over the lazy dog"),
#' c("the quck bron fx", "jumped over hte lazy dog")
#' )
#'
#' @export
hamming_distance <- function(a, b, ngram_width=2) {
stopifnot(length(a) == length(b))
rust_hamming_distance(a, b)
hamming_distance <- function(a, b, ngram_width = 2) {
stopifnot(length(a) == length(b))
rust_hamming_distance(a, b)
}
24 changes: 12 additions & 12 deletions R/lsh_properties.R
Original file line number Diff line number Diff line change
Expand Up @@ -180,20 +180,20 @@ jaccard_hyper_grid_search <- function(s1 = .1, s2 = .7, p1 = .001, p2 = .999) {
#'
#' @export
hamming_probability <- function(distance, input_length, n_bands, band_width) {
# probability that two strings with distance d have same value for randomly
# chosen bit
p_one_collision <- 1-(distance/input_length)
# probability that two strings with distance d have same value for randomly
# chosen bit
p_one_collision <- 1 - (distance / input_length)

# probability that two strings with distance d have same value for band_width
# randomly chosen bits
p_one_band <- p_one_collision^band_width
# probability that two strings with distance d have same value for band_width
# randomly chosen bits
p_one_band <- p_one_collision^band_width

# probability that two strings with distance d have same value for one of any
# n_bands hashes
# Pr[compared] = 1 - Pr[no hashes match]
# = 1 - Pr[one hash does not match]^n_bands
# probability that two strings with distance d have same value for one of any
# n_bands hashes
# Pr[compared] = 1 - Pr[no hashes match]
# = 1 - Pr[one hash does not match]^n_bands

p_compared <- 1-(1-p_one_band)^n_bands
p_compared <- 1 - (1 - p_one_band)^n_bands

return(p_compared)
return(p_compared)
}
6 changes: 4 additions & 2 deletions man/hamming_distance.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 3 additions & 5 deletions tests/testthat/test-hamming_dist.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
test_that("hamming distance works", {

require(babynames)
require(stringdist)

Expand All @@ -9,12 +8,11 @@ test_that("hamming distance works", {


a <- hamming_distance(nameys, shuff_nameys)
b <- stringdist(nameys, shuff_nameys, method = "hamming")
b <- stringdist(nameys, shuff_nameys, method = "hamming")

expect_true(all((a == Inf) %in% (b==Inf)))
expect_true(all((b == Inf) %in% (a==Inf)))
expect_true(all((a == Inf) %in% (b == Inf)))
expect_true(all((b == Inf) %in% (a == Inf)))

expect_true(all(abs(a[a != Inf] - b[b != Inf]) < .01))

}
})
1 change: 0 additions & 1 deletion tests/testthat/test-test_jaccard_sim.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,3 @@ test_that("jaccard sim works", {
expect_true(all(abs(a - b) < .01))
}
})

6 changes: 2 additions & 4 deletions tests/testthat/test-test_logical_lsh_join.R
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ test_that("hamming_full_join works on tiny dataset", {

test_that("hamming_left_join works on tiny dataset", {
capture_messages(
test <- hamming_left_join(dataset_1, dataset_2, threshold = 3, band_width = 1, n_bands = 300)
test <- hamming_left_join(dataset_1, dataset_2, threshold = 3, band_width = 1, n_bands = 300)
)

expect_true(all(test$id_1 == test$id_2, na.rm = T))
Expand All @@ -194,7 +194,7 @@ test_that("hamming_left_join works on tiny dataset", {

test_that("hamming_right_join works on tiny dataset", {
capture_messages(
test <- hamming_right_join(dataset_1, dataset_2, threshold = 3, band_width = 1, n_bands = 300)
test <- hamming_right_join(dataset_1, dataset_2, threshold = 3, band_width = 1, n_bands = 300)
)


Expand All @@ -207,7 +207,6 @@ test_that("hamming_right_join works on tiny dataset", {
test_that("jaccard_inner_join gives same results as stringdist_inner_join", {
for (i in 1:20) {
capture_messages({

zoomer_join_out <- hamming_inner_join(names_df, misspelled_name_df, threshold = 3, n_bands = 100, band_width = 1) %>%
arrange(id_1, id_2)

Expand Down Expand Up @@ -266,4 +265,3 @@ test_that("argument `progress` works correctly", {
"generating"
)
})

10 changes: 5 additions & 5 deletions tests/testthat/test-test_lsh_properties.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@ test_that("jaccard_hyper_grid_search validates inputs are length 1", {
})

test_that("hamming_probabilitiy gives accurate results", {
expect_equal(hamming_probability(0,10,10,1),1)
expect_equal(hamming_probability(0, 10, 10, 1), 1)

expect_equal(hamming_probability(1,10,1,1),.9)
expect_equal(hamming_probability(1,10,2,1),.99)
expect_equal(hamming_probability(1,10,3,1),.999)
expect_equal(hamming_probability(1, 10, 1, 1), .9)
expect_equal(hamming_probability(1, 10, 2, 1), .99)
expect_equal(hamming_probability(1, 10, 3, 1), .999)

expect_equal(hamming_probability(10,10,10,1),0)
expect_equal(hamming_probability(10, 10, 10, 1), 0)
})

0 comments on commit 3b607a4

Please sign in to comment.