From fe74541329131344427a30b6c13dda53bd89da98 Mon Sep 17 00:00:00 2001 From: Beniamino Green Date: Tue, 2 Jul 2024 11:19:53 -0400 Subject: [PATCH 1/7] updated version number to be consistent with CRAN --- DESCRIPTION | 4 +-- man/zoomerjoin-package.Rd | 30 ---------------------- src/rust/Cargo.lock | 53 --------------------------------------- 3 files changed, 2 insertions(+), 85 deletions(-) delete mode 100644 man/zoomerjoin-package.Rd diff --git a/DESCRIPTION b/DESCRIPTION index d4463e8..83b03ec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: zoomerjoin Title: Superlatively Fast Fuzzy Joins -Version: 0.1.2.9000 +Version: 0.1.4.9000 Authors@R: c( - person("Beniamino", "Green", , "beniamino.green@yale.edu", role = c("aut", "cre", "cph")), + person("Beniamino", "Green", , "beniamino.green@tuta.com", role = c("aut", "cre", "cph")), person("Etienne", "Bacher", email = "etienne.bacher@protonmail.com", role = "ctb", comment = c(ORCID = "0000-0002-9271-5075")), person(given = "The authors of the dependency Rust crates", diff --git a/man/zoomerjoin-package.Rd b/man/zoomerjoin-package.Rd deleted file mode 100644 index da4dca2..0000000 --- a/man/zoomerjoin-package.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/extendr-wrappers.R -\docType{package} -\name{zoomerjoin-package} -\alias{zoomerjoin} -\alias{zoomerjoin-package} -\title{zoomerjoin: Superlatively Fast Fuzzy Joins} -\description{ -\if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} - -Empowers users to fuzzily-merge data frames with millions or tens of millions of rows in minutes with low memory usage. 
The package uses the locality sensitive hashing algorithms developed by Datar, Immorlica, Indyk and Mirrokni (2004) \doi{10.1145/997817.997857}, and Broder (1998) \doi{10.1109/SEQUEN.1997.666900} to avoid having to compare every pair of records in each dataset, resulting in fuzzy-merges that finish in linear time. -} -\seealso{ -Useful links: -\itemize{ - \item \url{https://beniamino.org/zoomerjoin/} - \item Report bugs at \url{https://github.com/beniaminogreen/zoomerjoin/issues/} -} - -} -\author{ -\strong{Maintainer}: Beniamino Green \email{beniamino.green@yale.edu} [copyright holder] - -Other contributors: -\itemize{ - \item Etienne Bacher \email{etienne.bacher@protonmail.com} (\href{https://orcid.org/0000-0002-9271-5075}{ORCID}) [contributor] - \item The authors of the dependency Rust crates (see inst/AUTHORS file for details) [contributor, copyright holder] -} - -} diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock index c6feb10..d446586 100644 --- a/src/rust/Cargo.lock +++ b/src/rust/Cargo.lock @@ -7,24 +7,20 @@ name = "autocfg" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "crossbeam-deque" version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "cfg-if", "crossbeam-epoch", @@ -35,7 +31,6 @@ dependencies = [ name = "crossbeam-epoch" version = "0.9.15" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "autocfg", "cfg-if", @@ -48,7 +43,6 @@ dependencies = [ name = "crossbeam-utils" version = "0.8.16" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "cfg-if", ] @@ -57,7 +51,6 @@ dependencies = [ name = "dashmap" version = "5.5.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "cfg-if", "hashbrown", @@ -71,12 +64,10 @@ name = "either" version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "extendr-api" version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "extendr-macros", "libR-sys", @@ -89,7 +80,6 @@ dependencies = [ name = "extendr-macros" version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "proc-macro2", "quote", @@ -100,7 +90,6 @@ dependencies = [ name = "getrandom" version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "cfg-if", "libc", @@ -112,12 +101,10 @@ name = "hashbrown" version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "itertools" version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "either", ] @@ -127,24 +114,20 @@ name = "libR-sys" version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "libc" version = "0.2.151" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "libm" version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "lock_api" version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "autocfg", "scopeguard", @@ -154,7 +137,6 @@ dependencies = [ name = "matrixmultiply" version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "autocfg", "rawpointer", @@ -164,7 +146,6 @@ dependencies = [ name = "memoffset" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "autocfg", ] @@ -173,7 +154,6 @@ dependencies = [ name = "ndarray" version = "0.15.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "matrixmultiply", "num-complex", @@ -187,7 +167,6 @@ dependencies = [ name = "ndarray-rand" version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "ndarray", "rand", @@ -199,12 +178,10 @@ name = "nohash-hasher" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "num-complex" version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "num-traits", ] @@ -213,7 +190,6 @@ dependencies = [ name = "num-integer" version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "autocfg", "num-traits", @@ -223,7 +199,6 @@ dependencies = [ name = "num-traits" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "autocfg", "libm", @@ -234,12 +209,10 @@ name = "once_cell" version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "parking_lot_core" version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "cfg-if", "libc", @@ -253,18 +226,15 @@ name = "paste" version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "ppv-lite86" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "proc-macro2" version = "1.0.70" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "unicode-ident", ] @@ -273,7 +243,6 @@ dependencies = [ name = "quote" version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "proc-macro2", ] @@ -282,7 +251,6 @@ dependencies = [ name = "rand" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "libc", "rand_chacha", @@ -293,7 +261,6 @@ dependencies = 
[ name = "rand_chacha" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "ppv-lite86", "rand_core", @@ -303,7 +270,6 @@ dependencies = [ name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "getrandom", ] @@ -312,7 +278,6 @@ dependencies = [ name = "rand_distr" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "num-traits", "rand", @@ -323,12 +288,10 @@ name = "rawpointer" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "rayon" version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "either", "rayon-core", @@ -338,7 +301,6 @@ dependencies = [ name = "rayon-core" version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "crossbeam-deque", "crossbeam-utils", @@ -348,7 +310,6 @@ dependencies = [ name = "redox_syscall" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "bitflags", ] @@ -358,24 +319,20 @@ name = "rustc-hash" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "scopeguard" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "smallvec" version = "1.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "syn" version = "2.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "proc-macro2", "quote", @@ -387,18 +344,15 @@ name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "windows-targets" version = "0.48.5" 
source = "registry+https://github.com/rust-lang/crates.io-index" - dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", @@ -414,43 +368,36 @@ name = "windows_aarch64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "zoomerjoin" version = "0.1.0" From 8d806fa39dcfeeca5a267f9bd996586be7a6aadf Mon Sep 17 00:00:00 2001 From: Beniamino Green Date: Tue, 2 Jul 2024 13:01:05 -0400 Subject: [PATCH 2/7] updated roxygen2 version --- DESCRIPTION | 2 +- NEWS.md | 7 ++++--- R/extendr-wrappers.R | 1 - README.md | 48 ++++++++++++++++++++++---------------------- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 83b03ec..1b931f6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,7 +13,7 @@ Description: Empowers users to fuzzily-merge data frames with millions or tens o License: GPL (>= 3) Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 SystemRequirements: Cargo (>= 1.56) (Rust's package manager), rustc Imports: collapse, diff --git a/NEWS.md b/NEWS.md index 411c44b..7b925de 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,17 +1,18 @@ -# zoomerjoin (development version) +# zoomerjoin 
0.1.5 ## New features * Several performance improvements (#101, #104). * Added support for joining based on hamming distance (#100). +* Upgraded `extendr` version (#121) ## Bug fixes * When `clean = TRUE`, strings were not coerced to lower case. This is now the case (#105). -* Fix argument `progress`, which didn't print anything when it was `TRUE` (#107). +* Fix argument `progress`, which was inoperative (#107). -# zoomerjoin 0.1.2 +# zoomerjoin 0.1.4 * Submitted Package to CRAN * Add support for new `join_by()` syntax diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 7db7c09..c4a9ccc 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -6,7 +6,6 @@ # This file was created with the following call: # .Call("wrap__make_zoomerjoin_wrappers", use_symbols = TRUE, package_name = "zoomerjoin") -#' @docType package #' @usage NULL #' @useDynLib zoomerjoin, .registration = TRUE NULL diff --git a/README.md b/README.md index 871c82f..449fbb5 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Please be aware that you will have to have Cargo (the rust toolchain and compiler) installed to build the package from source. ``` r -install.packages('zoomerjoin') +install.packages("zoomerjoin") ``` ### Installing from R-Universe: @@ -138,7 +138,7 @@ I start with two corpuses I would like to combine, `corpus_1`: ``` r corpus_1 <- dime_data %>% - head(500) + head(500) names(corpus_1) <- c("a", "field") corpus_1 ``` @@ -162,7 +162,7 @@ And `corpus_2`: ``` r corpus_2 <- dime_data %>% - tail(500) + tail(500) names(corpus_2) <- c("b", "field") corpus_2 ``` @@ -205,7 +205,7 @@ vignette](https://beniamino.org/zoomerjoin/articles/guided_tour.html). 
``` r set.seed(1) start_time <- Sys.time() -join_out <- jaccard_inner_join(corpus_1, corpus_2, n_gram_width=6, n_bands=20, band_width=6) +join_out <- jaccard_inner_join(corpus_1, corpus_2, n_gram_width = 6, n_bands = 20, band_width = 6) ``` ## Warning in jaccard_join(a, b, mode = "inner", by = by, salt_by = block_by, : A pair of records at the threshold (0.7) have only a 92% chance of being compared. @@ -217,7 +217,7 @@ join_out <- jaccard_inner_join(corpus_1, corpus_2, n_gram_width=6, n_bands=20, b print(Sys.time() - start_time) ``` - ## Time difference of 0.01455116 secs + ## Time difference of 0.03253984 secs ``` r print(join_out) @@ -226,25 +226,25 @@ print(join_out) ## # A tibble: 19 × 4 ## a field.x b field.y ## - ## 1 216 kent county republican finance committee 607 lake co… - ## 2 238 4th congressional district democratic party 518 16th co… - ## 3 292 bill bradley for u s senate '84 913 bill br… - ## 4 378 guarini for congress 1982 606 guarini… - ## 5 232 republican county committee of chester county 710 republi… - ## 6 387 committee to re elect congressman staton 805 committ… - ## 7 122 tarrant county republican victory fund 761 lake co… - ## 8 378 guarini for congress 1982 883 guarini… - ## 9 238 4th congressional district democratic party 792 8th con… - ## 10 88 scheuer for congress 1980 667 scheuer… - ## 11 45 dole for senate committee 623 riegle … - ## 12 87 kentucky state democratic central executive committee 639 arizona… - ## 13 319 7th congressional district democratic party of wisconsin 792 8th con… - ## 14 478 united democrats for better government 642 democra… - ## 15 163 davies county republican executive committee 852 warren … - ## 16 230 pipefitters local union 524 998 pipefit… - ## 17 216 kent county republican finance committee 719 harford… - ## 18 302 americans for good government inc 910 america… - ## 19 35 solarz for congress 82 671 solarz … + ## 1 88 scheuer for congress 1980 667 scheuer… + ## 2 35 solarz for congress 82 671 solarz … + 
## 3 378 guarini for congress 1982 883 guarini… + ## 4 163 davies county republican executive committee 852 warren … + ## 5 87 kentucky state democratic central executive committee 639 arizona… + ## 6 302 americans for good government inc 910 america… + ## 7 216 kent county republican finance committee 719 harford… + ## 8 319 7th congressional district democratic party of wisconsin 792 8th con… + ## 9 122 tarrant county republican victory fund 761 lake co… + ## 10 238 4th congressional district democratic party 792 8th con… + ## 11 387 committee to re elect congressman staton 805 committ… + ## 12 478 united democrats for better government 642 democra… + ## 13 45 dole for senate committee 623 riegle … + ## 14 216 kent county republican finance committee 607 lake co… + ## 15 230 pipefitters local union 524 998 pipefit… + ## 16 232 republican county committee of chester county 710 republi… + ## 17 292 bill bradley for u s senate '84 913 bill br… + ## 18 378 guarini for congress 1982 606 guarini… + ## 19 238 4th congressional district democratic party 518 16th co… Zoomerjoin is able to quickly find the matching columns without comparing all pairs of records. 
This saves more and more time as the From 7b9af80d843876575a1381f1e528ba72a6a2bc57 Mon Sep 17 00:00:00 2001 From: Beniamino Green Date: Tue, 2 Jul 2024 13:15:15 -0400 Subject: [PATCH 3/7] edited _pkgdown.yml to remove dummy entry --- _pkgdown.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index a1cbce9..dfa73b4 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -52,7 +52,3 @@ reference: - title: Data contents: - dime_data - - - title: Miscellaneous - contents: - - zoomerjoin-package From 7076fad7cd9d07b1679779447a1c0ad3fa8fa96e Mon Sep 17 00:00:00 2001 From: Beniamino Green Date: Tue, 2 Jul 2024 13:53:28 -0400 Subject: [PATCH 4/7] updated DESCRIPTION, NEWS.md, cran-comments.md --- DESCRIPTION | 4 ++-- NEWS.md | 4 ++-- cran-comments.md | 13 ++----------- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1b931f6..b442651 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: zoomerjoin Title: Superlatively Fast Fuzzy Joins -Version: 0.1.4.9000 +Version: 0.1.5 Authors@R: c( - person("Beniamino", "Green", , "beniamino.green@tuta.com", role = c("aut", "cre", "cph")), + person("Beniamino", "Green", , "beniamino.green@yale.edu", role = c("aut", "cre", "cph")), person("Etienne", "Bacher", email = "etienne.bacher@protonmail.com", role = "ctb", comment = c(ORCID = "0000-0002-9271-5075")), person(given = "The authors of the dependency Rust crates", diff --git a/NEWS.md b/NEWS.md index 7b925de..cf8b324 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,13 +4,13 @@ * Several performance improvements (#101, #104). * Added support for joining based on hamming distance (#100). -* Upgraded `extendr` version (#121) +* Bumped `extendr` to v0.7.0 (#121) ## Bug fixes * When `clean = TRUE`, strings were not coerced to lower case. This is now the case (#105). -* Fix argument `progress`, which was inoperative (#107). +* Fix argument `progress`, which didn't print anything when it was `TRUE` (#107). 
# zoomerjoin 0.1.4 diff --git a/cran-comments.md b/cran-comments.md index abdcc77..0037a2f 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,12 +1,3 @@ -## Resubmission -This is a resubmission. In this version I have: - -* Added DOI's and author names to DESCRIPTION file. -* Removed usage of installed.packages to detect if optional dependency `igraph` is installed. - -There is one note about possibly misspelled words in the DESCRIPTION. These are author names. - -Many thanks for your help! -Ben - +## R CMD check results +0 errors | 0 warnings | 0 notes From b1feb9fe9272e31c2089ba650bc0ad8e7c384b72 Mon Sep 17 00:00:00 2001 From: Beniamino Green Date: Tue, 2 Jul 2024 14:00:20 -0400 Subject: [PATCH 5/7] NEWS.md tweaks --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index cf8b324..1ace7d3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,9 +8,9 @@ ## Bug fixes -* When `clean = TRUE`, strings were not coerced to lower case. This is now the +* Fixed bug where when `clean = TRUE`, strings were not coerced to lower case. case (#105). -* Fix argument `progress`, which didn't print anything when it was `TRUE` (#107). +* Fix argument `progress`, which was inoperative (#107). 
# zoomerjoin 0.1.4 From bad7257e033d1d220713e4d54f4b11576f3cc39f Mon Sep 17 00:00:00 2001 From: Beniamino Green Date: Tue, 2 Jul 2024 14:02:33 -0400 Subject: [PATCH 6/7] Add GitHub links to DESCRIPTION --- DESCRIPTION | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b442651..c4a38f9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -35,12 +35,11 @@ Suggests: tidyverse, vdiffr Config/testthat/edition: 3 -URL: https://beniamino.org/zoomerjoin/ -BugReports: https://github.com/beniaminogreen/zoomerjoin/issues/ +URL: https://beniamino.org/zoomerjoin/, https://github.com/beniaminogreen/zoomerjoin +BugReports: https://github.com/beniaminogreen/zoomerjoin/issues VignetteBuilder: knitr Depends: R (>= 2.10) LazyData: true LazyDataCompression: xz Config/rextendr/version: 0.3.1.9000 - From bc052315aabe2c9a83bb9af78bef314bed038b7c Mon Sep 17 00:00:00 2001 From: Beniamino Green Date: Tue, 2 Jul 2024 15:11:54 -0400 Subject: [PATCH 7/7] fixed typo --- NEWS.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1ace7d3..393c68c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,8 +8,7 @@ ## Bug fixes -* Fixed bug where when `clean = TRUE`, strings were not coerced to lower case. - case (#105). +* Fixed bug where when `clean = TRUE`, strings were not coerced to lower case (#105). * Fix argument `progress`, which was inoperative (#107). # zoomerjoin 0.1.4