% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/deduplication_functions.R
\name{find_duplicates}
\alias{find_duplicates}
\title{Detect duplicate values}
\usage{
find_duplicates(
  data,
  method = "exact",
  group_by,
  threshold,
  to_lower = FALSE,
  rm_punctuation = FALSE
)
}
\arguments{
\item{data}{A character vector of bibliographic entries to be checked for duplicates.}
\item{method}{A string indicating how matching should be calculated. Either \code{"exact"} for exact matching (the default), or the name of a function for calculating string distance.}
\item{group_by}{An optional vector, data.frame or list containing data to use as 'grouping' variables; that is, categories within which duplicates should be sought. Defaults to NULL, in which case all entries are compared against all others. Ignored if \code{method = "exact"}.}
\item{threshold}{Numeric: the cutoff threshold for deciding if two strings are duplicates. Sensible values depend on the \code{method} chosen. Defaults to 5 if \code{method = "string_osa"} and must be specified in all other instances except \code{method = "exact"} (where no threshold is required).}
\item{to_lower}{Logical: Should all entries be converted to lower case before calculating string distance? Defaults to FALSE.}
\item{rm_punctuation}{Logical: Should punctuation be removed before calculating string distance? Defaults to FALSE.}
}
\value{
Returns a vector of duplicate matches, with \code{attributes} listing methods used.
}
\description{
Identifies duplicate bibliographic entries using different duplicate detection methods.
}
\examples{
my_df <- data.frame(
  title = c(
    "EviAtlas: a tool for visualising evidence synthesis databases",
    "revtools: An R package to support article screening for evidence synthesis",
    "An automated approach to identifying search terms for systematic reviews",
    "Reproducible, flexible and high-throughput data extraction from primary literature",
    "eviatlas:tool for visualizing evidence synthesis databases.",
    "REVTOOLS a package to support article-screening for evidence synthsis"
  ),
  year = c("2019", "2019", "2019", "2019", NA, NA),
  authors = c("Haddaway et al", "Westgate",
    "Grames et al", "Pick et al", NA, NA),
  stringsAsFactors = FALSE
)
# run deduplication
dups <- find_duplicates(
  my_df$title,
  method = "string_osa",
  rm_punctuation = TRUE,
  to_lower = TRUE
)
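# inspect the result: as noted in the Value section, the returned vector
# carries attributes recording the settings used; a quick sketch only,
# since exact attribute names may differ between package versions
attributes(dups)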
extract_unique_references(my_df, matches = dups)
# or, in one line:
deduplicate(my_df, "title",
  method = "string_osa",
  rm_punctuation = TRUE,
  to_lower = TRUE
)
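# the default method = "exact" flags only identical strings and needs no
# threshold; a minimal sketch of that simpler call on the same data:
find_duplicates(my_df$title, method = "exact")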
}
\seealso{
\code{\link{string_}} or \code{\link{fuzz_}} for suitable functions to pass to \code{method}; \code{\link{extract_unique_references}} and \code{\link{deduplicate}} for higher-level functions.
}