man/deduplicate.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/deduplication_functions.R
\name{deduplicate}
\alias{deduplicate}
\title{Remove duplicates from a bibliographic data set}
\usage{
deduplicate(data, match_by, method, type = "merge", ...)
}
\arguments{
\item{data}{A \code{data.frame} containing bibliographic information.}

\item{match_by}{Name of the column in \code{data} where duplicates should be sought.}

\item{method}{The duplicate detection function to use; see \code{\link{string_}} or \code{\link{fuzz_}} for examples. Passed to \code{find_duplicates}.}

\item{type}{How should entries be selected? Default is \code{"merge"} which selects the entries with the largest number of characters in each column. Alternatively \code{"select"} returns the row with the highest total number of characters.}

\item{\dots}{Arguments passed to \code{find_duplicates}.}
}
\value{
A \code{data.frame} containing data identified as unique.
}
\description{
Removes duplicates using sensible defaults.
}
\details{
This is a wrapper around \code{\link{find_duplicates}} and \code{\link{extract_unique_references}}, which tries to choose some sensible defaults. Use with care.
}
\examples{
# Example data: four complete references followed by two incomplete records
# whose titles are differently formatted variants of the first two titles.
refs <- data.frame(
  title = c(
    "EviAtlas: a tool for visualising evidence synthesis databases",
    "revtools: An R package to support article screening for evidence synthesis",
    "An automated approach to identifying search terms for systematic reviews",
    "Reproducible, flexible and high-throughput data extraction from primary literature",
    "eviatlas:tool for visualizing evidence synthesis databases.",
    "REVTOOLS a package to support article-screening for evidence synthsis"
  ),
  year = c(rep("2019", 4), NA, NA),
  authors = c(
    "Haddaway et al",
    "Westgate",
    "Grames et al",
    "Pick et al",
    NA,
    NA
  ),
  stringsAsFactors = FALSE
)

# Two-step workflow: first look for duplicated titles, ignoring case and
# punctuation ...
title_matches <- find_duplicates(
  refs[["title"]],
  method = "string_osa",
  rm_punctuation = TRUE,
  to_lower = TRUE
)

# ... then use those matches to reduce the data to unique references.
extract_unique_references(refs, matches = title_matches)

# The same settings in a single call; the extra arguments are passed on to
# find_duplicates().
deduplicate(
  refs,
  match_by = "title",
  method = "string_osa",
  rm_punctuation = TRUE,
  to_lower = TRUE
)
}
\seealso{
\code{\link{find_duplicates}} and \code{\link{extract_unique_references}} for underlying functions.
}