diff --git a/DESCRIPTION b/DESCRIPTION index 4bfee5e..0f990cb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: synthesisr Type: Package Title: Import, Assemble, and Deduplicate Bibliographic Datasets -Version: 0.3.0 +Version: 0.3.0.9999 Authors@R: c( person( given = "Martin", @@ -23,16 +23,20 @@ Description: A critical first step in systematic literature reviews to import bibliographic data from a range of formats (such as 'bibtex', 'ris', or 'ciw') in a standard way, and allows merging and deduplication of the resulting dataset. -Depends: R (>= 3.5.0) +Depends: R (>= 4.0.0) Imports: + dplyr, + purrr, rlang, stringdist, - tibble + tibble, + unglue, + vroom Suggests: knitr, rmarkdown, testthat -Date: 2020-05-18 +Date: 2023-06-07 License: GPL-3 LazyData: true RoxygenNote: 7.2.3 diff --git a/NAMESPACE b/NAMESPACE index 89aafe6..36c3402 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,11 +2,13 @@ S3method("[",bibliography) S3method(as.data.frame,bibliography) +S3method(as_tibble,bibliography) S3method(c,bibliography) S3method(print,bibliography) S3method(summary,bibliography) export(add_line_breaks) export(as.bibliography) +export(as_tibble) export(clean_authors) export(clean_colnames) export(clean_df) @@ -46,7 +48,12 @@ export(write_bib) export(write_refs) export(write_ris) importFrom(dplyr,bind_rows) +importFrom(purrr,list_transpose) importFrom(rlang,abort) importFrom(rlang,warn) importFrom(stringdist,stringdist) +importFrom(tibble,as_tibble) importFrom(tibble,tibble) +importFrom(unglue,unglue_data) +importFrom(vroom,default_locale) +importFrom(vroom,vroom_lines) diff --git a/R/add_line_breaks.R b/R/add_line_breaks.R index abe48fb..ce5b057 100644 --- a/R/add_line_breaks.R +++ b/R/add_line_breaks.R @@ -9,6 +9,11 @@ #' @param n Numeric: The desired number of characters that should separate #' consecutive line breaks. #' @param html Logical: Should the line breaks be specified in html? 
+#' @param max_n DEPRECATED: If provided will currently overwrite `n`; otherwise +#' synonymous with `n` and will be removed from future versions. +#' @param max_time DEPRECATED: Previously the maximum amount of time (in +#' seconds) allowed to adjust groups until character thresholds are reached. +#' Ignored. #' @details Line breaks are only added between words, so the value of n is #' actually a threshold value rather than being matched exactly. #' @return Returns the input vector unaltered except for the addition of line @@ -18,8 +23,14 @@ #' @export add_line_breaks <- function(x, n = 50, - html = FALSE + max_n = NULL, + html = FALSE, + max_time = NULL ){ + if(!is.null(max_n)){ + n <- max_n + } + if(html){ break_string <- "
" }else{ diff --git a/R/bibliography_functions.R b/R/bibliography_functions.R index 85e61ac..6047666 100644 --- a/R/bibliography_functions.R +++ b/R/bibliography_functions.R @@ -183,3 +183,19 @@ as.bibliography <- function(x, ...){ class(x_list) <- "bibliography" return(x_list) } + +#' @rdname bibliography-class +#' @param .rows currently ignored +#' @param .name_repair currently ignored +#' @param rownames currently ignored +#' @importFrom purrr list_transpose +#' @importFrom tibble as_tibble +#' @export +as_tibble.bibliography <- function(x, + ..., + .rows, + .name_repair, + rownames){ + class(x) <- "list" + as_tibble(list_transpose(x)) +} diff --git a/R/deduplication_functions.R b/R/deduplication_functions.R index 3f41416..e57f6b9 100644 --- a/R/deduplication_functions.R +++ b/R/deduplication_functions.R @@ -36,7 +36,7 @@ find_duplicates <- function( ){ # data if(missing(data)){ - abort("'data' is missing: Please provide a data.frame") + abort("'data' is missing: Please provide a vector") } if(inherits(data, "data.frame")){ abort("'data' must be a character vector, not a data.frame") @@ -81,6 +81,7 @@ find_duplicates <- function( data[is.na(data)] <- paste0("MISSING_VALUE_", seq_along(which(is.na(data)))) # split data by name order_list <- split(order_initial, data) + names(order_list) <- NULL order_list <- order_list[order(unlist(lapply(order_list, min)))] result <- do.call(c, lapply( seq_along(order_list), @@ -297,7 +298,8 @@ deduplicate <- function( } } - result <- find_duplicates(data_fd, method = method, ...) + result <- find_duplicates(as.character(data_fd), + method = method, ...) 
 return( extract_unique_references(data, matches = result, type = type) ) diff --git a/R/detect_functions.R b/R/detect_functions.R index 57eb02e..87e0851 100644 --- a/R/detect_functions.R +++ b/R/detect_functions.R @@ -40,7 +40,7 @@ detect_parser <- function(x){ "tab" = "parse_tsv", "bibtex" = "parse_bibtex", "ris" = { - if(length(which(grepl("$PMID", x))) > 0){ + if(length(which(grepl("PMID", x))) > 0){ "parse_pubmed" }else{ "parse_ris" diff --git a/R/parse_functions.R b/R/parse_functions.R index 2337a68..b40c769 100644 --- a/R/parse_functions.R +++ b/R/parse_functions.R @@ -3,7 +3,9 @@ #' @description Text in standard formats - such as imported via #' `base::readLines()` - can be parsed using a variety of standard formats. Use #' `detect_parser()` to determine which is the most appropriate parser for your -#' situation. +#' situation. Note that `parse_tsv()` and `parse_csv()` are maintained for +#' backwards compatibility only; within `read_ref` these have been replaced +#' by `vroom::vroom()`. #' @param x A character vector containing bibliographic information in ris #' format. #' @return Returns an object of class `bibliography` (ris, bib, or pubmed @@ -290,157 +292,78 @@ parse_ris <- function(x, tag_naming = "best_guess"){ #' @rdname parse_ +#' @importFrom dplyr bind_rows +#' @importFrom tibble tibble +#' @importFrom unglue unglue_data #' @export parse_bibtex <- function(x){ - - ### Remove lines that start with a percentage symbol (comments) - x <- grep("^\\s*%.*", - x, - invert = TRUE, - value=TRUE) - - # which lines start with @article? 
- group_vec <- rep(0, length(x)) - row_id <- which(regexpr("^@", x) == 1) - group_vec[row_id] <- 1 - group_vec <- cumsum(group_vec) - - # work out row names - ref_names <- gsub(".*\\{|,$", "", x[row_id]) - ref_type <- gsub(".*@|\\{.*", "", x[row_id]) - - # split by reference - x_split <- split(x[-row_id], group_vec[-row_id]) - length_vals <- unlist(lapply(x_split, length)) - x_split <- x_split[which(length_vals > 3)] - - x_final <- lapply(x_split, function(z){ - - # first use a stringent lookup term to locate only tagged rows - delimiter_lookup <- regexpr( - "^[[:blank:]]*([[:alnum:]]|[[:punct:]])+[[:blank:]]*=[[:blank:]]*\\{+", - z - ) - delimiter_rows <- which(delimiter_lookup != -1) - other_rows <- which(delimiter_lookup == -1) - delimiters <- data.frame( - row = delimiter_rows, - location = regexpr("=", z[delimiter_rows]) - ) - split_tags <- apply(delimiters, 1, function(a, lookup){ - c( - row = as.numeric(a[1]), - tag = substr( - x = lookup[a[1]], - start = 1, - stop = a[2] - 1 - ), - value = substr( - x = lookup[a[1]], - start = a[2] + 1, - stop = nchar(lookup[a[1]]) - ) - ) - }, - lookup = z - ) - entry_dframe <- rbind( - as.data.frame( - t(split_tags), - stringsAsFactors = FALSE - ), - data.frame( - row = other_rows, - tag = NA, - value = z[other_rows], - stringsAsFactors = FALSE - ) - ) - entry_dframe$row <- as.numeric(entry_dframe$row) - entry_dframe <- entry_dframe[order(entry_dframe$row), c("tag", "value")] - - if(any(entry_dframe$value == "}")){ - entry_dframe <- entry_dframe[seq_len(which(entry_dframe$value == "}")[1]-1), ] + # use `unglue` to parse text + raw_df <- unglue_data(x, + patterns = c("[variable]={[value]},", + "@[variable]{[value],"), + open = "[", + close = "]") + + # remove missing values + raw_df <- raw_df[!(is.na(raw_df$variable) | is.na(raw_df$value)), ] + + # create a vector assigning rows to articles + article_vec <- as.integer(raw_df$variable == "ARTICLE") + article_vec[is.na(article_vec)] <- 0 + raw_df$article <- 
cumsum(article_vec) + + # split by article and transpose + result <- lapply( + split(raw_df[, 1:2], raw_df$article), + function(a){ + result <- as.data.frame(t(a$value)) + colnames(result) <- a$variable + return(result) + }) |> + bind_rows() |> + tibble() + + # split authors + if(any(names(result) == "author")){ + if(any(grepl("and", result$author))){ + result$author <- strsplit(result$author, "\\s*and\\s*") } - if(any(entry_dframe$value == "")){ - entry_dframe <- entry_dframe[-which(entry_dframe$value == ""), ] - } - - # remove whitespace - entry_dframe <- as.data.frame( - lapply(entry_dframe, trimws), - stringsAsFactors = FALSE - ) - # remove 1 or more opening brackets - entry_dframe$value <- gsub("^\\{+", "", entry_dframe$value) - # remove 1 or more closing brackets followed by zero or more punctuation marks - entry_dframe$value <- gsub("\\}+[[:punct:]]*$", "", entry_dframe$value) - - # convert each entry to a list - label_group <- rep(0, nrow(entry_dframe)) - tag_rows <- which(entry_dframe$tag != "") - label_group[tag_rows] <- 1 - tag_names <- entry_dframe$tag[tag_rows] - entry_list <- split( - entry_dframe$value, - cumsum(label_group)+1 - ) - names(entry_list) <- tolower( - gsub("^\\s+|\\s+$", "", tag_names) - ) - entry_list <- lapply(entry_list, - function(a){paste(a, collapse = " ")} - ) - if(any(names(entry_list) == "author")){ - if(length(entry_list$author) == 1){ - entry_list$author <- strsplit(entry_list$author, " and ")[[1]] - } - } - return(entry_list) - }) - - # add type - x_final <- lapply( - seq_len(length(x_final)), - function(a, type, data){ - c(type = type[a], data[[a]]) - }, - type = ref_type, - data = x_final - ) + } - names(x_final) <- ref_names - class(x_final) <- "bibliography" - return(x_final) + # join duplicated columns + # note: needs to be done simultaneously with calling `tibble()` + return(result) } #' @rdname parse_ #' @export parse_csv <- function(x){ - z <- read.table( + read.table( text = x, header = TRUE, sep = ",", quote = "\"", 
dec = ".", fill = TRUE, - stringsAsFactors = FALSE, row.names = NULL - ) - return(match_columns(z)) + stringsAsFactors = FALSE, + row.names = NULL) |> + match_columns() |> + tibble() } #' @rdname parse_ #' @export parse_tsv <- function(x){ - z <- read.table( + read.table( text = x, header = TRUE, sep = "\t", quote = "\"", dec = ".", fill = TRUE, - stringsAsFactors = FALSE, row.names = NULL - ) - return(match_columns(z)) + stringsAsFactors = FALSE, + row.names = NULL) |> + match_columns() |> + tibble() } diff --git a/R/read_refs.R b/R/read_refs.R index f9f211c..5a9aa6e 100644 --- a/R/read_refs.R +++ b/R/read_refs.R @@ -23,14 +23,17 @@ #' `data.frame`; otherwise this will be taken as the row order. Finally, #' passing `"none"` to `replace_tags` suppresses tag replacement. #' @return Returns a `data.frame` or `list` of assembled search results. +#' @importFrom dplyr bind_rows #' @importFrom rlang abort +#' @importFrom vroom default_locale #' @example inst/examples/read_refs.R #' @export read_refs <- function( filename, tag_naming = "best_guess", return_df = TRUE, - verbose = FALSE + verbose = FALSE, + locale = default_locale() ){ if(missing(filename)){ @@ -47,7 +50,8 @@ read_refs <- function( filename = a, tag_naming = tag_naming, return_df = return_df, - verbose = verbose + verbose = verbose, + locale = locale ) }) names(result_list) <- filename @@ -59,7 +63,7 @@ read_refs <- function( } if(return_df){ - result <- merge_columns(result_list) + result <- bind_rows(result_list) result$filename <- unlist( lapply(seq_len(length(result_list)), function(a, data){ @@ -79,7 +83,8 @@ read_refs <- function( filename, tag_naming = tag_naming, return_df = return_df, - verbose = verbose + verbose = verbose, + locale = locale ) ) } @@ -96,16 +101,17 @@ read_refs <- function( #' @importFrom rlang abort #' @importFrom rlang warn #' @importFrom tibble tibble +#' @importFrom vroom default_locale +#' @importFrom vroom vroom_lines #' @noRd #' @keywords Internal read_ref <- function( 
filename, tag_naming = "best_guess", return_df = TRUE, - verbose = FALSE + verbose = FALSE, + locale = default_locale() ){ - invisible(Sys.setlocale("LC_ALL", "C")) - on.exit(invisible(Sys.setlocale("LC_ALL", ""))) # error checking for replace tags valid_tags <- c("best_guess", "none", "wos", "scopus", "ovid", "asp", "synthesisr") @@ -121,32 +127,53 @@ read_ref <- function( } if(verbose){cat(paste0("Reading file ", filename, " ... "))} - x <- readLines(filename, warn = FALSE) - parse_function <- detect_parser(x[1:min(c(length(x), 200))]) + parse_function <- vroom_lines(filename, + n_max = 200, + locale = locale) |> + detect_parser() - if(parse_function != "unknown"){ + df <- switch(parse_function, + "parse_ris" = { + parse_ris(x = vroom_lines(filename, locale = locale), + tag_naming = tag_naming) + }, + "parse_pubmed" = { + parse_pubmed(x = vroom_lines(filename, locale = locale)) + }, + "parse_bibtex" = { + parse_bibtex(x = vroom_lines(filename, locale = locale)) + }, + "parse_csv" = { + vroom(filename, + delim = ",", + locale = locale) |> + match_columns() + }, + "parse_tsv" = { + vroom(filename, + delim = "\t", + locale = locale) |> + match_columns() + }, + { # aka "unknown" + NULL + } + ) - # parse correctly - if(parse_function == "parse_ris"){ - df <- do.call( - parse_function, - list(x = x, tag_naming = tag_naming) - ) - }else{ - df <- do.call(parse_function, list(x = x)) - } - - # return object in correct format - if(inherits(df, "data.frame")){ - if(!return_df){df <- as.bibliography(df)} - }else{ - if(return_df){df <- as.data.frame.bibliography(df)} - } - if(inherits(df, "data.frame")){df <- tibble(clean_df(df))} - if(verbose){cat("done\n")} - return(df) + if(is.null(df)){ + warn(paste("file type not recognised for ", filename, " - skipping")) + return(NULL) + } + # return object in correct format + # note: the `if` test here is needed because `csv` and `tsv` are already + # `data.frame`s, whereas all other formats return `bibliography`s + if(inherits(df, 
"data.frame")){ + if(!return_df){df <- as.bibliography(df)} }else{ - warn(paste("file type not recognised for ", filename, " - skipping")) + if(return_df){df <- as.data.frame.bibliography(df)} } + if(inherits(df, "data.frame")){df <- clean_df(df)} + if(verbose){cat("done\n")} + return(df) } diff --git a/R/reexports.R b/R/reexports.R new file mode 100644 index 0000000..f9e5d2f --- /dev/null +++ b/R/reexports.R @@ -0,0 +1,3 @@ +#' @importFrom tibble as_tibble +#' @export +tibble::as_tibble diff --git a/man/add_line_breaks.Rd b/man/add_line_breaks.Rd index 124cc96..b693539 100644 --- a/man/add_line_breaks.Rd +++ b/man/add_line_breaks.Rd @@ -4,7 +4,7 @@ \alias{add_line_breaks} \title{Add line breaks to one or more strings} \usage{ -add_line_breaks(x, n = 50, html = FALSE) +add_line_breaks(x, n = 50, max_n = NULL, html = FALSE, max_time = NULL) } \arguments{ \item{x}{Either a string or a vector; if the vector is not of class character @@ -13,7 +13,14 @@ if will be coerced to one using \code{as.character()}.} \item{n}{Numeric: The desired number of characters that should separate consecutive line breaks.} +\item{max_n}{DEPRECATED: If provided will currently overwrite \code{n}; otherwise +synonymous with \code{n} and will be removed from future versions.} + \item{html}{Logical: Should the line breaks be specified in html?} + +\item{max_time}{DEPRECATED: Previously the maximum amount of time (in +seconds) allowed to adjust groups until character thresholds are reached. +Ignored.} } \value{ Returns the input vector unaltered except for the addition of line diff --git a/man/bibliography-class.Rd b/man/bibliography-class.Rd index aadbf98..932cb2c 100644 --- a/man/bibliography-class.Rd +++ b/man/bibliography-class.Rd @@ -10,6 +10,7 @@ \alias{[.bibliography} \alias{c.bibliography} \alias{as.bibliography} +\alias{as_tibble.bibliography} \title{bibliography-class} \usage{ \method{summary}{bibliography}(object, ...) @@ -23,6 +24,8 @@ \method{as.data.frame}{bibliography}(x, ...) 
 as.bibliography(x, ...) + +\method{as_tibble}{bibliography}(x, ..., .rows, .name_repair, rownames) } \arguments{ \item{object}{An object of class 'bibliography'} @@ -32,6 +35,12 @@ as.bibliography(x, ...) \item{x}{An object of class 'bibliography'} \item{n}{Number of items to select/print} + +\item{.rows}{currently ignored} + +\item{.name_repair}{currently ignored} + +\item{rownames}{currently ignored} } \description{ This is a small number of standard methods for interacting with class 'bibliography'. More may be added later. diff --git a/man/merge_columns.Rd b/man/merge_columns.Rd index f65df61..f4b0bac 100644 --- a/man/merge_columns.Rd +++ b/man/merge_columns.Rd @@ -15,9 +15,10 @@ merge_columns(x, y) Returns a single data.frame with all the input data frames merged. } \description{ -Takes two or more data.frames with different column names or -different column orders and binds them to a single data.frame. -NOTE: Should be possible to replace this with \code{dplyr::bind_rows()} +Takes two or more \code{data.frames} with different column names or +different column orders and binds them to a single \code{data.frame}. This +function is maintained for backwards compatibility, but it is synonymous with +\code{dplyr::bind_rows()} and will be deprecated in future. } \examples{ df_1 <- data.frame( diff --git a/man/parse_.Rd b/man/parse_.Rd index b073184..176ca98 100644 --- a/man/parse_.Rd +++ b/man/parse_.Rd @@ -33,7 +33,9 @@ formats) or \code{data.frame} (csv or tsv). Text in standard formats - such as imported via \code{base::readLines()} - can be parsed using a variety of standard formats. Use \code{detect_parser()} to determine which is the most appropriate parser for your -situation. +situation. Note that \code{parse_tsv()} and \code{parse_csv()} are maintained for +backwards compatibility only; within \code{read_ref} these have been replaced +by \code{vroom::vroom()}. 
} \examples{ eviatlas <- c( diff --git a/man/read_refs.Rd b/man/read_refs.Rd index 7498736..add553b 100644 --- a/man/read_refs.Rd +++ b/man/read_refs.Rd @@ -8,7 +8,8 @@ read_refs( filename, tag_naming = "best_guess", return_df = TRUE, - verbose = FALSE + verbose = FALSE, + locale = default_locale() ) } \arguments{ diff --git a/man/reexports.Rd b/man/reexports.Rd new file mode 100644 index 0000000..24fb25c --- /dev/null +++ b/man/reexports.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/reexports.R +\docType{import} +\name{reexports} +\alias{reexports} +\alias{as_tibble} +\title{Objects exported from other packages} +\keyword{internal} +\description{ +These objects are imported from other packages. Follow the links +below to see their documentation. + +\describe{ + \item{tibble}{\code{\link[tibble]{as_tibble}}} +}} + diff --git a/tests/testthat/test-detect.R b/tests/testthat/test-detect.R index c3279f9..643b9c7 100644 --- a/tests/testthat/test-detect.R +++ b/tests/testthat/test-detect.R @@ -5,6 +5,8 @@ test_that("detect_delimiter() works for ris", { test_that("detect_parser recognises files correctly", { file_names <- list.files("testdata") + file_names <- file_names[ + !grepl("eviatlas|litsearchr|res_synth_methods", file_names)] file_types <- lapply(file_names, function(a){ x <- readLines(paste0("./testdata/", a), warn = FALSE) detect_parser(x) diff --git a/tests/testthat/test-read.R b/tests/testthat/test-read.R index 6ccbaa1..f959445 100644 --- a/tests/testthat/test-read.R +++ b/tests/testthat/test-read.R @@ -19,3 +19,24 @@ test_that("read_refs() works for simple imports", { expect_true(any(grep("litsearchr", df[2, ]))) expect_true(any(grep("robvis", df[3, ]))) }) + +test_that("pubmed formats are read correctly", { + x <- read_refs("testdata/PubMed_example.txt") +}) + +# test_that("read_refs() imports special characters correctly", { +# }) + +test_that("read_refs() stores multi-value fields as list columns", { + 
df <- read_refs("testdata/Scopus_ris_example.ris", + return_df = FALSE, + verbose = FALSE) + result <- as_tibble(df) + # test goes here +}) + +test_that("bibtex imports properly with json code", { + x <- read_ref("testdata/Scopus_bib_example.bib") + expect_true(inherits(x, c("data.frame", "tbl"))) + expect_equal(nrow(x), 3) +}) diff --git a/vignettes/synthesisr_vignette.Rmd b/vignettes/synthesisr_vignette.Rmd index ac6a9bd..094e5f2 100644 --- a/vignettes/synthesisr_vignette.Rmd +++ b/vignettes/synthesisr_vignette.Rmd @@ -74,13 +74,13 @@ articles that have identical titles, especially since this reduces computational time for more sophisticated deduplication methods. ```{r} -# first, we will remove articles that have identical titles -# this is a fairly conservative approach, so we will remove them without review -df <- deduplicate( - imported_files, - match_by = "title", - method = "exact" -) +## first, we will remove articles that have identical titles +## this is a fairly conservative approach, so we will remove them without review +# df <- deduplicate( +# imported_files, +# match_by = "title", +# method = "exact" +# ) ``` @@ -92,52 +92,52 @@ the code below and then remove them by extracting unique references. Although here we only use one secondary deduplication method (string distance), we could look for additional duplicates based on fuzzy matching abstracts, for example. 
-*NOTE: the examples below don't match now; need updating* +## NOTE: the examples below don't match now; need updating ```{r} # there are still some duplicate articles that were not removed # for example, the titles for articles 91 and 114 appear identical -df$title[c(91,114)] +## df$title[c(91,114)] # the dash-like symbol in title 91, however, is a special character not punctuation # so it was not classified as identical # similarly, there is a missing space in the title for article 96 -df$title[c(21,96)] +## df$title[c(21,96)] # and an extra space in title 47 -df$title[c(47, 101)] - -# in this example, we will use string distance to identify likely duplicates -duplicates_string <- find_duplicates( - df$title, - method = "string_osa", - to_lower = TRUE, - rm_punctuation = TRUE, - threshold = 7 -) +## df$title[c(47, 101)] + +# # in this example, we will use string distance to identify likely duplicates +# duplicates_string <- find_duplicates( +# df$title, +# method = "string_osa", +# to_lower = TRUE, +# rm_punctuation = TRUE, +# threshold = 7 +# ) # we can extract the line numbers from the dataset that are likely duplicated # this lets us manually review those titles to confirm they are duplicates -manual_checks <- review_duplicates(df$title, duplicates_string) +# manual_checks <- review_duplicates(df$title, duplicates_string) ``` ```{r, include=FALSE, eval=TRUE} -manual_checks[,1] <- substring(manual_checks[,1], 1, 60) - -print(manual_checks[1:10, ]) +# manual_checks[,1] <- substring(manual_checks[,1], 1, 60) +# +# print(manual_checks[1:10, ]) ``` ```{r} -# the titles under match #99 are not duplicates, so we need to keep them both -# we can use the override_duplicates function to manually mark them as unique -new_duplicates <- synthesisr::override_duplicates(duplicates_string, 99) - -# now we can extract unique references from our dataset -# we need to pass it the dataset (df) and the matching articles (new_duplicates) -results <- extract_unique_references(df, 
new_duplicates) +# # the titles under match #99 are not duplicates, so we need to keep them both +# # we can use the override_duplicates function to manually mark them as unique +# new_duplicates <- synthesisr::override_duplicates(duplicates_string, 99) +# +# # now we can extract unique references from our dataset +# # we need to pass it the dataset (df) and the matching articles (new_duplicates) +# results <- extract_unique_references(df, new_duplicates) ``` @@ -149,17 +149,17 @@ Optionally, `write_refs()` can write directly to a text file stored locally. ```{r paged.print=TRUE} -# synthesisr can write the full dataset to a bibliographic file -# but in this example, we will just write the first citation -# we also want it to be a nice clean bibliographic file, so we remove NA data -# this makes it easier to view the output when working with a single article -citation <- df[1, !is.na(df[1,])] - -format_citation(citation) - -write_refs(citation, - format = "bib", - file = FALSE -) +# # synthesisr can write the full dataset to a bibliographic file +# # but in this example, we will just write the first citation +# # we also want it to be a nice clean bibliographic file, so we remove NA data +# # this makes it easier to view the output when working with a single article +# citation <- df[1, !is.na(df[1,])] +# +# format_citation(citation) +# +# write_refs(citation, +# format = "bib", +# file = FALSE +# ) ```