Clean up imports, particularly for bibtex files

- use `vroom` for imports to allow `locale` arg (instead of `Sys.setlocale()`, see #24 ) - refactor `parse_bibtex()` to use `unglue()` for > brevity and readability - add `as_tibble()`, for class `bibliography` - make `add_line_breaks()` backwards-compatible
mjwestgate · Jul 9, 2023 · 91d9796 · 91d9796
1 parent 7cda71c
commit 91d9796
Show file tree

Hide file tree

Showing 18 changed files with 266 additions and 214 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: synthesisr
 Type: Package
 Title: Import, Assemble, and Deduplicate Bibliographic Datasets
-Version: 0.3.0
+Version: 0.3.0.9999
 Authors@R: c(
   person(
     given = "Martin",
@@ -23,16 +23,20 @@ Description: A critical first step in systematic literature reviews
   <doi:10.1002/jrsm.1374> to import bibliographic data from a range of formats
   (such as 'bibtex', 'ris', or 'ciw') in a standard way, and allows merging
   and deduplication of the resulting dataset.
-Depends: R (>= 3.5.0)
+Depends: R (>= 4.0.0)
 Imports:
+    dplyr,
+    purrr,
     rlang,
     stringdist,
-    tibble
+    tibble,
+    unglue,
+    vroom
 Suggests: 
     knitr, 
     rmarkdown, 
     testthat
-Date: 2020-05-18
+Date: 2023-06-07
 License: GPL-3
 LazyData: true
 RoxygenNote: 7.2.3

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,11 +2,13 @@
 
 S3method("[",bibliography)
 S3method(as.data.frame,bibliography)
+S3method(as_tibble,bibliography)
 S3method(c,bibliography)
 S3method(print,bibliography)
 S3method(summary,bibliography)
 export(add_line_breaks)
 export(as.bibliography)
+export(as_tibble)
 export(clean_authors)
 export(clean_colnames)
 export(clean_df)
@@ -46,7 +48,12 @@ export(write_bib)
 export(write_refs)
 export(write_ris)
 importFrom(dplyr,bind_rows)
+importFrom(purrr,list_transpose)
 importFrom(rlang,abort)
 importFrom(rlang,warn)
 importFrom(stringdist,stringdist)
+importFrom(tibble,as_tibble)
 importFrom(tibble,tibble)
+importFrom(unglue,unglue_data)
+importFrom(vroom,default_locale)
+importFrom(vroom,vroom_lines)
diff --git a/R/add_line_breaks.R b/R/add_line_breaks.R
@@ -9,6 +9,11 @@
 #' @param n Numeric: The desired number of characters that should separate
 #' consecutive line breaks.
 #' @param html Logical: Should the line breaks be specified in html?
+#' @param max_n DEPRECATED: If provided will currently overwrite `n`; otherwise
+#' synonymous with `n` and will be removed from future versions.
+#' @param max_time DEPRECATED: Previously the maximum amount of time (in
+#' seconds) allowed to adjust groups until character thresholds are reached.
+#' Ignored.
 #' @details Line breaks are only added between words, so the value of n is
 #' actually a threshold value rather than being matched exactly.
 #' @return Returns the input vector unaltered except for the addition of line
@@ -18,8 +23,14 @@
 #' @export
 add_line_breaks <- function(x,
                             n = 50,
-                            html = FALSE
+                            max_n = NULL,
+                            html = FALSE,
+                            max_time = NULL
                             ){
+  if(!is.null(max_n)){
+    n <- max_n
+  }
+
   if(html){
     break_string <- "<br>"
   }else{

diff --git a/R/bibliography_functions.R b/R/bibliography_functions.R
@@ -183,3 +183,19 @@ as.bibliography <- function(x, ...){
   class(x_list) <- "bibliography"
   return(x_list)
 }
+
+#' @rdname bibliography-class
+#' @param .rows currently ignored
+#' @param .name_repair currently ignored
+#' @param rownames currently ignored
+#' @importFrom purrr list_transpose
+#' @importFrom tibble as_tibble
+#' @export
+as_tibble.bibliography <- function(x,
+                                   ...,
+                                   .rows,
+                                   .name_repair,
+                                   rownames){
+  class(x) <- "list"
+  as_tibble(list_transpose(x))
+}
diff --git a/R/deduplication_functions.R b/R/deduplication_functions.R
@@ -36,7 +36,7 @@ find_duplicates <- function(
 ){
   # data
   if(missing(data)){
-    abort("'data' is missing: Please provide a data.frame")
+    abort("'data' is missing: Please provide a vector")
   }
   if(inherits(data, "data.frame")){
     abort("'data' must be a character vector, not a data.frame")
@@ -81,6 +81,7 @@ find_duplicates <- function(
     data[is.na(data)] <- paste0("MISSING_VALUE_", seq_along(which(is.na(data))))
     # split data by name
     order_list <- split(order_initial, data)
+    names(order_list) <- NULL
     order_list <- order_list[order(unlist(lapply(order_list, min)))]
     result <- do.call(c, lapply(
       seq_along(order_list),
@@ -297,7 +298,8 @@ deduplicate <- function(
     }
   }
 
-  result <- find_duplicates(data_fd, method = method, ...)
+  result <- find_duplicates(as.character(data_fd),
+                            method = method, ...)
   return(
     extract_unique_references(data, matches = result, type = type)
   )

diff --git a/R/detect_functions.R b/R/detect_functions.R
@@ -40,7 +40,7 @@ detect_parser <- function(x){
       "tab" = "parse_tsv",
       "bibtex" = "parse_bibtex",
       "ris" = {
-        if(length(which(grepl("$PMID", x))) > 0){
+        if(length(which(grepl("PMID", x))) > 0){
           "parse_pubmed"
         }else{
           "parse_ris"

diff --git a/R/parse_functions.R b/R/parse_functions.R
@@ -3,7 +3,9 @@
 #' @description Text in standard formats - such as imported via
 #' `base::readLines()` - can be parsed using a variety of standard formats. Use
 #' `detect_parser()` to determine which is the most appropriate parser for your
-#' situation.
+#' situation. Note that `parse_tsv()` and `parse_csv()` are maintained for
+#' backwards compatability only; within `read_ref` these have been replaced
+#' by `vroom::vroom()`.
 #' @param x A character vector containing bibliographic information in ris
 #' format.
 #' @return Returns an object of class `bibliography` (ris, bib, or pubmed
@@ -290,157 +292,78 @@ parse_ris <- function(x, tag_naming = "best_guess"){
 
 
 #' @rdname parse_
+#' @importFrom dplyr bind_rows
+#' @importFrom tibble tibble
+#' @importFrom unglue unglue_data
 #' @export
 parse_bibtex <- function(x){
-
-  ### Remove lines that start with a percentage symbol (comments)
-  x <- grep("^\\s*%.*",
-            x,
-            invert = TRUE,
-            value=TRUE)
-
-  # which lines start with @article?
-  group_vec <- rep(0, length(x))
-  row_id <- which(regexpr("^@", x) == 1)
-  group_vec[row_id] <- 1
-  group_vec <- cumsum(group_vec)
-
-  # work out row names
-  ref_names <- gsub(".*\\{|,$", "", x[row_id])
-  ref_type <- gsub(".*@|\\{.*", "", x[row_id])
-
-  # split by reference
-  x_split <- split(x[-row_id], group_vec[-row_id])
-  length_vals <- unlist(lapply(x_split, length))
-  x_split <- x_split[which(length_vals > 3)]
-
-  x_final <- lapply(x_split, function(z){
-
-    # first use a stringent lookup term to locate only tagged rows
-    delimiter_lookup <- regexpr(
-      "^[[:blank:]]*([[:alnum:]]|[[:punct:]])+[[:blank:]]*=[[:blank:]]*\\{+",
-      z
-    )
-    delimiter_rows <- which(delimiter_lookup != -1)
-    other_rows <- which(delimiter_lookup == -1)
-    delimiters <- data.frame(
-      row = delimiter_rows,
-      location = regexpr("=", z[delimiter_rows])
-    )
-    split_tags <- apply(delimiters, 1, function(a, lookup){
-      c(
-        row = as.numeric(a[1]),
-        tag = substr(
-          x = lookup[a[1]],
-          start = 1,
-          stop = a[2] - 1
-        ),
-        value = substr(
-          x = lookup[a[1]],
-          start = a[2] + 1,
-          stop = nchar(lookup[a[1]])
-        )
-      )
-    },
-    lookup = z
-    )
-    entry_dframe <- rbind(
-      as.data.frame(
-        t(split_tags),
-        stringsAsFactors = FALSE
-      ),
-      data.frame(
-        row = other_rows,
-        tag = NA,
-        value = z[other_rows],
-        stringsAsFactors = FALSE
-      )
-    )
-    entry_dframe$row <- as.numeric(entry_dframe$row)
-    entry_dframe <- entry_dframe[order(entry_dframe$row), c("tag", "value")]
-
-    if(any(entry_dframe$value == "}")){
-      entry_dframe <- entry_dframe[seq_len(which(entry_dframe$value == "}")[1]-1), ]
+  # use `unglue` to parse text
+  raw_df <- unglue_data(x,
+              patterns = c("[variable]={[value]},",
+                           "@[variable]{[value],"),
+              open = "[",
+              close = "]")
+
+  # remove missing values
+  raw_df <- raw_df[!(is.na(raw_df$variable) | is.na(raw_df$value)), ]
+
+  # create a vector assigning rows to articles
+  article_vec <- as.integer(raw_df$variable == "ARTICLE")
+  article_vec[is.na(article_vec)] <- 0
+  raw_df$article <- cumsum(article_vec)
+
+  # split by article and transpose
+  result <- lapply(
+    split(raw_df[, 1:2], raw_df$article),
+    function(a){
+      result <- as.data.frame(t(a$value))
+      colnames(result) <- a$variable
+      return(result)
+    }) |>
+    bind_rows() |>
+    tibble()
+
+  # split authors
+  if(any(names(result) == "author")){
+    if(any(grepl("and", result$author))){
+      result$author <- strsplit(result$author, "\\s*and\\s*")
     }
-    if(any(entry_dframe$value == "")){
-      entry_dframe <- entry_dframe[-which(entry_dframe$value == ""), ]
-    }
-
-    # remove whitespace
-    entry_dframe <- as.data.frame(
-      lapply(entry_dframe, trimws),
-      stringsAsFactors = FALSE
-    )
-    # remove 1 or more opening brackets
-    entry_dframe$value <- gsub("^\\{+", "", entry_dframe$value)
-    # remove 1 or more closing brackets followed by zero or more punctuation marks
-    entry_dframe$value <- gsub("\\}+[[:punct:]]*$", "", entry_dframe$value)
-
-    # convert each entry to a list
-    label_group <- rep(0, nrow(entry_dframe))
-    tag_rows <- which(entry_dframe$tag != "")
-    label_group[tag_rows] <- 1
-    tag_names <- entry_dframe$tag[tag_rows]
-    entry_list <- split(
-      entry_dframe$value,
-      cumsum(label_group)+1
-    )
-    names(entry_list) <- tolower(
-      gsub("^\\s+|\\s+$",  "", tag_names)
-    )
-    entry_list <- lapply(entry_list,
-                         function(a){paste(a, collapse = " ")}
-    )
-    if(any(names(entry_list) == "author")){
-      if(length(entry_list$author) == 1){
-        entry_list$author <- strsplit(entry_list$author, " and ")[[1]]
-      }
-    }
-    return(entry_list)
-  })
-
-  # add type
-  x_final <- lapply(
-    seq_len(length(x_final)),
-    function(a, type, data){
-      c(type = type[a], data[[a]])
-    },
-    type = ref_type,
-    data = x_final
-  )
+  }
 
-  names(x_final) <- ref_names
-  class(x_final) <- "bibliography"
-  return(x_final)
+  # join duplicated columns
+  # note: needs to be done simultaneously with calling `tibble()`
 
+  return(result)
 }
 
 #' @rdname parse_
 #' @export
 parse_csv <- function(x){
-  z <- read.table(
+  read.table(
     text = x,
     header = TRUE,
     sep = ",",
     quote = "\"",
     dec = ".",
     fill = TRUE,
-    stringsAsFactors = FALSE, row.names = NULL
-  )
-  return(match_columns(z))
+    stringsAsFactors = FALSE,
+    row.names = NULL) |>
+  match_columns() |>
+  tibble()
 }
 
 #' @rdname parse_
 #' @export
 parse_tsv <- function(x){
-  z <- read.table(
+  read.table(
     text = x,
     header = TRUE,
     sep = "\t",
     quote = "\"",
     dec = ".",
     fill = TRUE,
-    stringsAsFactors = FALSE, row.names = NULL
-  )
-  return(match_columns(z))
+    stringsAsFactors = FALSE,
+    row.names = NULL) |>
+  match_columns() |>
+  tibble()
 }