Skip to content

Commit

Permalink
Clean up imports, particularly for bibtex files
Browse files Browse the repository at this point in the history
- use `vroom` for imports to allow `locale` arg (instead of `Sys.setlocale()`, see #24 )
- refactor `parse_bibtex()` to use `unglue()` for > brevity and readability
- add `as_tibble()`, for class `bibliography`
- make `add_line_breaks()` backwards-compatible
  • Loading branch information
mjwestgate committed Jul 9, 2023
1 parent 7cda71c commit 91d9796
Show file tree
Hide file tree
Showing 18 changed files with 266 additions and 214 deletions.
12 changes: 8 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: synthesisr
Type: Package
Title: Import, Assemble, and Deduplicate Bibliographic Datasets
Version: 0.3.0
Version: 0.3.0.9999
Authors@R: c(
person(
given = "Martin",
Expand All @@ -23,16 +23,20 @@ Description: A critical first step in systematic literature reviews
<doi:10.1002/jrsm.1374> to import bibliographic data from a range of formats
(such as 'bibtex', 'ris', or 'ciw') in a standard way, and allows merging
and deduplication of the resulting dataset.
Depends: R (>= 3.5.0)
Depends: R (>= 4.0.0)
Imports:
dplyr,
purrr,
rlang,
stringdist,
tibble
tibble,
unglue,
vroom
Suggests:
knitr,
rmarkdown,
testthat
Date: 2020-05-18
Date: 2023-06-07
License: GPL-3
LazyData: true
RoxygenNote: 7.2.3
Expand Down
7 changes: 7 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

S3method("[",bibliography)
S3method(as.data.frame,bibliography)
S3method(as_tibble,bibliography)
S3method(c,bibliography)
S3method(print,bibliography)
S3method(summary,bibliography)
export(add_line_breaks)
export(as.bibliography)
export(as_tibble)
export(clean_authors)
export(clean_colnames)
export(clean_df)
Expand Down Expand Up @@ -46,7 +48,12 @@ export(write_bib)
export(write_refs)
export(write_ris)
importFrom(dplyr,bind_rows)
importFrom(purrr,list_transpose)
importFrom(rlang,abort)
importFrom(rlang,warn)
importFrom(stringdist,stringdist)
importFrom(tibble,as_tibble)
importFrom(tibble,tibble)
importFrom(unglue,unglue_data)
importFrom(vroom,default_locale)
importFrom(vroom,vroom_lines)
13 changes: 12 additions & 1 deletion R/add_line_breaks.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
#' @param n Numeric: The desired number of characters that should separate
#' consecutive line breaks.
#' @param html Logical: Should the line breaks be specified in html?
#' @param max_n DEPRECATED: If provided will currently overwrite `n`; otherwise
#' synonymous with `n` and will be removed from future versions.
#' @param max_time DEPRECATED: Previously the maximum amount of time (in
#' seconds) allowed to adjust groups until character thresholds are reached.
#' Ignored.
#' @details Line breaks are only added between words, so the value of n is
#' actually a threshold value rather than being matched exactly.
#' @return Returns the input vector unaltered except for the addition of line
Expand All @@ -18,8 +23,14 @@
#' @export
add_line_breaks <- function(x,
n = 50,
html = FALSE
max_n = NULL,
html = FALSE,
max_time = NULL
){
if(!is.null(max_n)){
n <- max_n
}

if(html){
break_string <- "<br>"
}else{
Expand Down
16 changes: 16 additions & 0 deletions R/bibliography_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -183,3 +183,19 @@ as.bibliography <- function(x, ...){
class(x_list) <- "bibliography"
return(x_list)
}

#' @rdname bibliography-class
#' @param .rows currently ignored
#' @param .name_repair currently ignored
#' @param rownames currently ignored
#' @importFrom purrr list_transpose
#' @importFrom tibble as_tibble
#' @export
as_tibble.bibliography <- function(x,
...,
.rows,
.name_repair,
rownames){
class(x) <- "list"
as_tibble(list_transpose(x))
}
6 changes: 4 additions & 2 deletions R/deduplication_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ find_duplicates <- function(
){
# data
if(missing(data)){
abort("'data' is missing: Please provide a data.frame")
abort("'data' is missing: Please provide a vector")
}
if(inherits(data, "data.frame")){
abort("'data' must be a character vector, not a data.frame")
Expand Down Expand Up @@ -81,6 +81,7 @@ find_duplicates <- function(
data[is.na(data)] <- paste0("MISSING_VALUE_", seq_along(which(is.na(data))))
# split data by name
order_list <- split(order_initial, data)
names(order_list) <- NULL
order_list <- order_list[order(unlist(lapply(order_list, min)))]
result <- do.call(c, lapply(
seq_along(order_list),
Expand Down Expand Up @@ -297,7 +298,8 @@ deduplicate <- function(
}
}

result <- find_duplicates(data_fd, method = method, ...)
result <- find_duplicates(as.character(data_fd),
method = method, ...)
return(
extract_unique_references(data, matches = result, type = type)
)
Expand Down
2 changes: 1 addition & 1 deletion R/detect_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ detect_parser <- function(x){
"tab" = "parse_tsv",
"bibtex" = "parse_bibtex",
"ris" = {
if(length(which(grepl("$PMID", x))) > 0){
if(length(which(grepl("PMID", x))) > 0){
"parse_pubmed"
}else{
"parse_ris"
Expand Down
177 changes: 50 additions & 127 deletions R/parse_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
#' @description Text in standard formats - such as imported via
#' `base::readLines()` - can be parsed using a variety of standard formats. Use
#' `detect_parser()` to determine which is the most appropriate parser for your
#' situation.
#' situation. Note that `parse_tsv()` and `parse_csv()` are maintained for
#' backwards compatability only; within `read_ref` these have been replaced
#' by `vroom::vroom()`.
#' @param x A character vector containing bibliographic information in ris
#' format.
#' @return Returns an object of class `bibliography` (ris, bib, or pubmed
Expand Down Expand Up @@ -290,157 +292,78 @@ parse_ris <- function(x, tag_naming = "best_guess"){


#' @rdname parse_
#' @importFrom dplyr bind_rows
#' @importFrom tibble tibble
#' @importFrom unglue unglue_data
#' @export
parse_bibtex <- function(x){

### Remove lines that start with a percentage symbol (comments)
x <- grep("^\\s*%.*",
x,
invert = TRUE,
value=TRUE)

# which lines start with @article?
group_vec <- rep(0, length(x))
row_id <- which(regexpr("^@", x) == 1)
group_vec[row_id] <- 1
group_vec <- cumsum(group_vec)

# work out row names
ref_names <- gsub(".*\\{|,$", "", x[row_id])
ref_type <- gsub(".*@|\\{.*", "", x[row_id])

# split by reference
x_split <- split(x[-row_id], group_vec[-row_id])
length_vals <- unlist(lapply(x_split, length))
x_split <- x_split[which(length_vals > 3)]

x_final <- lapply(x_split, function(z){

# first use a stringent lookup term to locate only tagged rows
delimiter_lookup <- regexpr(
"^[[:blank:]]*([[:alnum:]]|[[:punct:]])+[[:blank:]]*=[[:blank:]]*\\{+",
z
)
delimiter_rows <- which(delimiter_lookup != -1)
other_rows <- which(delimiter_lookup == -1)
delimiters <- data.frame(
row = delimiter_rows,
location = regexpr("=", z[delimiter_rows])
)
split_tags <- apply(delimiters, 1, function(a, lookup){
c(
row = as.numeric(a[1]),
tag = substr(
x = lookup[a[1]],
start = 1,
stop = a[2] - 1
),
value = substr(
x = lookup[a[1]],
start = a[2] + 1,
stop = nchar(lookup[a[1]])
)
)
},
lookup = z
)
entry_dframe <- rbind(
as.data.frame(
t(split_tags),
stringsAsFactors = FALSE
),
data.frame(
row = other_rows,
tag = NA,
value = z[other_rows],
stringsAsFactors = FALSE
)
)
entry_dframe$row <- as.numeric(entry_dframe$row)
entry_dframe <- entry_dframe[order(entry_dframe$row), c("tag", "value")]

if(any(entry_dframe$value == "}")){
entry_dframe <- entry_dframe[seq_len(which(entry_dframe$value == "}")[1]-1), ]
# use `unglue` to parse text
raw_df <- unglue_data(x,
patterns = c("[variable]={[value]},",
"@[variable]{[value],"),
open = "[",
close = "]")

# remove missing values
raw_df <- raw_df[!(is.na(raw_df$variable) | is.na(raw_df$value)), ]

# create a vector assigning rows to articles
article_vec <- as.integer(raw_df$variable == "ARTICLE")
article_vec[is.na(article_vec)] <- 0
raw_df$article <- cumsum(article_vec)

# split by article and transpose
result <- lapply(
split(raw_df[, 1:2], raw_df$article),
function(a){
result <- as.data.frame(t(a$value))
colnames(result) <- a$variable
return(result)
}) |>
bind_rows() |>
tibble()

# split authors
if(any(names(result) == "author")){
if(any(grepl("and", result$author))){
result$author <- strsplit(result$author, "\\s*and\\s*")
}
if(any(entry_dframe$value == "")){
entry_dframe <- entry_dframe[-which(entry_dframe$value == ""), ]
}

# remove whitespace
entry_dframe <- as.data.frame(
lapply(entry_dframe, trimws),
stringsAsFactors = FALSE
)
# remove 1 or more opening brackets
entry_dframe$value <- gsub("^\\{+", "", entry_dframe$value)
# remove 1 or more closing brackets followed by zero or more punctuation marks
entry_dframe$value <- gsub("\\}+[[:punct:]]*$", "", entry_dframe$value)

# convert each entry to a list
label_group <- rep(0, nrow(entry_dframe))
tag_rows <- which(entry_dframe$tag != "")
label_group[tag_rows] <- 1
tag_names <- entry_dframe$tag[tag_rows]
entry_list <- split(
entry_dframe$value,
cumsum(label_group)+1
)
names(entry_list) <- tolower(
gsub("^\\s+|\\s+$", "", tag_names)
)
entry_list <- lapply(entry_list,
function(a){paste(a, collapse = " ")}
)
if(any(names(entry_list) == "author")){
if(length(entry_list$author) == 1){
entry_list$author <- strsplit(entry_list$author, " and ")[[1]]
}
}
return(entry_list)
})

# add type
x_final <- lapply(
seq_len(length(x_final)),
function(a, type, data){
c(type = type[a], data[[a]])
},
type = ref_type,
data = x_final
)
}

names(x_final) <- ref_names
class(x_final) <- "bibliography"
return(x_final)
# join duplicated columns
# note: needs to be done simultaneously with calling `tibble()`

return(result)
}

#' @rdname parse_
#' @export
parse_csv <- function(x){
z <- read.table(
read.table(
text = x,
header = TRUE,
sep = ",",
quote = "\"",
dec = ".",
fill = TRUE,
stringsAsFactors = FALSE, row.names = NULL
)
return(match_columns(z))
stringsAsFactors = FALSE,
row.names = NULL) |>
match_columns() |>
tibble()
}

#' @rdname parse_
#' @export
parse_tsv <- function(x){
z <- read.table(
read.table(
text = x,
header = TRUE,
sep = "\t",
quote = "\"",
dec = ".",
fill = TRUE,
stringsAsFactors = FALSE, row.names = NULL
)
return(match_columns(z))
stringsAsFactors = FALSE,
row.names = NULL) |>
match_columns() |>
tibble()
}
Loading

0 comments on commit 91d9796

Please sign in to comment.