Commit

Merge pull request #160 from jhudsl/cansavvy/check-urls
Functionalize URL checker
cansavvy authored Dec 20, 2024
2 parents 609a841 + 1953e53 commit df7ffcb
Showing 3 changed files with 293 additions and 2 deletions.
6 changes: 4 additions & 2 deletions R/get_data.R
@@ -2,10 +2,10 @@
#'
#' @param dir The relative file path the files should be downloaded to
#' @param type Which OTTR repo are we downloading? Options are "rmd", "quarto", "rmd_website", "quarto_website"
#'
#' @param render Should the OTTR repo be rendered after downloading? Default is TRUE
#' @return The path to the downloaded main branch repo files, retrieved from the respective repo for testing purposes
#' @export
setup_ottr_template <- function(dir = "inst/extdata", type) {
setup_ottr_template <- function(dir = ".", type, render = TRUE) {
if (!dir.exists(dir)) dir.create(dir, recursive = TRUE, showWarnings = FALSE)

possible_types <- c("rmd", "quarto", "rmd_website", "quarto_website")
@@ -45,6 +45,7 @@ setup_ottr_template <- function(dir = "inst/extdata", type) {
}

## Render it
if (render) {
if (type == "rmd") bookdown::render_book(output_dir)
if (type == "rmd_website") rmarkdown::render_site(output_dir)

@@ -58,6 +59,7 @@ setup_ottr_template <- function(dir = "inst/extdata", type) {
as_job = FALSE
)
}
}
return(output_dir)
}

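For context, a minimal sketch of the new render argument in use (assuming the package is attached and network access is available):

# Download the Rmd course template without rendering it (faster for URL-only checks)
rmd_dir <- setup_ottr_template(dir = ".", type = "rmd", render = FALSE)

# The returned path can be rendered later, as setup_ottr_template() does internally
bookdown::render_book(rmd_dir)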
224 changes: 224 additions & 0 deletions R/url-check.R
@@ -0,0 +1,224 @@
#' Check URLs of all md, Rmd, and qmd files
#'
#' @param path path to the bookdown or quarto course repository, must have a
#' `.github` folder which will be used to establish the top of the repo.
#' @param output_dir A relative file path to the folder (existing or not) that the
#' output check file should be saved to. Default is "check_reports"
#' @param resources_dir A relative file path to the folder (existing or not) where the
#' ignore-urls.txt and exclude_files.txt files are looked for. Default is "resources".
#' If these files are not found, no URLs are ignored and no files are excluded.
#' @param report_all Should all URLs that were tested be returned? Default is FALSE
#' meaning only broken URLs will be reported in the url_checks.tsv file.
#' @return A file listing the broken URLs will be saved to the specified output_dir.
#' @export
#'
#' @importFrom magrittr %>%
#'
#' @examples
#'
#' rmd_dir <- setup_ottr_template(dir = ".", type = "rmd", render = FALSE)
#'
#' check_urls(rmd_dir)
#'
#' # If there are broken URLs they will be printed in a list at "check_reports/url_checks.tsv"
#'
#' qmd_dir <- setup_ottr_template(dir = ".", type = "quarto", render = FALSE)
#'
#' check_urls(qmd_dir)
#'
check_urls <- function(path = ".",
output_dir = "check_reports",
resources_dir = "resources",
report_all = FALSE) {
# Find the root of the repo, established by its .github folder
root_dir <- rprojroot::find_root(path = path, rprojroot::has_dir(".github"))

resources_dir <- file.path(root_dir, resources_dir)
output_dir <- file.path(root_dir, output_dir)

if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
}
if (!dir.exists(resources_dir)) {
dir.create(resources_dir, recursive = TRUE, showWarnings = FALSE)
}

output_file <- file.path(output_dir, "url_checks.tsv")
ignore_urls_file <- file.path(resources_dir, "ignore-urls.txt")
exclude_file <- file.path(resources_dir, "exclude_files.txt")
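# Both resource files are plain text with one entry per line; they are read with
# readLines() and collapsed into a single regex via paste0(collapse = "|") below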

# Read in ignore urls file if it exists
if (file.exists(ignore_urls_file)) {
ignore_urls <- readLines(ignore_urls_file)
} else {
ignore_urls <- ""
}

# Read in exclude files list if it exists
if (file.exists(exclude_file)) {
exclude_file <- readLines(exclude_file)
} else {
exclude_file <- ""
}

# Find all files ending in "md" (.md, .Rmd, .qmd) in the repo
files <- list.files(path = root_dir, pattern = "md$", full.names = TRUE, recursive = TRUE)

if (exclude_file[1] != "") files <- grep(paste0(exclude_file, collapse = "|"), files, invert = TRUE, value = TRUE)

# Extract and test the URLs from each file
all_urls <- lapply(files, get_urls)

# Combine the per-file results into one data frame
all_urls_df <- dplyr::bind_rows(all_urls)

if (nrow(all_urls_df) > 0) {
if (!report_all) {
all_urls_df <- all_urls_df %>%
dplyr::filter(urls_status == "failed") %>%
readr::write_tsv(output_file)
}
} else {
all_urls_df <- data.frame(errors = NA)
}

# Print out how many URLs are in the report
write(nrow(all_urls_df), stdout())

# Save the URL check results to file
readr::write_tsv(all_urls_df, output_file)

message(paste0("Saved to: ", output_file))

# Invisibly return the number of broken URLs so callers (e.g. the tests below) can check it
invisible(sum(all_urls_df$urls_status == "failed", na.rm = TRUE))
}
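# For illustration, one way a caller might consume the report (the course path is
# hypothetical; the column names match the data.frame built in get_urls() below):
#
#   check_urls(path = "my-ottr-course", report_all = TRUE)
#   report <- readr::read_tsv("my-ottr-course/check_reports/url_checks.tsv")
#   report[report$urls_status == "failed", c("urls", "file")]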


#' Test a URL
#'
#' @param url A single URL that will be checked for whether it resolves and responds.
#' @param ignore_urls A vector of URLs to ignore (these are not tested).
#'
#' @return A character string: "success", "failed", or "ignored".
#' @export
#'
#' @importFrom magrittr %>%
#'
test_url <- function(url, ignore_urls = "") {

if (url %in% ignore_urls) {
message(paste0("Ignoring: ", url))
return("ignored")
}

message(paste0("Testing: ", url))

url_status <- try(httr::GET(url), silent = TRUE)

# Fails if host can't be resolved
status <- ifelse(suppressMessages(grepl("Could not resolve host", url_status)), "failed", "success")

if (status == "success") {
# Fails if 404'ed
status <- ifelse(try(url_status$status_code, silent = TRUE) == 404, "failed", "success")
}

return(status)
}
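# Hypothetical calls illustrating the three possible return values (the URLs are
# examples only; testing them requires network access):
#
#   test_url("https://www.google.com")                                    # "success"
#   test_url("https://this-host-does-not-exist.example")                  # "failed"
#   test_url("https://example.com", ignore_urls = "https://example.com")  # "ignored"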


#' Identify and collect URLs in a single rmd/qmd/md file
#'
#' @param file A file path to a rmd/qmd/md file that contains URLs to be checked
#' @param ignore_urls A vector of URLs to ignore.
#'
#' @return a data.frame of all the URLs identified in the given rmd/qmd/md file
#' @export
#'
#' @importFrom magrittr %>%
#'
get_urls <- function(file, ignore_urls = "") {
message(paste("##### Testing URLs from file:", file))

# Read in the file contents
content <- readLines(file)

# Set up the possible tags
html_tag <- "<a href="
include_url_tag <- "include_url\\("
include_slide_tag <- "include_slide\\("
markdown_tag <- "\\[.*\\]\\(http[s]?.*\\)"
markdown_tag_bracket <- "\\[.*\\]: http[s]?"
http_gen <- "http[s]?"
url_pattern <- "[(|<]?http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

# Other patterns
nested_parens <- "\\((.*)\\((.*)\\)(.*)\\)"
outermost_parens <- "^\\((.*)\\)(.*)$"
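# For reference, hypothetical lines each tag pattern is meant to catch:
#   html_tag:             <a href="https://example.org">link</a>
#   include_url_tag:      knitr::include_url("https://example.org")
#   include_slide_tag:    ottrpal::include_slide("https://example.org/slide")
#   markdown_tag:         [text](https://example.org)
#   markdown_tag_bracket: [ref]: https://example.org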

# Collect the different kinds of tags in a named vector
all_tags <- c(
html = html_tag,
knitr = include_url_tag,
ottrpal = include_slide_tag,
markdown = markdown_tag,
markdown_bracket = markdown_tag_bracket,
other_http = http_gen
)

url_list <- sapply(all_tags, grep, content, value = TRUE)
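# Drop generic http matches already captured by a more specific tag
# (other_http is the 6th element of all_tags)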
url_list$other_http <- setdiff(url_list$other_http, unlist(url_list[-6]))

# Extract the urls only of each type
if (length(url_list$html) > 0) {
url_list$html <- sapply(url_list$html, function(html_line) {
head(rvest::html_attr(rvest::html_nodes(rvest::read_html(html_line), "a"), "href"))
})
url_list$html <- unlist(url_list$html)
}
url_list$knitr <- stringr::word(url_list$knitr, sep = "include_url\\(\"|\"\\)", 2)
url_list$ottrpal <- stringr::word(url_list$ottrpal, sep = "include_slide\\(\"|\"\\)", 2)

# Check markdown for parentheticals outside of [ ]( )
parens_index <- sapply(url_list$markdown, stringr::str_detect, nested_parens)

if (length(parens_index) >= 1) {
# Break down to parenthetical only
url_list$markdown[parens_index] <- stringr::str_extract(url_list$markdown[parens_index], nested_parens)
# Remove parentheticals outside [ ]( )
url_list$markdown[parens_index] <- stringr::word(stringr::str_replace(url_list$markdown[parens_index], outermost_parens, "\\1"), sep = "\\]", 2)

url_list$markdown[!parens_index] <- stringr::word(url_list$markdown[!parens_index], sep = "\\]", 2)
url_list$markdown <- grep("http", url_list$markdown, value = TRUE)
}
if (length(url_list$markdown_bracket) > 0) {
url_list$markdown_bracket <- paste0("http", stringr::word(url_list$markdown_bracket, sep = "\\]: http", 2))
}
url_list$other_http <- stringr::word(stringr::str_extract(url_list$other_http, url_pattern), sep = "\\]", 1)

# Remove parentheses only if they are on the outside
url_list$other_http <- stringr::word(stringr::str_replace(url_list$other_http, outermost_parens, "\\1"), sep = "\\]", 1)
url_list$markdown <- stringr::word(stringr::str_replace(url_list$markdown, outermost_parens, "\\1"), sep = "\\]", 1)

# Remove `< >`
url_list$other_http <- stringr::word(stringr::str_replace(url_list$other_http, "^<(.*)>(.*)$", "\\1"), sep = "\\]", 1)

# If after the manipulations there's not actually a URL, remove it.
url_list <- lapply(url_list, na.omit)

# collapse list
urls <- unlist(url_list)

# Remove trailing characters
urls <- gsub("\\'\\:$|\\'|\\:$|\\.$|\\)$|\\,$", "", urls)

# Remove URLs that are in the ignore
if (ignore_urls[1] != "") urls <- grep(paste0(ignore_urls, collapse = "|"), urls, invert = TRUE, value = TRUE)

if (length(urls) > 0) {
# Test each URL (URLs in ignore_urls are reported as "ignored")
urls_status <- sapply(urls, test_url, ignore_urls = ignore_urls)
url_df <- data.frame(urls, urls_status, file)
return(url_df)
} else {
message("No URLs found")
}
}
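For a concrete sense of the extraction step, a toy example (the file name and contents are invented for illustration; testing the URLs requires network access):

writeLines(c(
  "See [the course](https://www.bioconductor.org) and",
  "<a href=\"https://www.r-project.org\">R</a>."
), "toy.md")

get_urls("toy.md")
# Expected: a data.frame with columns urls, urls_status, and file,
# one row per URL found in toy.md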
65 changes: 65 additions & 0 deletions tests/testthat/test-check-urls.R
@@ -0,0 +1,65 @@

output_file <- file.path("check_reports", "url_checks.tsv")

test_that("Test URL checks for OTTR main", {

rmd_dir <- setup_ottr_template(type = "rmd", render = FALSE)

status <- check_urls(rmd_dir)
testthat::expect_true(status < 2)

# Make sure the report exists
testthat::expect_true(file.exists(file.path(rmd_dir, output_file)))
results <- readr::read_tsv(file.path(rmd_dir, output_file))

# It should be a data.frame
testthat::expect_true(is.data.frame(results))
clean_up()

})

test_that("Test URL checks for OTTR Quarto main", {
qmd_dir <- setup_ottr_template(type = "quarto", render = FALSE)

status <- check_urls(qmd_dir)
testthat::expect_true(status < 2)

# Make sure the report exists
testthat::expect_true(file.exists(file.path(qmd_dir, output_file)))
results <- readr::read_tsv(file.path(qmd_dir, output_file))

# It should be a data.frame
testthat::expect_true(is.data.frame(results))
clean_up()
})

test_that("Test URL checks for OTTR web", {
rmd_web <- setup_ottr_template(type = "rmd_website", render = FALSE)

status <- check_urls(rmd_web, report_all = TRUE)
testthat::expect_true(status < 2)

# Make sure the report exists
testthat::expect_true(file.exists(file.path(rmd_web, output_file)))
results <- readr::read_tsv(file.path(rmd_web, output_file))

# It should be a data.frame
testthat::expect_true(is.data.frame(results))
clean_up()
})

test_that("Test URL checks for OTTR Quarto web", {
## Test URL
qmd_web <- setup_ottr_template(type = "quarto_website", render = FALSE)

status <- check_urls(qmd_web, report_all = TRUE)
testthat::expect_true(status < 2)

# Make sure the report exists
testthat::expect_true(file.exists(file.path(qmd_web, output_file)))
results <- readr::read_tsv(file.path(qmd_web, output_file))

# It should be a data.frame
testthat::expect_true(is.data.frame(results))
clean_up()
})
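Note: clean_up() is not defined in this diff; it is presumably a shared test helper (e.g., in tests/testthat/helper.R) that deletes the downloaded template and reports between tests. A minimal sketch under that assumption:

# Hypothetical helper; the directory name is illustrative only
clean_up <- function(dir = "OTTR_Template") {
  # Remove the downloaded template (the check_reports folder lives inside it)
  unlink(dir, recursive = TRUE)
}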
