diff --git a/R/get_data.R b/R/get_data.R index 17ebc38..7d3b8d9 100644 --- a/R/get_data.R +++ b/R/get_data.R @@ -2,10 +2,10 @@ #' #' @param dir What relative file path should the files be downloaded #' @param type Which OTTR repo are we downloading? Options are "rmd", "quarto", "rmd_website", "quarto_website" -#' +#' @param render Should the OTTR repo be rendered after downloading? Default is TRUE #' @return This downloads the main branch repo files from the respective repo for testing purposes #' @export -setup_ottr_template <- function(dir = "inst/extdata", type) { +setup_ottr_template <- function(dir = ".", type, render = TRUE) { if (!dir.exists(dir)) dir.create(dir, recursive = TRUE, showWarnings = FALSE) possible_types <- c("rmd", "quarto", "rmd_website", "quarto_website") @@ -45,6 +45,7 @@ setup_ottr_template <- function(dir = "inst/extdata", type) { } ## Render it + if (render) { if (type == "rmd") bookdown::render_book(output_dir) if (type == "rmd_website") rmarkdown::render_site(output_dir) @@ -58,6 +59,7 @@ setup_ottr_template <- function(dir = "inst/extdata", type) { as_job = FALSE ) } + } return(output_dir) } diff --git a/R/url-check.R b/R/url-check.R new file mode 100644 index 0000000..a78f625 --- /dev/null +++ b/R/url-check.R @@ -0,0 +1,224 @@ +#' Check URLs of all md,rmd, and qmd files +#' +#' @param path path to the bookdown or quarto course repository, must have a +#' `.github` folder which will be used to establish the top of the repo. +#' @param output_dir A relative file path to the folder (existing or not) that the +#' output check file should be saved to. Default is "check_reports" +#' @param resources_dir A relative file path to the folder (existing or not) that the +#' ignore_urls.txt file and exclude_files.txt will be found. Default is "resources". +#' If no ignore_urls.txt file and exclude_files.txt files are found, we will download one. +#' @param report_all Should all URLs that were tested be returned? Default is FALSE +#' meaning only broken URLs will be reported in the url_checks.tsv file. +#' @return A file will be saved that lists the broken URLs will be saved to the specified output_dir. +#' @export +#' +#' @importFrom magrittr +#' +#' @examples +#' +#' rmd_dir <- setup_ottr_template(dir = ".", type = "rmd", render = FALSE) +#' +#' check_urls(rmd_dir) +#' +#' # If there are broken URLs they will be printed in a list at "check_reports/url_checks.tsv" +#' +#' qmd_dir <- setup_ottr_template(dir = ".", type = "qmd", render = FALSE) +#' +#' check_urls(qmd_dir) +#' +check_urls <- function(path = ".", + output_dir = "check_reports", + resources_dir = "resources", + report_all = FALSE) { + # Find .git root directory + root_dir <- rprojroot::find_root(path = path, rprojroot::has_dir(".github")) + + resources_dir <- file.path(root_dir, resources_dir) + output_dir <- file.path(root_dir, output_dir) + + if (!dir.exists(output_dir)) { + dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) + } + if (!dir.exists(resources_dir)) { + dir.create(resources_dir, recursive = TRUE, showWarnings = FALSE) + } + + output_file <- file.path(output_dir, "url_checks.tsv") + ignore_urls_file <- file.path(resources_dir, "ignore-urls.txt") + exclude_file <- file.path(resources_dir, "exclude_files.txt") + + # Read in ignore urls file if it exists + if (file.exists(ignore_urls_file)) { + ignore_urls <- readLines(ignore_urls_file) + } else { + ignore_urls <- "" + } + + # Read in ignore urls file if it exists + if (file.exists(exclude_file)) { + exclude_file <- readLines(exclude_file) + } else { + exclude_file <- "" + } + + # Only declare `.md` files but not the ones in the style-sets directory + files <- list.files(path = root_dir, pattern = "md$", full.names = TRUE, recursive = TRUE) + + if (exclude_file[1] != "") files <- grep(paste0(exclude_file, collapse = "|"), files, invert = TRUE, value = TRUE) + + # Run this for all Rmds + all_urls <- lapply(files, get_urls) + + # Write the file + all_urls_df <- dplyr::bind_rows(all_urls) + + if (nrow(all_urls_df) > 0) { + if (!report_all) { + all_urls_df <- all_urls_df %>% + dplyr::filter(urls_status == "failed") %>% + readr::write_tsv(output_file) + } + } else { + all_urls_df <- data.frame(errors = NA) + } + + # Print out how many spell check errors + write(nrow(all_urls_df), stdout()) + + # Save spell errors to file temporarily + readr::write_tsv(all_urls_df, output_file) + + message(paste0("Saved to: ", output_file)) +} + + +#' Test a URL +#' +#' @param url A single URL that will be checked whether it is real. +#' @param ignore_url A vector of URLs which to ignore. +#' +#' @return a logical TRUE/FALSE for whether the URL is legitimate. +#' @export +#' +#' @importFrom magrittr +#' +test_url <- function(url, ignore_urls = "") { + + if (url %in% ignore_urls) { + message(paste0("Ignoring: ", url)) + return("ignored") + } + + message(paste0("Testing: ", url)) + + url_status <- try(httr::GET(url), silent = TRUE) + + # Fails if host can't be resolved + status <- ifelse(suppressMessages(grepl("Could not resolve host", url_status)), "failed", "success") + + if (status == "success") { + # Fails if 404'ed + status <- ifelse(try(url_status$status_code, silent = TRUE) == 404, "failed", "success") + } + + return(status) +} + + +#' Identify and collect URLs in a single rmd/qmd/md file +#' +#' @param file A file path to a rmd/qmd/md file that contains URLs to be check +#' @param ignore_url A vector of URLs which to ignore. +#' +#' @return a data.frame of all the URLs identified in the given rmd/qmd/md file +#' @export +#' +#' @importFrom magrittr +#' +get_urls <- function(file, ignore_urls = "") { + message(paste("##### Testing URLs from file:", file)) + + # Read in a file and return the urls from it + content <- readLines(file) + + # Set up the possible tags + html_tag <- " 0) { + url_list$html <- sapply(url_list$html, function(html_line) { + head(rvest::html_attr(rvest::html_nodes(rvest::read_html(html_line), "a"), "href")) + }) + url_list$html <- unlist(url_list$html) + } + url_list$knitr <- stringr::word(url_list$knitr, sep = "include_url\\(\"|\"\\)", 2) + url_list$ottrpal <- stringr::word(url_list$ottrpal, sep = "include_slide\\(\"|\"\\)", 2) + + # Check markdown for parentheticals outside of [ ]( ) + parens_index <- sapply(url_list$markdown, stringr::str_detect, nested_parens) + + if (length(parens_index) >= 1) { + # Break down to parenthetical only + url_list$markdown[parens_index] <- stringr::str_extract(url_list$markdown[parens_index], nested_parens) + # Remove parentheticals outside [ ]( ) + url_list$markdown[parens_index] <- stringr::word(stringr::str_replace(url_list$markdown[parens_index], outermost_parens, "\\1"), sep = "\\]", 2) + + url_list$markdown[!parens_index] <- stringr::word(url_list$markdown[!parens_index], sep = "\\]", 2) + url_list$markdown <- grep("http", url_list$markdown, value = TRUE) + } + if (length(url_list$markdown_bracket) > 0) { + url_list$markdown_bracket <- paste0("http", stringr::word(url_list$markdown_bracket, sep = "\\]: http", 2)) + } + url_list$other_http <- stringr::word(stringr::str_extract(url_list$other_http, url_pattern), sep = "\\]", 1) + + # Remove parentheses only if they are on the outside + url_list$other_http <- stringr::word(stringr::str_replace(url_list$other_http, outermost_parens, "\\1"), sep = "\\]", 1) + url_list$markdown <- stringr::word(stringr::str_replace(url_list$markdown, outermost_parens, "\\1"), sep = "\\]", 1) + + # Remove `< >` + url_list$other_http <- stringr::word(stringr::str_replace(url_list$other_http, "^<(.*)>(.*)$", "\\1"), sep = "\\]", 1) + + # If after the manipulations there's not actually a URL, remove it. + url_list <- lapply(url_list, na.omit) + + # collapse list + urls <- unlist(url_list) + + # Remove trailing characters + urls <- gsub("\\'\\:$|\\'|\\:$|\\.$|\\)$|\\,$", "", urls) + + # Remove URLs that are in the ignore + if (ignore_urls[1] != "") urls <- grep(paste0(ignore_urls, collapse = "|"), urls, invert = TRUE, value = TRUE) + + if (length(urls) > 0) { + # Remove trailing characters + urls_status <- sapply(urls, test_url, ignore_urls = ignore_urls) + url_df <- data.frame(urls, urls_status, file) + return(url_df) + } else { + message("No URLs found") + } +} diff --git a/tests/testthat/test-check-urls.R b/tests/testthat/test-check-urls.R new file mode 100644 index 0000000..d30d4c4 --- /dev/null +++ b/tests/testthat/test-check-urls.R @@ -0,0 +1,65 @@ + +output_file <- file.path("check_reports", "url_checks.tsv") + +test_that("Test URL checks for OTTR main", { + + rmd_dir <- setup_ottr_template(type = "rmd", render = FALSE) + + status <- check_urls(rmd_dir) + testthat::expect_true(status < 2) + + # Make sure the report exists + testthat::expect_true(file.exists(file.path(rmd_dir, output_file))) + results <- readr::read_tsv(file.path(rmd_dir, output_file)) + + # It should be a data.frame + testthat::expect_true(is.data.frame(results)) + clean_up() + +}) + +test_that("Test URL checks for OTTR Quarto main", { + qmd_dir <- setup_ottr_template(type = "quarto", render = FALSE) + + status <- check_urls(qmd_dir) + testthat::expect_true(status < 2) + + # Make sure the report exists + testthat::expect_true(file.exists(file.path(qmd_dir, output_file))) + results <- readr::read_tsv(file.path(qmd_dir, output_file)) + + # It should be a data.frame + testthat::expect_true(is.data.frame(results)) + clean_up() +}) + +test_that("Test URL checks for OTTR web", { + rmd_web <- setup_ottr_template(type = "rmd_website", render = FALSE) + + status <- check_urls(rmd_web, report_all = TRUE) + testthat::expect_true(status < 2) + + # Make sure the report exists + testthat::expect_true(file.exists(file.path(rmd_web, output_file))) + results <- readr::read_tsv(file.path(rmd_web, output_file)) + + # It should be a data.frame + testthat::expect_true(is.data.frame(results)) + clean_up() +}) + +test_that("Test URL checks for OTTR Quarto web", { + ## Test URL + qmd_web <- setup_ottr_template(type = "quarto_website", render = FALSE) + + status <- check_urls(qmd_web, report_all = TRUE) + testthat::expect_true(status < 2) + + # Make sure the report exists + testthat::expect_true(file.exists(file.path(qmd_web, output_file))) + results <- readr::read_tsv(file.path(qmd_web, output_file)) + + # It should be a data.frame + testthat::expect_true(is.data.frame(results)) + clean_up() +})