From e82fff497aba5d7abf627c2c14c347f92f5324fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= <csardi.gabor@gmail.com>
Date: Mon, 21 Oct 2024 12:32:54 +0200
Subject: [PATCH] Scan dependencies in Rmd files

---
 R/scan-deps-queries.R | 21 +++++++++++++++
 R/scan-deps.R         | 62 +++++++++++++++++++++++++++++++++++++++++--
 R/utils.R             |  5 ++++
 3 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/R/scan-deps-queries.R b/R/scan-deps-queries.R
index 28b1fb38..75ab0472 100644
--- a/R/scan-deps-queries.R
+++ b/R/scan-deps-queries.R
@@ -30,3 +30,24 @@ q_deps <- function() {
     NULL
   )
 }
+
+q_deps_rmd <- function() {
+  c(block =
+    '(fenced_code_block
+      (fenced_code_block_delimiter)
+      (info_string (language) @language) @header
+      (code_fence_content) @content
+      (#any-of? @language "r" "R" "rscript" "Rscript")
+      (#match? @header "^[{]")
+    )',
+    inline =
+      '(inline) @inline'
+  )
+}
+
+q_deps_rmd_inline <- function() {
+  '(code_span
+    (code_span_delimiter) @csd1
+    (code_span_delimiter) @csd2
+  ) @code'
+}
diff --git a/R/scan-deps.R b/R/scan-deps.R
index 9ac0730c..074bc02b 100644
--- a/R/scan-deps.R
+++ b/R/scan-deps.R
@@ -31,6 +31,8 @@ clear_deps_cache <- function() {
   unlink(dirname(get_deps_cache_path()), recursive = TRUE)
 }
 
+re_r_dep <- "library|require|loadNamespace|::"
+
 scan_path_deps <- function(path) {
   code <- readBin(path, "raw", file.size(path))
 
@@ -46,7 +48,7 @@ scan_path_deps <- function(path) {
   }
 
   # scan it if it is worth it, based on a quick check
-  has_deps <- length(grepRaw("library|require|loadNamespace|::", code)) > 0
+  has_deps <- length(grepRaw(re_r_dep, code)) > 0
   deps <- if (has_deps) scan_path_deps_do(code, path)
 
   # save it to the cache, but anonimize it first. If no deps, save NULL
@@ -150,5 +152,61 @@ parse_pkg_from_library_call <- function(fn, code) {
 # -------------------------------------------------------------------------
 
 scan_path_deps_do_rmd <- function(code, path) {
-  # TODO
+  hits <- code_query(code, language = "markdown", query = q_deps_rmd())
+  inl_pat <- hits$patterns$id[hits$patterns$name == "inline"]
+  inl_hits <- hits$matched_captures[
+    hits$matched_captures$pattern %in% inl_pat, ]
+  blk_hits <- hits$matched_captures[
+    ! hits$matched_captures$pattern %in% inl_pat, ]
+  rbind(
+    if (nrow(inl_hits)) scan_path_deps_do_inline_hits(code, inl_hits, path),
+    if (nrow(blk_hits)) scan_path_deps_do_block_hits(code, blk_hits, path)
+  )
+}
+
+range_cols <- c(
+  "start_row", "start_column", "end_row", "end_column",
+  "start_byte", "end_byte"
+)
+
+scan_path_deps_do_inline_hits <- function(code, inl_hits, path) {
+  wcnd <- which(inl_hits$name == "inline")
+  wcnd <- wcnd[grepl("`", inl_hits$code[wcnd], fixed = TRUE)]
+  wcnd <- wcnd[grepl(re_r_dep, inl_hits$code[wcnd])]
+  if (length(wcnd) == 0) {
+    return(NULL)
+  }
+
+  inl_ranges <- inl_hits[wcnd, range_cols]
+  r_hits <- code_query(
+    code,
+    language = "markdown-inline",
+    ranges = inl_ranges,
+    query = q_deps_rmd_inline()
+  )
+  cpt <- r_hits$matched_captures
+  pre_drop <- nchar(cpt$code[cpt$name == "csd1"])
+  post_drop <- nchar(cpt$code[cpt$name == "csd2"])
+  r_code <- omit_pre_post(cpt$code[cpt$name == "code"], pre_drop, post_drop)
+  wcnd2 <- substr(r_code, 1, 2) == "r " & grepl(re_r_dep, r_code)
+  if (!any(wcnd2)) {
+    return(NULL)
+  }
+  # need to adjust the ranges for the _ASCII_ (!) delimiters
+  r_ranges <- cpt[cpt$name == "code", ][wcnd2, range_cols]
+  r_ranges$start_byte <- r_ranges$start_byte + pre_drop[wcnd2] + 2L   # 'r '
+  r_ranges$start_column <- r_ranges$start_column + pre_drop[wcnd2] + 2L
+  r_ranges$end_byte <- r_ranges$end_byte - post_drop[wcnd2]
+  scan_path_deps_do_r(code, path = path, ranges = r_ranges)
+}
+
+scan_path_deps_do_block_hits <- function(code, blk_hits, path) {
+  wcnd <- which(blk_hits$name == "content")
+  wcnd <- wcnd[grepl(re_r_dep, blk_hits$code[wcnd])]
+  if (length(wcnd) == 0) {
+    return(NULL)
+  }
+
+  r_ranges <- blk_hits[wcnd, range_cols]
+  scan_path_deps_do_r(code, path = path, ranges = r_ranges)
 }
diff --git a/R/utils.R b/R/utils.R
index 3dd999e3..2a294f79 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -507,3 +507,8 @@ na_omit <- function(x) {
 file_ext <- function(x) {
   re_match(x, "[.]([[:alnum:]]+)$")[[".match"]]
 }
+
+# drop a prefix and a postfix, vectorized
+omit_pre_post <- function(x, pre = 0, post = 0) {
+  substr(x, 1L + pre, nchar(x) - post)
+}