From 66155e06a054d1924d31cfd6a5c3f29ffaf77404 Mon Sep 17 00:00:00 2001 From: James Laird-Smith Date: Sun, 15 Oct 2023 19:41:20 +0100 Subject: [PATCH 1/4] Git fetch now robust to tree order As it stands, code for unpacking of git repo packfiles is sensitive to the order of the trees. This doesn't matter for servers that provide it in the order of deepest roots first, but for those that don't do that, unpacking will provide incorrect results. This problem is the case for repos hosted on Azure DevOps, for example. This change uses the frequency of the hashes throught the packfile to sort the trees from deepest (the root tree) to least deep. This is the most straightforward way I could think of for achieving this. Practically, the function for unpacking the packfile (`unpack_packfile_repo()`) now includes a call to a new function `sort_trees()`. --- R/git-protocol.R | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/R/git-protocol.R b/R/git-protocol.R index 1efa5e49..e9e0d4df 100644 --- a/R/git-protocol.R +++ b/R/git-protocol.R @@ -504,9 +504,32 @@ async_git_download_repo_sha <- function(url, sha, output) { then(function(packfile) unpack_packfile_repo(packfile, output)) } +filter_hashes <- function(leaf) { + out <- list() + obs <- leaf$object + obs <- obs[obs$type == "tree", "hash"] + out[["hashes"]] <- obs + out$hash <- leaf$hash + out +} + +sort_trees <- function(trees){ + hashes_tree <- lapply(trees, filter_hashes) + flat_hashes <- character() + for(i in seq_along(hashes_tree)){ + flat_hashes <- append(flat_hashes, hashes_tree[[i]]$hash) + flat_hashes <- append(flat_hashes, hashes_tree[[i]]$hashes) + } + flat_hashes_df <- as.data.frame(table(flat_hashes)) + new_idx <- as.character(flat_hashes_df[order(flat_hashes_df$Freq), + "flat_hashes"]) + trees[new_idx] +} + unpack_packfile_repo <- function(parsed, output) { types <- unname(vcapply(parsed, "[[", "type")) trees <- parsed[types == "tree"] + trees <- sort_trees(trees) done <- logical(length(trees)) idx <- 1L wd <- character() From d26542680d77acd1d3228fb1a16d834c488ea522 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sun, 22 Oct 2023 13:05:40 +0200 Subject: [PATCH 2/4] git: list/unpack pack files in the right order Start from the root tree, which might not be the first one. Closes #339. --- R/git-protocol.R | 65 +++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/R/git-protocol.R b/R/git-protocol.R index e9e0d4df..8ae162c3 100644 --- a/R/git-protocol.R +++ b/R/git-protocol.R @@ -176,10 +176,10 @@ async_git_list_files <- function(url, ref = "HEAD") { sha2 <- ref async_git_resolve_ref(url, ref)$ then(function(sha) { sha2 <<- sha; async_git_fetch(url, sha) })$ - then(function(pf) async_git_list_files_process(pf, ref, sha2)) + then(function(pf) async_git_list_files_process(pf, ref, sha2, url)) } -async_git_list_files_process <- function(packfile, ref, sha) { +async_git_list_files_process <- function(packfile, ref, sha, url) { names(packfile) <- vcapply(packfile, "[[", "hash") types <- unname(vcapply(packfile, "[[", "type")) tree_sizes <- viapply(packfile, function(x) nrow(x$object) %||% NA_integer_) @@ -197,6 +197,9 @@ async_git_list_files_process <- function(packfile, ref, sha) { idx <- 1L wd <- character() + commit <- parse_commit(packfile[[which(types == "commit")]]$object) + tree <- commit[["tree"]] + process_tree <- function(i) { if (done[i]) return() done[i] <<- TRUE @@ -217,10 +220,20 @@ async_git_list_files_process <- function(packfile, ref, sha) { } } - for (i in seq_along(trees)) process_tree(i) - - commit <- parse_commit(packfile[[which(types == "commit")]]$object) - tree <- commit[["tree"]] + # start with the root tree + root <- match(tree, names(trees)) + if (is.na(root)) { + throw(pkg_error( + "Invalid git response from {.url {url}}, cannot find commit tree" + )) + } + process_tree(root) + if (any(!done)) { + warning( + "Some trees are unreachable when listing files from git repo from ", + url + ) + } list( ref = ref, @@ -504,32 +517,9 @@ async_git_download_repo_sha <- function(url, sha, output) { then(function(packfile) unpack_packfile_repo(packfile, output)) } -filter_hashes <- function(leaf) { - out <- list() - obs <- leaf$object - obs <- obs[obs$type == "tree", "hash"] - out[["hashes"]] <- obs - out$hash <- leaf$hash - out -} - -sort_trees <- function(trees){ - hashes_tree <- lapply(trees, filter_hashes) - flat_hashes <- character() - for(i in seq_along(hashes_tree)){ - flat_hashes <- append(flat_hashes, hashes_tree[[i]]$hash) - flat_hashes <- append(flat_hashes, hashes_tree[[i]]$hashes) - } - flat_hashes_df <- as.data.frame(table(flat_hashes)) - new_idx <- as.character(flat_hashes_df[order(flat_hashes_df$Freq), - "flat_hashes"]) - trees[new_idx] -} - unpack_packfile_repo <- function(parsed, output) { types <- unname(vcapply(parsed, "[[", "type")) trees <- parsed[types == "tree"] - trees <- sort_trees(trees) done <- logical(length(trees)) idx <- 1L wd <- character() @@ -561,6 +551,23 @@ unpack_packfile_repo <- function(parsed, output) { } } + commit <- parse_commit(parsed[[which(types == "commit")]]$object) + tree <- commit[["tree"]] + root <- match(tree, names(trees)) + if (is.na(root)) { + throw(pkg_error( + "Invalid git response from {.url {url}}, cannot find commit tree" + )) + } + process_tree(root) + if (any(!done)) { + warning( + "Some trees are unreachable when listing files from git repo from ", + url + ) + } + + for (i in seq_along(trees)) process_tree(i) invisible() From 97668efd087b29095dc28f205f39abaa4af16b0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Mon, 23 Oct 2023 09:23:47 +0200 Subject: [PATCH 3/4] Fix error in unpack_packfile_repo() --- R/git-protocol.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/git-protocol.R b/R/git-protocol.R index 8ae162c3..0f41cb7a 100644 --- a/R/git-protocol.R +++ b/R/git-protocol.R @@ -514,10 +514,10 @@ async_git_download_repo <- function(url, ref = "HEAD", output = ref) { async_git_download_repo_sha <- function(url, sha, output) { url; sha; output async_git_fetch(url, sha, blobs = TRUE)$ - then(function(packfile) unpack_packfile_repo(packfile, output)) + then(function(packfile) unpack_packfile_repo(packfile, output, url)) } -unpack_packfile_repo <- function(parsed, output) { +unpack_packfile_repo <- function(parsed, output, url) { types <- unname(vcapply(parsed, "[[", "type")) trees <- parsed[types == "tree"] done <- logical(length(trees)) From 22d528b1f23d2fdfcf7596a9a22cf6ecfe8f10a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Mon, 23 Oct 2023 09:24:00 +0200 Subject: [PATCH 4/4] Add tests for git PACK file ordering --- tests/testthat/_snaps/git-protocol.md | 43 +++++++++++++++++++++++++++ tests/testthat/test-git-protocol.R | 34 +++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/tests/testthat/_snaps/git-protocol.md b/tests/testthat/_snaps/git-protocol.md index 4b39020d..487a6841 100644 --- a/tests/testthat/_snaps/git-protocol.md +++ b/tests/testthat/_snaps/git-protocol.md @@ -108,6 +108,37 @@ # i 14 more rows +# async_git_list_files_process + + Code + sort(async_git_list_files_process(pack, ref = ref, sha = ref, url = "url")$ + files$path) + Output + [1] ".github" + [2] ".github/actions" + [3] ".github/actions/parameters" + [4] ".github/actions/parameters/action.yml" + [5] ".github/workflows" + [6] ".github/workflows/check-standard.yaml" + [7] ".gitignore" + [8] "foo" + [9] "subdir" + [10] "subdir/dotenv" + [11] "subdir/dotenv/.Rbuildignore" + [12] "subdir/dotenv/.gitignore" + [13] "subdir/dotenv/DESCRIPTION" + [14] "subdir/dotenv/LICENSE" + [15] "subdir/dotenv/NAMESPACE" + [16] "subdir/dotenv/NEWS.md" + [17] "subdir/dotenv/R" + [18] "subdir/dotenv/R/dotenv-package.r" + [19] "subdir/dotenv/README.Rmd" + [20] "subdir/dotenv/README.md" + [21] "subdir/dotenv/man" + [22] "subdir/dotenv/man/dotenv-package.Rd" + [23] "subdir/dotenv/man/load_dot_env.Rd" + [24] "wipe.R" + # git_download_file Code @@ -700,3 +731,15 @@ [10] "v1/subdir/dotenv/man/load_dot_env.Rd" [11] "v1/wipe.R" +# unpack_packfile_repo + + Code + sort(dir(output, recursive = TRUE)) + Output + [1] "foo" "subdir/dotenv/DESCRIPTION" + [3] "subdir/dotenv/LICENSE" "subdir/dotenv/NAMESPACE" + [5] "subdir/dotenv/NEWS.md" "subdir/dotenv/R/dotenv-package.r" + [7] "subdir/dotenv/README.Rmd" "subdir/dotenv/README.md" + [9] "subdir/dotenv/man/dotenv-package.Rd" "subdir/dotenv/man/load_dot_env.Rd" + [11] "wipe.R" + diff --git a/tests/testthat/test-git-protocol.R b/tests/testthat/test-git-protocol.R index cc1bccad..2df1ec16 100644 --- a/tests/testthat/test-git-protocol.R +++ b/tests/testthat/test-git-protocol.R @@ -38,6 +38,24 @@ test_that("git_list_files", { }) }) +test_that("async_git_list_files_process", { + # reordering objects in a PACK file does not matter + ref <- "cefdc0eebcd7f757efb9a80652fd8aaf1a87508e" + pack <- git_fetch(fake_git$url("/pak-test.git"), ref, blobs = FALSE) + + withr::local_seed(13L) + pack <- sample(pack) + + expect_snapshot( + sort(async_git_list_files_process( + pack, + ref = ref, + sha = ref, + url = "url" + )$files$path) + ) +}) + test_that("git_download_file", { skip_on_cran() tmp <- tempfile() @@ -347,3 +365,19 @@ test_that("git_download_repo", { ) expect_snapshot(dir(tmp, recursive = TRUE)) }) + +test_that("unpack_packfile_repo", { + # reordering objects in a PACK file does not matter + ref <- "cefdc0eebcd7f757efb9a80652fd8aaf1a87508e" + pack <- git_fetch(fake_git$url("/pak-test.git"), ref, blobs = TRUE) + + withr::local_seed(13L) + pack <- sample(pack) + + output <- tempfile() + on.exit(unlink(output, recursive = TRUE), add = TRUE) + unpack_packfile_repo(pack, output, url = "url") + expect_snapshot( + sort(dir(output, recursive=TRUE)) + ) +})