From 677520aa8e8aa815cc743116c3a881064289458f Mon Sep 17 00:00:00 2001 From: Paolo Tranquilli Date: Tue, 30 Apr 2024 09:00:14 +0200 Subject: [PATCH 1/5] Bazel: improved lazy lfs files This reintroduces lazy lfs file rules that were removed in https://github.com/github/codeql/pull/16117, now improved. The new rules will make the actual file download go through bazel's download manager, which includes: * caching into the repository cache * sane limiting of concurrent downloads * retries The bulk of the work is done by `git_lfs_probe.py`, which will use the LFS protocol (with authentication via SSH) to output short lived download URLs that can be consumed by `repository_ctx.download`. --- .lfsconfig | 5 ++ misc/bazel/internal/BUILD.bazel | 0 misc/bazel/internal/git_lfs_probe.py | 112 +++++++++++++++++++++++++++ misc/bazel/lfs.bzl | 79 +++++++++++++++++++ 4 files changed, 196 insertions(+) create mode 100644 .lfsconfig create mode 100644 misc/bazel/internal/BUILD.bazel create mode 100755 misc/bazel/internal/git_lfs_probe.py create mode 100644 misc/bazel/lfs.bzl diff --git a/.lfsconfig b/.lfsconfig new file mode 100644 index 000000000000..cb0a8e352e86 --- /dev/null +++ b/.lfsconfig @@ -0,0 +1,5 @@ +[lfs] +# codeql is publicly forked by many users, and we don't want any LFS file polluting their working +# copies. We therefore exclude everything by default. +# For files required by bazel builds, use rules in `misc/bazel/lfs.bzl` to download them on demand. +fetchinclude = /nothing diff --git a/misc/bazel/internal/BUILD.bazel b/misc/bazel/internal/BUILD.bazel new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/misc/bazel/internal/git_lfs_probe.py b/misc/bazel/internal/git_lfs_probe.py new file mode 100755 index 000000000000..3d0ed7679a34 --- /dev/null +++ b/misc/bazel/internal/git_lfs_probe.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 + +""" +Probe lfs files. 
+For each source file provided as input, this will print: +* "local", if the source file is not an LFS pointer +* the sha256 hash, a space character and a transient download link obtained via the LFS protocol otherwise +""" + +import sys +import pathlib +import subprocess +import os +import shutil +import json +import urllib.request +from urllib.parse import urlparse +import re + +sources = [pathlib.Path(arg).resolve() for arg in sys.argv[1:]] +source_dir = pathlib.Path(os.path.commonpath(src.parent for src in sources)) +source_dir = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], cwd=source_dir, text=True).strip() + + +def get_endpoint(): + lfs_env = subprocess.check_output(["git", "lfs", "env"], text=True, cwd=source_dir) + endpoint = ssh_server = ssh_path = None + endpoint_re = re.compile(r'Endpoint(?: \(\S+\))?=(\S+)') + ssh_re = re.compile(r'\s*SSH=(\S*):(.*)') + for line in lfs_env.splitlines(): + m = endpoint_re.match(line) + if m: + if endpoint is None: + endpoint = m[1] + else: + break + m = ssh_re.match(line) + if m: + ssh_server, ssh_path = m.groups() + break + assert endpoint, f"no Endpoint= line found in git lfs env:\n{lfs_env}" + headers = { + "Content-Type": "application/vnd.git-lfs+json", + "Accept": "application/vnd.git-lfs+json", + } + if ssh_server: + ssh_command = shutil.which(os.environ.get("GIT_SSH", os.environ.get("GIT_SSH_COMMAND", "ssh"))) + assert ssh_command, "no ssh command found" + with subprocess.Popen([ssh_command, ssh_server, "git-lfs-authenticate", ssh_path, "download"], + stdout=subprocess.PIPE) as ssh: + resp = json.load(ssh.stdout) + assert ssh.wait() == 0, "ssh command failed" + endpoint = resp.get("href", endpoint) + for k, v in resp.get("header", {}).items(): + headers[k.capitalize()] = v + url = urlparse(endpoint) + # this is how actions/checkout persist credentials + # see https://github.com/actions/checkout/blob/44c2b7a8a4ea60a981eaca3cf939b5f4305c123b/src/git-auth-helper.ts#L56-L63 + auth = 
subprocess.run(["git", "config", f"http.{url.scheme}://{url.netloc}/.extraheader"], text=True, + stdout=subprocess.PIPE, cwd=source_dir).stdout.strip() + for l in auth.splitlines(): + k, _, v = l.partition(": ") + headers[k.capitalize()] = v + if "GITHUB_TOKEN" in os.environ: + headers["Authorization"] = f"token {os.environ['GITHUB_TOKEN']}" + return endpoint, headers + + +# see https://github.com/git-lfs/git-lfs/blob/310d1b4a7d01e8d9d884447df4635c7a9c7642c2/docs/api/basic-transfers.md +def get_locations(objects): + href, headers = get_endpoint() + indexes = [i for i, o in enumerate(objects) if o] + ret = ["local" for _ in objects] + req = urllib.request.Request( + f"{href}/objects/batch", + headers=headers, + data=json.dumps({ + "operation": "download", + "transfers": ["basic"], + "objects": [o for o in objects if o], + "hash_algo": "sha256", + }).encode("ascii"), + ) + with urllib.request.urlopen(req) as resp: + data = json.load(resp) + assert len(data["objects"]) == len(indexes), data + for i, resp in zip(indexes, data["objects"]): + ret[i] = f'{resp["oid"]} {resp["actions"]["download"]["href"]}' + return ret + + +def get_lfs_object(path): + with open(path, 'rb') as fileobj: + lfs_header = "version https://git-lfs.github.com/spec".encode() + actual_header = fileobj.read(len(lfs_header)) + sha256 = size = None + if lfs_header != actual_header: + return None + for line in fileobj: + line = line.decode('ascii').strip() + if line.startswith("oid sha256:"): + sha256 = line[len("oid sha256:"):] + elif line.startswith("size "): + size = int(line[len("size "):]) + if not (sha256 and line): + raise Exception("malformed pointer file") + return {"oid": sha256, "size": size} + + +objects = [get_lfs_object(src) for src in sources] +for resp in get_locations(objects): + print(resp) diff --git a/misc/bazel/lfs.bzl b/misc/bazel/lfs.bzl new file mode 100644 index 000000000000..0f6f981d861b --- /dev/null +++ b/misc/bazel/lfs.bzl @@ -0,0 +1,79 @@ +def lfs_smudge(repository_ctx, 
srcs): + for src in srcs: + repository_ctx.watch(src) + script = Label("//misc/bazel/internal:git_lfs_probe.py") + python = repository_ctx.which("python3") or repository_ctx.which("python") + if not python: + fail("Neither python3 nor python executables found") + res = repository_ctx.execute([python, script] + srcs, quiet = True) + if res.return_code != 0: + fail("git LFS probing failed while instantiating @%s:\n%s" % (repository_ctx.name, res.stderr)) + for src, loc in zip(srcs, res.stdout.splitlines()): + if loc == "local": + repository_ctx.symlink(src, src.basename) + else: + sha256, _, url = loc.partition(" ") + repository_ctx.download(url, src.basename, sha256 = sha256) + +def _download_and_extract_lfs(repository_ctx): + attr = repository_ctx.attr + src = repository_ctx.path(attr.src) + if attr.build_file_content and attr.build_file: + fail("You should specify only one among build_file_content and build_file for rule @%s" % repository_ctx.name) + lfs_smudge(repository_ctx, [src]) + repository_ctx.extract(src.basename, stripPrefix = attr.strip_prefix) + repository_ctx.delete(src.basename) + if attr.build_file_content: + repository_ctx.file("BUILD.bazel", attr.build_file_content) + elif attr.build_file: + repository_ctx.symlink(attr.build_file, "BUILD.bazel") + +def _download_lfs(repository_ctx): + attr = repository_ctx.attr + if int(bool(attr.srcs)) + int(bool(attr.dir)) != 1: + fail("Exactly one between `srcs` and `dir` must be defined for @%s" % repository_ctx.name) + if attr.srcs: + srcs = [repository_ctx.path(src) for src in attr.srcs] + else: + dir = repository_ctx.path(attr.dir) + if not dir.is_dir: + fail("`dir` not a directory in @%s" % repository_ctx.name) + srcs = [f for f in dir.readdir() if not f.is_dir] + lfs_smudge(repository_ctx, srcs) + + # with bzlmod the name is qualified with `~` separators, and we want the base name here + name = repository_ctx.name.split("~")[-1] + repository_ctx.file("BUILD.bazel", """ +exports_files({files}) + +filegroup( 
+ name = "{name}", + srcs = {files}, + visibility = ["//visibility:public"], +) +""".format(name = name, files = repr([src.basename for src in srcs]))) + +lfs_archive = repository_rule( + doc = "Export the contents from an on-demand LFS archive. The corresponding path should be added to be ignored " + + "in `.lfsconfig`.", + implementation = _download_and_extract_lfs, + attrs = { + "src": attr.label(mandatory = True, doc = "Local path to the LFS archive to extract."), + "build_file_content": attr.string(doc = "The content for the BUILD file for this repository. " + + "Either build_file or build_file_content can be specified, but not both."), + "build_file": attr.label(doc = "The file to use as the BUILD file for this repository. " + + "Either build_file or build_file_content can be specified, but not both."), + "strip_prefix": attr.string(default = "", doc = "A directory prefix to strip from the extracted files. "), + }, +) + +lfs_files = repository_rule( + doc = "Export LFS files for on-demand download. Exactly one between `srcs` and `dir` must be defined. The " + + "corresponding paths should be added to be ignored in `.lfsconfig`.", + implementation = _download_lfs, + attrs = { + "srcs": attr.label_list(doc = "Local paths to the LFS files to export."), + "dir": attr.label(doc = "Local path to a directory containing LFS files to export. 
Only the direct contents " + + "of the directory are exported"), + }, +) From 9157dee0db4b2606bec4a0b0c1ec7e6b4067f442 Mon Sep 17 00:00:00 2001 From: Paolo Tranquilli Date: Thu, 2 May 2024 08:53:51 +0200 Subject: [PATCH 2/5] Bazel: integrate `download_and_extract` into `lfs_smudge` --- misc/bazel/lfs.bzl | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/misc/bazel/lfs.bzl b/misc/bazel/lfs.bzl index 0f6f981d861b..1f9c8972923f 100644 --- a/misc/bazel/lfs.bzl +++ b/misc/bazel/lfs.bzl @@ -1,4 +1,4 @@ -def lfs_smudge(repository_ctx, srcs): +def lfs_smudge(repository_ctx, srcs, extract = False, stripPrefix = None): for src in srcs: repository_ctx.watch(src) script = Label("//misc/bazel/internal:git_lfs_probe.py") @@ -10,19 +10,28 @@ def lfs_smudge(repository_ctx, srcs): fail("git LFS probing failed while instantiating @%s:\n%s" % (repository_ctx.name, res.stderr)) for src, loc in zip(srcs, res.stdout.splitlines()): if loc == "local": - repository_ctx.symlink(src, src.basename) + if extract: + repository_ctx.extract(src, stripPrefix = stripPrefix) + else: + repository_ctx.symlink(src, src.basename) else: sha256, _, url = loc.partition(" ") - repository_ctx.download(url, src.basename, sha256 = sha256) + if extract: + # we can't use skylib's `paths.split_extension`, as that only gets the last extension, so `.tar.gz` + # or similar wouldn't work + # it doesn't matter if file is something like some.name.zip and possible_extension == "name.zip", + # download_and_extract will just append ".name.zip" its internal temporary name, so extraction works + possible_extension = ".".join(src.basename.rsplit(".", 2)[-2:]) + repository_ctx.download_and_extract(url, sha256 = sha256, stripPrefix = stripPrefix, type = possible_extension) + else: + repository_ctx.download(url, src.basename, sha256 = sha256) def _download_and_extract_lfs(repository_ctx): attr = repository_ctx.attr src = repository_ctx.path(attr.src) if attr.build_file_content and 
attr.build_file: fail("You should specify only one among build_file_content and build_file for rule @%s" % repository_ctx.name) - lfs_smudge(repository_ctx, [src]) - repository_ctx.extract(src.basename, stripPrefix = attr.strip_prefix) - repository_ctx.delete(src.basename) + lfs_smudge(repository_ctx, [src], extract = True, stripPrefix = attr.strip_prefix) if attr.build_file_content: repository_ctx.file("BUILD.bazel", attr.build_file_content) elif attr.build_file: From daea674095ee9be1f4bddc360e8bb41f28ba788c Mon Sep 17 00:00:00 2001 From: Paolo Tranquilli Date: Thu, 2 May 2024 18:02:22 +0200 Subject: [PATCH 3/5] Bazel: cover standard `https` git credentials in `git_lfs_probe.py` --- misc/bazel/internal/git_lfs_probe.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/misc/bazel/internal/git_lfs_probe.py b/misc/bazel/internal/git_lfs_probe.py index 3d0ed7679a34..1e5602ec8620 100755 --- a/misc/bazel/internal/git_lfs_probe.py +++ b/misc/bazel/internal/git_lfs_probe.py @@ -27,6 +27,7 @@ def get_endpoint(): endpoint = ssh_server = ssh_path = None endpoint_re = re.compile(r'Endpoint(?: \(\S+\))?=(\S+)') ssh_re = re.compile(r'\s*SSH=(\S*):(.*)') + credentials_re = re.compile(r'^password=(.*)$', re.M) for line in lfs_env.splitlines(): m = endpoint_re.match(line) if m: @@ -63,6 +64,15 @@ def get_endpoint(): headers[k.capitalize()] = v if "GITHUB_TOKEN" in os.environ: headers["Authorization"] = f"token {os.environ['GITHUB_TOKEN']}" + if "Authorization" not in headers: + credentials = subprocess.run(["git", "credential", "fill"], cwd=source_dir, stdout=subprocess.PIPE, text=True, + input=f"protocol={url.scheme}\nhost={url.netloc}\npath={url.path[1:]}\n", + check=True).stdout + m = credentials_re.search(credentials) + if m: + headers["Authorization"] = f"token {m[1]}" + else: + print(f"WARNING: no auth credentials found for {endpoint}") return endpoint, headers From ecdf62376d9ad93e4e0e2e2f1f66ad94a9248327 Mon Sep 17 00:00:00 2001 From: Paolo Tranquilli Date: 
Fri, 3 May 2024 09:11:28 +0200 Subject: [PATCH 4/5] Bazel: clean up `git_lfs_probe.py` --- misc/bazel/internal/git_lfs_probe.py | 112 +++++++++++++-------------- 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/misc/bazel/internal/git_lfs_probe.py b/misc/bazel/internal/git_lfs_probe.py index 1e5602ec8620..cde509af7119 100755 --- a/misc/bazel/internal/git_lfs_probe.py +++ b/misc/bazel/internal/git_lfs_probe.py @@ -16,74 +16,78 @@ import urllib.request from urllib.parse import urlparse import re +import base64 +from dataclasses import dataclass + + +@dataclass +class Endpoint: + href: str + headers: dict[str, str] + + def update_headers(self, d: dict[str, str]): + self.headers.update((k.capitalize(), v) for k, v in d.items()) + sources = [pathlib.Path(arg).resolve() for arg in sys.argv[1:]] source_dir = pathlib.Path(os.path.commonpath(src.parent for src in sources)) source_dir = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], cwd=source_dir, text=True).strip() +def get_env(s, sep="="): + ret = {} + for m in re.finditer(fr'(.*?){sep}(.*)', s, re.M): + ret.setdefault(*m.groups()) + return ret + + +def git(*args, **kwargs): + return subprocess.run(("git",) + args, stdout=subprocess.PIPE, text=True, cwd=source_dir, **kwargs).stdout.strip() + + def get_endpoint(): - lfs_env = subprocess.check_output(["git", "lfs", "env"], text=True, cwd=source_dir) - endpoint = ssh_server = ssh_path = None - endpoint_re = re.compile(r'Endpoint(?: \(\S+\))?=(\S+)') - ssh_re = re.compile(r'\s*SSH=(\S*):(.*)') - credentials_re = re.compile(r'^password=(.*)$', re.M) - for line in lfs_env.splitlines(): - m = endpoint_re.match(line) - if m: - if endpoint is None: - endpoint = m[1] - else: - break - m = ssh_re.match(line) - if m: - ssh_server, ssh_path = m.groups() - break - assert endpoint, f"no Endpoint= line found in git lfs env:\n{lfs_env}" - headers = { + lfs_env = get_env(subprocess.check_output(["git", "lfs", "env"], text=True, cwd=source_dir)) + endpoint = 
next(v for k, v in lfs_env.items() if k.startswith('Endpoint')) + endpoint, _, _ = endpoint.partition(' ') + ssh_endpoint = lfs_env.get(" SSH") + endpoint = Endpoint(endpoint, { "Content-Type": "application/vnd.git-lfs+json", "Accept": "application/vnd.git-lfs+json", - } - if ssh_server: + }) + if ssh_endpoint: + # see https://github.com/git-lfs/git-lfs/blob/main/docs/api/authentication.md + server, _, path = ssh_endpoint.partition(":") ssh_command = shutil.which(os.environ.get("GIT_SSH", os.environ.get("GIT_SSH_COMMAND", "ssh"))) assert ssh_command, "no ssh command found" - with subprocess.Popen([ssh_command, ssh_server, "git-lfs-authenticate", ssh_path, "download"], - stdout=subprocess.PIPE) as ssh: - resp = json.load(ssh.stdout) - assert ssh.wait() == 0, "ssh command failed" - endpoint = resp.get("href", endpoint) - for k, v in resp.get("header", {}).items(): - headers[k.capitalize()] = v - url = urlparse(endpoint) + resp = json.loads(subprocess.check_output([ssh_command, server, "git-lfs-authenticate", path, "download"])) + endpoint.href = resp.get("href", endpoint.href) + endpoint.update_headers(resp.get("header", {})) + url = urlparse(endpoint.href) # this is how actions/checkout persist credentials # see https://github.com/actions/checkout/blob/44c2b7a8a4ea60a981eaca3cf939b5f4305c123b/src/git-auth-helper.ts#L56-L63 - auth = subprocess.run(["git", "config", f"http.{url.scheme}://{url.netloc}/.extraheader"], text=True, - stdout=subprocess.PIPE, cwd=source_dir).stdout.strip() - for l in auth.splitlines(): - k, _, v = l.partition(": ") - headers[k.capitalize()] = v + auth = git("config", f"http.{url.scheme}://{url.netloc}/.extraheader") + endpoint.update_headers(get_env(auth, sep=": ")) if "GITHUB_TOKEN" in os.environ: - headers["Authorization"] = f"token {os.environ['GITHUB_TOKEN']}" - if "Authorization" not in headers: - credentials = subprocess.run(["git", "credential", "fill"], cwd=source_dir, stdout=subprocess.PIPE, text=True, - 
input=f"protocol={url.scheme}\nhost={url.netloc}\npath={url.path[1:]}\n", - check=True).stdout - m = credentials_re.search(credentials) - if m: - headers["Authorization"] = f"token {m[1]}" - else: - print(f"WARNING: no auth credentials found for {endpoint}") - return endpoint, headers + endpoint.headers["Authorization"] = f"token {os.environ['GITHUB_TOKEN']}" + if "Authorization" not in endpoint.headers: + # last chance: use git credentials (possibly backed by a credential helper like the one installed by gh) + # see https://git-scm.com/docs/git-credential + credentials = get_env(git("credential", "fill", check=True, + # drop leading / from url.path + input=f"protocol={url.scheme}\nhost={url.netloc}\npath={url.path[1:]}\n")) + auth = base64.b64encode(f'{credentials["username"]}:{credentials["password"]}'.encode()).decode('ascii') + endpoint.headers["Authorization"] = f"Basic {auth}" + return endpoint # see https://github.com/git-lfs/git-lfs/blob/310d1b4a7d01e8d9d884447df4635c7a9c7642c2/docs/api/basic-transfers.md def get_locations(objects): - href, headers = get_endpoint() + endpoint = get_endpoint() indexes = [i for i, o in enumerate(objects) if o] ret = ["local" for _ in objects] req = urllib.request.Request( - f"{href}/objects/batch", - headers=headers, + f"{endpoint.href}/objects/batch", + headers=endpoint.headers, data=json.dumps({ "operation": "download", "transfers": ["basic"], @@ -93,7 +97,7 @@ def get_locations(objects): ) with urllib.request.urlopen(req) as resp: data = json.load(resp) - assert len(data["objects"]) == len(indexes), data + assert len(data["objects"]) == len(indexes), f"received {len(data['objects'])} objects, expected {len(indexes)}" for i, resp in zip(indexes, data["objects"]): ret[i] = f'{resp["oid"]} {resp["actions"]["download"]["href"]}' return ret @@ -106,14 +110,10 @@ def get_lfs_object(path): sha256 = size = None if lfs_header != actual_header: return None - for line in fileobj: - line = line.decode('ascii').strip() - if line.startswith("oid 
sha256:"): - sha256 = line[len("oid sha256:"):] - elif line.startswith("size "): - size = int(line[len("size "):]) - if not (sha256 and line): - raise Exception("malformed pointer file") + data = get_env(fileobj.read().decode('ascii'), sep=' ') + assert data['oid'].startswith('sha256:'), f"unknown oid type: {data['oid']}" + _, _, sha256 = data['oid'].partition(':') + size = int(data['size']) return {"oid": sha256, "size": size} From 6cbe16e0c2faa539f1e074e4a6a7d20c9dd93cf4 Mon Sep 17 00:00:00 2001 From: Paolo Tranquilli Date: Fri, 3 May 2024 12:00:15 +0200 Subject: [PATCH 5/5] Bazel: add progress reporting --- misc/bazel/lfs.bzl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/misc/bazel/lfs.bzl b/misc/bazel/lfs.bzl index 1f9c8972923f..4ba66c9dbfc6 100644 --- a/misc/bazel/lfs.bzl +++ b/misc/bazel/lfs.bzl @@ -5,14 +5,18 @@ def lfs_smudge(repository_ctx, srcs, extract = False, stripPrefix = None): python = repository_ctx.which("python3") or repository_ctx.which("python") if not python: fail("Neither python3 nor python executables found") + repository_ctx.report_progress("querying LFS url(s) for: %s" % ", ".join([src.basename for src in srcs])) res = repository_ctx.execute([python, script] + srcs, quiet = True) if res.return_code != 0: fail("git LFS probing failed while instantiating @%s:\n%s" % (repository_ctx.name, res.stderr)) + promises = [] for src, loc in zip(srcs, res.stdout.splitlines()): if loc == "local": if extract: + repository_ctx.report_progress("extracting local %s" % src.basename) repository_ctx.extract(src, stripPrefix = stripPrefix) else: + repository_ctx.report_progress("symlinking local %s" % src.basename) repository_ctx.symlink(src, src.basename) else: sha256, _, url = loc.partition(" ") @@ -22,8 +26,10 @@ def lfs_smudge(repository_ctx, srcs, extract = False, stripPrefix = None): # it doesn't matter if file is something like some.name.zip and possible_extension == "name.zip", # download_and_extract will just append ".name.zip" its internal 
temporary name, so extraction works possible_extension = ".".join(src.basename.rsplit(".", 2)[-2:]) + repository_ctx.report_progress("downloading and extracting remote %s" % src.basename) repository_ctx.download_and_extract(url, sha256 = sha256, stripPrefix = stripPrefix, type = possible_extension) else: + repository_ctx.report_progress("downloading remote %s" % src.basename) repository_ctx.download(url, src.basename, sha256 = sha256) def _download_and_extract_lfs(repository_ctx):