From c1fa0e363903c0c23432d3526456ff5d708ff28b Mon Sep 17 00:00:00 2001
From: Paolo Tranquilli
Date: Wed, 4 Sep 2024 14:00:29 +0200
Subject: [PATCH] Bazel: pull in some dependencies to the internal repo

This allows building the ruby and python packs (and the upcoming
experimental rust one) from within `codeql`.
---
 misc/bazel/glibc_symbols_check.bzl         |  19 +++
 misc/bazel/internal/BUILD.bazel            |   2 +-
 misc/bazel/internal/check_glibc_symbols.py | 173 +++++++++++++++++++++
 misc/bazel/lipo.bzl                        |  44 ++++++
 misc/bazel/platforms/BUILD.bazel           |  66 ++++++++
 misc/bazel/rust.bzl                        |   4 +-
 misc/bazel/transitions.bzl                 | 130 ++++++++++++++++
 7 files changed, 435 insertions(+), 3 deletions(-)
 create mode 100644 misc/bazel/glibc_symbols_check.bzl
 create mode 100755 misc/bazel/internal/check_glibc_symbols.py
 create mode 100644 misc/bazel/lipo.bzl
 create mode 100644 misc/bazel/platforms/BUILD.bazel
 create mode 100644 misc/bazel/transitions.bzl

diff --git a/misc/bazel/glibc_symbols_check.bzl b/misc/bazel/glibc_symbols_check.bzl
new file mode 100644
index 000000000000..910866633d66
--- /dev/null
+++ b/misc/bazel/glibc_symbols_check.bzl
@@ -0,0 +1,19 @@
+load("@rules_python//python:defs.bzl", "py_test")
+
+def glibc_symbols_check(name, binary):
+    """
+    Checks that the supplied binary doesn't use symbols that are not available in older glibc versions.
+    """
+    # Note this accesses system binaries that are not declared anywhere,
+    # thus breaking build hermeticity
+
+    py_test(
+        name = name,
+        srcs = ["//misc/bazel/internal:check_glibc_symbols.py"],
+        main = "//misc/bazel/internal:check_glibc_symbols.py",
+        data = [binary],
+        args = ["$(location :%s)" % binary],
+        target_compatible_with = ["@platforms//os:linux", "//misc/bazel/platforms:bundled"],
+        size = "medium",
+        tags = ["glibc-symbols-check"],
+    )
diff --git a/misc/bazel/internal/BUILD.bazel b/misc/bazel/internal/BUILD.bazel
index d9663e7f0c01..aebc7b610bcb 100644
--- a/misc/bazel/internal/BUILD.bazel
+++ b/misc/bazel/internal/BUILD.bazel
@@ -1 +1 @@
-exports_files(["install.py"])
+exports_files(glob(["*.py"]))
diff --git a/misc/bazel/internal/check_glibc_symbols.py b/misc/bazel/internal/check_glibc_symbols.py
new file mode 100755
index 000000000000..1942fa5132fa
--- /dev/null
+++ b/misc/bazel/internal/check_glibc_symbols.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python
+
+import re
+import subprocess
+import sys
+import os
+import argparse
+
+# Some of these limits are conservative and the required versions could be
+# comfortably increased, especially if they're no newer than the versions that
+# Java depends on.
+default_limits = {
+    'GCC': '3.0',
+
+    # Default limit for versions of GLIBC symbols used by target program.
+    # GLIBC_2.17 was released on 2012-12-25.
+    # https://sourceware.org/glibc/wiki/Glibc%20Timeline
+    'GLIBC': '2.17',
+
+    # Default limit for versions of GLIBCXX (and GLIBCPP) symbols used
+    # by target program. GLIBCXX_3.4 implies at least libstdc++.6.0,
+    # and was adopted by GCC 3.4, released on 18/4/2004.
+    # https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html
+    'GLIBCXX': '3.4',
+    'GLIBCPP': '3.4',
+    'CXXABI': '1.3',
+}
+
+# List of libraries which must not be dynamically loaded
+# On linux, the llvm libc++ libraries are statically linked and should not appear
+# in the output of `ldd`
+no_dynlink_libs = [
+    "libstdc++",
+    "libc++",
+    "libc++abi",
+    "libunwind",
+]
+
+def get_glibc_version():
+    '''Return the glibc version as a float, parsed from the last word of the first line of `ldd --version`'''
+    version = subprocess.check_output(['ldd', '--version']).decode("utf-8").split('\n')[0]
+    return float(version.split(' ')[-1])
+
+def isTrue(var):
+    '''Return True if environment variable `var` is set to 'true', 'yes' or '1' (case-insensitive)'''
+    return var in os.environ and os.environ[var].lower() in ['true', 'yes', '1']
+
+def memoize(f):
+    '''Memoize decorator'''
+    memo = {}
+    def helper(x):
+        if x not in memo:
+            memo[x] = f(x)
+        return memo[x]
+    return helper
+
+def normalise_ver(ver):
+    '''Convert a dot delimited numerical version string into list of integers.
+
+    This conversion facilitates comparisons between version numbers.'''
+    return [int(p) for p in ver.split('.')]
+
+def too_new(cat, ver, limits):
+    '''Compare version string ver with the limit for cat.
+    Return True if ver is greater than the limit, or if there is no limit for cat.'''
+    if cat in limits:
+        limit = limits[cat]
+        return normalise_ver(ver) > normalise_ver(limit)
+    else:
+        return True
+
+@memoize
+def get_libs(prog):
+    '''Get list of dynamically linked libraries'''
+    # Find paths to all libraries linked
+    re_lib = re.compile(r'^.* (/.*) \(0x[0-9a-f]+\)$')
+    try:
+        ldd_output = subprocess.check_output(['ldd', prog], stderr=subprocess.DEVNULL).decode('utf-8')
+    except subprocess.CalledProcessError:
+        # ldd will have a non zero exitcode if the binary is not dynamically linked.
+        return []
+
+    return [m.group(1) for m in [re_lib.search(l) for l in ldd_output.split('\n')] if m]
+
+def gather_min_symbol_versions(prog, limits):
+    '''Inspect the given executable 'prog' using `ldd` to discover which libraries it is linked
+    against. For each library, use `readelf` to discover the symbols therein, and for each
+    symbol with a GLIBC, GLIBCXX or GLIBCPP version record the latest version of each symbol
+    found that does not exceed the given limits, or the earliest available version if no
+    version is found within the limits.
+
+    Return a dict mapping symbol names to (category, version) tuples such as ('GLIBC', '2.5').
+    The dict thus indicates the earliest available versions of each symbol.'''
+    libs = get_libs(prog)
+    # Find earliest versions of all symbols
+    sym_ver = dict()
+    re_sym = re.compile('(\\w+)@+(.+)_([0-9.]+)')
+    for lib in libs:
+        for (sym, cat, ver) in re.findall(re_sym, subprocess.check_output(['readelf', '-Ws', lib]).decode('utf-8')):
+            if sym in sym_ver:
+                (cat2, ver2) = sym_ver[sym]
+                if cat != cat2:
+                    raise Exception('Mismatched categories for symbol: ' + str((sym, cat, ver, cat2, ver2)))
+                if (normalise_ver(ver) < normalise_ver(ver2) and too_new(cat2, ver2, limits)) or \
+                   (normalise_ver(ver) > normalise_ver(ver2) and not too_new(cat, ver, limits)):
+                    sym_ver[sym] = (cat, ver)
+            else:
+                sym_ver[sym] = (cat, ver)
+    return sym_ver
+
+def gather_linked_symbols(prog):
+    '''Inspect the given executable 'prog' using `nm` to discover which undefined symbols it
+    links, and for each symbol with a GLIBC, GLIBCXX, or GLIBCPP style version suffix return
+    a (symbol, category, version) tuple. The result is a list of such tuples.'''
+    re_obj = re.compile('U (\\w+)@+(.+)_([0-9.]+)')
+    return re_obj.findall(subprocess.check_output(['nm', '-u', prog]).decode('utf-8'))
+
+def verify_dynlinked_libraries(prog):
+    '''Return the list of libraries dynamically linked by 'prog'
+    that should not be dynamically loaded. See `no_dynlink_libs`.'''
+    libs = get_libs(prog)
+    bad_libs = []
+    for lib in libs:
+        lib_name = os.path.basename(lib).split(".")[0]
+        if lib_name in no_dynlink_libs:
+            bad_libs += [lib]
+
+    return bad_libs
+
+def main():
+    '''Check the program given on the command line; exit with status 1 if any violation is found'''
+    if isTrue('CODEQL_SKIP_COMPATIBILITY') and not isTrue('CI'):
+        # Respect CODEQL_SKIP_COMPATIBILITY which tells us to skip this check, unless we are on CI
+        sys.exit(0)
+
+    # Verify parameters
+    parser = argparse.ArgumentParser()
+    parser.add_argument("program")
+    # create outfile - this is needed for the bazel aspect integration
+    parser.add_argument("output", nargs="?", type=argparse.FileType('w'))
+    prog = parser.parse_args().program
+
+    # Gather versions of symbols actually linked
+    prog_symbols = gather_linked_symbols(prog)
+    # Check whether any symbols exceed the maximum version restrictions
+    bad_syms = [ (sym, cat, ver) for sym, cat, ver in prog_symbols if too_new(cat, ver, default_limits) ]
+    if bad_syms != []:
+        # Scan for minimum versions of symbols available in linked libraries
+        available_symbols = gather_min_symbol_versions(prog, default_limits)
+        for sym, cat, ver in bad_syms:
+            print(sym + ' is too new or from an unknown category: it requires ' + cat + '_' + ver +
+                ', but we are limited to ' + str(default_limits))
+            if sym in available_symbols:
+                (cat, ver) = available_symbols[sym]
+                if not too_new(cat, ver, default_limits):
+                    print('\tconsider adding: SET_GLIBC_VERSION(%s_%s,%s) { ... } to glibc_compatibility.cpp, ' % (cat, ver, sym))
+                    print('\tand add \'-Wl,--wrap=%s\' when linking. ' % (sym))
+                else:
+                    print('\tThe earliest available symbol has version %s_%s' % (cat, ver))
+
+    bad_libs = verify_dynlinked_libraries(prog)
+    if bad_libs != []:
+        print("Binary dynamically links against:")
+        for bad_lib in bad_libs:
+            print("\t%s" % (bad_lib))
+        print("These libraries should be statically linked on linux")
+
+    if bad_syms != [] or bad_libs != []:
+        sys.exit(1)
+    sys.exit(0)
+
+if __name__ == '__main__':
+    main()
diff --git a/misc/bazel/lipo.bzl b/misc/bazel/lipo.bzl
new file mode 100644
index 000000000000..15801810d388
--- /dev/null
+++ b/misc/bazel/lipo.bzl
@@ -0,0 +1,44 @@
+load("@bazel_skylib//lib:paths.bzl", "paths")
+load("//misc/bazel:transitions.bzl", "forward_binary_from_transition", "get_transition_attrs", "universal_binary_transition")
+
+def _universal_binary_impl(ctx):
+    # Two cases: Either we're on macos, and we need to lipo the two binaries that the transition generated
+    # together, or we're on another platform, where we just copy along the binary, and forward the DefaultInfo data
+    binaries = [dep[DefaultInfo].files_to_run.executable for dep in ctx.attr.dep]
+    if len(binaries) == 0:
+        fail("No executable inputs found")
+
+    (_, extension) = paths.split_extension(binaries[0].basename)
+    new_executable = ctx.actions.declare_file(ctx.label.name + extension)
+
+    # We're using a split transition on the `dep` attribute on macos. If we are on macos, that has the function that
+    # a) ctx.split_attr has two entries (if we need to retrieve the per-architecture binaries), and that
+    # b) ctx.attr.dep is a list with two elements - one for each platform.
+    # While not using a split transition, ctx.attr.dep is a list with one element, as we just have a single platform.
+    # We use this to distinguish whether we should lipo the binaries together, or just forward the binary.
+    if len(binaries) == 1:
+        return forward_binary_from_transition(ctx)
+    else:
+        ctx.actions.run_shell(
+            inputs = binaries,
+            outputs = [new_executable],
+            command = "lipo -create %s -output %s" % (" ".join([binary.path for binary in binaries]), new_executable.path),
+        )
+        files = depset(direct = [new_executable])
+        runfiles = ctx.runfiles([new_executable]).merge_all([dep[DefaultInfo].default_runfiles for dep in ctx.attr.dep])
+
+        providers = [
+            DefaultInfo(
+                files = files,
+                runfiles = runfiles,
+                executable = new_executable,
+            ),
+        ]
+        return providers
+
+universal_binary = rule(
+    implementation = _universal_binary_impl,
+    attrs = get_transition_attrs(universal_binary_transition),
+    doc = """On macOS: Create a universal (fat) binary from the input rule, by applying two transitions and lipoing the result together.
+    No-op on other platforms, just forward the binary.""",
+)
diff --git a/misc/bazel/platforms/BUILD.bazel b/misc/bazel/platforms/BUILD.bazel
new file mode 100644
index 000000000000..2e28de49a1ac
--- /dev/null
+++ b/misc/bazel/platforms/BUILD.bazel
@@ -0,0 +1,66 @@
+package(default_visibility = ["//visibility:public"])
+
+constraint_setting(name = "host_or_bundled")
+
+constraint_value(
+    name = "bundled",
+    constraint_setting = ":host_or_bundled",
+)
+
+constraint_value(
+    name = "host",
+    constraint_setting = ":host_or_bundled",
+)
+
+alias(
+    name = "detected",
+    actual = "@local_config_platform//:host",
+)
+
+platform(
+    name = "bundled_toolchain",
+    constraint_values = [":bundled"],
+    parents = [":detected"],
+)
+
+platform(
+    name = "bundled_toolchain_arm64",
+    constraint_values = ["@platforms//cpu:arm64"],
+    parents = [":bundled_toolchain"],
+)
+
+platform(
+    name = "bundled_toolchain_x86_64",
+    constraint_values = ["@platforms//cpu:x86_64"],
+    parents = [":bundled_toolchain"],
+)
+
+platform(
+    name = "bundled_toolchain_x86_32",
+    constraint_values = ["@platforms//cpu:x86_32"],
+    parents = [":bundled_toolchain"],
+)
+
+platform(
+    name = "host_toolchain",
+    constraint_values = [":host"],
+    parents = [":detected"],
+)
+
+platform(
+    name = "host_toolchain_arm64",
+    constraint_values = ["@platforms//cpu:arm64"],
+    parents = [":host_toolchain"],
+)
+
+platform(
+    name = "host_toolchain_x86_64",
+    constraint_values = ["@platforms//cpu:x86_64"],
+    parents = [":host_toolchain"],
+)
+
+platform(
+    name = "host_toolchain_x86_32",
+    constraint_values = ["@platforms//cpu:x86_32"],
+    parents = [":host_toolchain"],
+)
diff --git a/misc/bazel/rust.bzl b/misc/bazel/rust.bzl
index b858f6b0a80c..f7b33ded9a40 100644
--- a/misc/bazel/rust.bzl
+++ b/misc/bazel/rust.bzl
@@ -1,6 +1,6 @@
 load("@rules_rust//rust:defs.bzl", "rust_binary")
-load("@semmle_code//buildutils-internal:glibc_symbols_check.bzl", "glibc_symbols_check")
-load("@semmle_code//buildutils-internal:lipo.bzl", "universal_binary")
+load("//misc/bazel:glibc_symbols_check.bzl", "glibc_symbols_check")
+load("//misc/bazel:lipo.bzl", "universal_binary")
 
 def codeql_rust_binary(
         name,
diff --git a/misc/bazel/transitions.bzl b/misc/bazel/transitions.bzl
new file mode 100644
index 000000000000..2f4fc67cffaf
--- /dev/null
+++ b/misc/bazel/transitions.bzl
@@ -0,0 +1,130 @@
+load("@bazel_skylib//lib:paths.bzl", "paths")
+
+def _make_platform_transition(impl):
+    return transition(
+        implementation = impl,
+        inputs = ["//command_line_option:platforms"],
+        outputs = ["//command_line_option:platforms"],
+    )
+
+def _platform_transition_impl(_settings, attr):
+    return {
+        "//command_line_option:platforms": [attr.platform],
+    }
+
+# Transition to attr.platform
+platform_transition = _make_platform_transition(_platform_transition_impl)
+
+def _get_platform_for_arch(settings, arch):
+    platform = str(settings["//command_line_option:platforms"][0])
+    if "host_toolchain" in platform:
+        return "//toolchain/platforms:host_toolchain_%s" % arch
+    return "//toolchain/platforms:bundled_toolchain_%s" % arch
+
+def _x86_32_transition_impl(settings, _attr):
+    return {"//command_line_option:platforms": [_get_platform_for_arch(settings, "x86_32")]}
+
+x86_32_transition = _make_platform_transition(_x86_32_transition_impl)
+
+def _x86_64_transition_impl(settings, _attr):
+    return {"//command_line_option:platforms": [_get_platform_for_arch(settings, "x86_64")]}
+
+x86_64_transition = _make_platform_transition(_x86_64_transition_impl)
+
+def _arm64_transition_impl(settings, _attr):
+    return {"//command_line_option:platforms": [_get_platform_for_arch(settings, "arm64")]}
+
+arm64_transition = _make_platform_transition(_arm64_transition_impl)
+
+def get_transition_attrs(transition_rule):
+    return {
+        "_allowlist_function_transition": attr.label(
+            default = "@bazel_tools//tools/allowlists/function_transition_allowlist",
+        ),
+        "dep": attr.label(mandatory = True, cfg = transition_rule),
+    }
+
+def _universal_binary_transition_impl(settings, _attr):
+    # Create a split transition from any macOS cpu to a list of all macOS cpus
+    # Do nothing on other platforms, so that the lipo transition is a no-op
+    if settings["//command_line_option:cpu"].startswith("darwin"):
+        return {
+            "x86_64": {"//command_line_option:platforms": [_get_platform_for_arch(settings, "x86_64")]},
+            "arm64": {"//command_line_option:platforms": [_get_platform_for_arch(settings, "arm64")]},
+        }
+    else:
+        return None
+
+universal_binary_transition = transition(
+    implementation = _universal_binary_transition_impl,
+    inputs = [
+        "//command_line_option:cpu",
+        "//command_line_option:platforms",
+    ],
+    outputs = ["//command_line_option:platforms"],
+)
+
+def forward_binary_from_transition(ctx):
+    # We need to forward the DefaultInfo provider from the underlying rule.
+    # However, we can't do so directly, so instead we need to copy the binary over
+    binary = ctx.attr.dep[0]
+    default_info = binary[DefaultInfo]
+    original_executable = default_info.files_to_run.executable
+    runfiles = default_info.default_runfiles
+    if not original_executable:
+        fail("Cannot transition a 'binary' that is not executable")
+
+    (_, extension) = paths.split_extension(original_executable.basename)
+    new_executable = ctx.actions.declare_file(ctx.label.name + extension)
+    command = "cp %s %s" % (original_executable.path, new_executable.path)
+
+    # when transitioning a dylib, we also need to change the internal name to make it usable by bazel
+    if extension == ".dylib":
+        command += "\ninstall_name_tool -id %s %s" % (new_executable.path, new_executable.path)
+
+    providers = []
+    inputs = [original_executable]
+    if OutputGroupInfo in binary:
+        pdb_file = getattr(binary[OutputGroupInfo], "pdb_file", None)
+        if pdb_file:
+            (pdb_file,) = pdb_file.to_list()
+            linked_pdb_file = ctx.actions.declare_file(ctx.label.name + ".pdb")
+            ctx.actions.symlink(target_file = pdb_file, output = linked_pdb_file)
+
+            # let's put this link into the copy inputs, even if unused
+            # this will force the file to be created even if not explicitly included in outputs
+            inputs.append(linked_pdb_file)
+
+    ctx.actions.run_shell(
+        inputs = inputs,
+        outputs = [new_executable],
+        command = command,
+    )
+    files = depset(direct = [new_executable])
+    runfiles = runfiles.merge(ctx.runfiles([new_executable]))
+
+    providers.append(
+        DefaultInfo(
+            files = files,
+            runfiles = runfiles,
+            executable = new_executable,
+        ),
+    )
+
+    return providers
+
+cc_compile_as_x86_32 = rule(
+    implementation = forward_binary_from_transition,
+    attrs = get_transition_attrs(x86_32_transition),
+)
+
+# needed to force certain dependencies of 32-bit binaries to be compiled as 64-bit binaries
+cc_compile_as_x86_64 = rule(
+    implementation = forward_binary_from_transition,
+    attrs = get_transition_attrs(x86_64_transition),
+)
+
+cc_compile_as_arm64 = rule(
+    implementation = forward_binary_from_transition,
+    attrs = get_transition_attrs(arm64_transition),
+)