From df6267fdfab784fa7961ca011bfdf49e32398bd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Barri=C3=A9?= Date: Mon, 29 Jan 2024 16:24:20 +0100 Subject: [PATCH] Revalidate stale entries using a digest. Ref: https://github.com/Shopify/bootsnap/issues/336 Bootsnap was initially designed for improving boot time in development, so it was logical to use `mtime` to detect changes given that's reliable on a given machine. But is just as useful on production and CI environments, however there its hit rate can vary a lot because depending on how the source code and caches are saved and restored, many if not all `mtime` will have changed. To improve this, we can first try to revalidate using the `mtime`, and if it fails, fallback to compare a digest of the file content. Digesting a file, even with `fnv1a_64` is of course an overhead, but the assumption is that true misses should be relatively rare and that digesting the file will always be faster than compiling it. So even if it only improve the hit rate marginally, it should be faster overall. Also we only recompute the digest if the file mtime changed, but its size remained the same, which should discard the overwhelming majority of legitimate source file changes. Co-authored-by: Jean Boussier --- CHANGELOG.md | 3 + README.md | 2 +- ext/bootsnap/bootsnap.c | 169 +++++++++++++++--- ext/bootsnap/extconf.rb | 2 + test/compile_cache_key_format_test.rb | 2 +- test/compile_cache_test.rb | 48 ++++- .../core_ext/kernel_require_test.rb | 8 +- 7 files changed, 197 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 788470b..8feb82f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Unreleased +* Revalidate stale cache entries by digesting the source content. + This should significantly improve performance in environments where `mtime` isn't preserved (e.g. CI systems doing a git clone, etc). + See #468. * Open source files and cache entries with `O_NOATIME` when available to reduce disk accesses. See #469. * `bootsnap precompile --gemfile` now look for `.rb` files in the whole gem and not just the `lib/` directory. See #466. diff --git a/README.md b/README.md index 08167de..333607f 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,7 @@ Bootsnap cache misses can be monitored though a callback: Bootsnap.instrumentation = ->(event, path) { puts "#{event} #{path}" } ``` -`event` is either `:miss` or `:stale`. You can also call `Bootsnap.log!` as a shortcut to +`event` is either `:miss`, `:stale` or `:revalidated`. You can also call `Bootsnap.log!` as a shortcut to log all events to STDERR. To turn instrumentation back off you can set it to nil: diff --git a/ext/bootsnap/bootsnap.c b/ext/bootsnap/bootsnap.c index 439c2ac..412fa48 100644 --- a/ext/bootsnap/bootsnap.c +++ b/ext/bootsnap/bootsnap.c @@ -58,8 +58,10 @@ struct bs_cache_key { uint32_t ruby_revision; uint64_t size; uint64_t mtime; - uint64_t data_size; /* not used for equality */ - uint8_t pad[24]; + uint64_t data_size; // + uint64_t digest; + uint8_t digest_set; + uint8_t pad[15]; } __attribute__((packed)); /* @@ -73,7 +75,7 @@ struct bs_cache_key { STATIC_ASSERT(sizeof(struct bs_cache_key) == KEY_SIZE); /* Effectively a schema version. Bumping invalidates all previous caches */ -static const uint32_t current_version = 4; +static const uint32_t current_version = 5; /* hash of e.g. "x86_64-darwin17", invalidating when ruby is recompiled on a * new OS ABI, etc. */ @@ -93,6 +95,7 @@ static VALUE rb_cBootsnap_CompileCache_UNCOMPILABLE; static ID instrumentation_method; static VALUE sym_miss; static VALUE sym_stale; +static VALUE sym_revalidated; static bool instrumentation_enabled = false; static bool readonly = false; @@ -104,9 +107,18 @@ static VALUE bs_rb_fetch(VALUE self, VALUE cachedir_v, VALUE path_v, VALUE handl static VALUE bs_rb_precompile(VALUE self, VALUE cachedir_v, VALUE path_v, VALUE handler); /* Helpers */ +enum cache_status { + miss, + hit, + stale, +}; static void bs_cache_path(const char * cachedir, const VALUE path, char (* cache_path)[MAX_CACHEPATH_SIZE]); static int bs_read_key(int fd, struct bs_cache_key * key); -static int cache_key_equal(struct bs_cache_key * k1, struct bs_cache_key * k2); +static enum cache_status cache_key_equal_fast_path(struct bs_cache_key * k1, struct bs_cache_key * k2); +static int cache_key_equal_slow_path(struct bs_cache_key * current_key, struct bs_cache_key * cached_key, const VALUE input_data); +static int update_cache_key(struct bs_cache_key *current_key, int cache_fd, const char ** errno_provenance); + +static void bs_cache_key_digest(struct bs_cache_key * key, const VALUE input_data); static VALUE bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args); static VALUE bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler); static int open_current_file(char * path, struct bs_cache_key * key, const char ** errno_provenance); @@ -171,6 +183,9 @@ Init_bootsnap(void) sym_stale = ID2SYM(rb_intern("stale")); rb_global_variable(&sym_stale); + sym_revalidated = ID2SYM(rb_intern("revalidated")); + rb_global_variable(&sym_revalidated); + rb_define_module_function(rb_mBootsnap, "instrumentation_enabled=", bs_instrumentation_enabled_set, 1); rb_define_module_function(rb_mBootsnap_CompileCache_Native, "readonly=", bs_readonly_set, 1); rb_define_module_function(rb_mBootsnap_CompileCache_Native, "coverage_running?", bs_rb_coverage_running, 0); @@ -189,6 +204,14 @@ bs_instrumentation_enabled_set(VALUE self, VALUE enabled) return enabled; } +static inline void +bs_instrumentation(VALUE event, VALUE path) +{ + if (RB_UNLIKELY(instrumentation_enabled)) { + rb_funcall(rb_mBootsnap, instrumentation_method, 2, event, path); + } +} + static VALUE bs_readonly_set(VALUE self, VALUE enabled) { @@ -294,17 +317,53 @@ bs_cache_path(const char * cachedir, const VALUE path, char (* cache_path)[MAX_C * The data_size member is not compared, as it serves more of a "header" * function. */ -static int -cache_key_equal(struct bs_cache_key * k1, struct bs_cache_key * k2) +static enum cache_status cache_key_equal_fast_path(struct bs_cache_key *k1, + struct bs_cache_key *k2) { + if (k1->version == k2->version && + k1->ruby_platform == k2->ruby_platform && + k1->compile_option == k2->compile_option && + k1->ruby_revision == k2->ruby_revision && k1->size == k2->size) { + return (k1->mtime == k2->mtime) ? hit : stale; + } + return miss; +} + +static int cache_key_equal_slow_path(struct bs_cache_key *current_key, + struct bs_cache_key *cached_key, + const VALUE input_data) { - return ( - k1->version == k2->version && - k1->ruby_platform == k2->ruby_platform && - k1->compile_option == k2->compile_option && - k1->ruby_revision == k2->ruby_revision && - k1->size == k2->size && - k1->mtime == k2->mtime - ); + bs_cache_key_digest(current_key, input_data); + return current_key->digest == cached_key->digest; +} + +static int update_cache_key(struct bs_cache_key *current_key, int cache_fd, const char ** errno_provenance) +{ + lseek(cache_fd, 0, SEEK_SET); + ssize_t nwrite = write(cache_fd, current_key, KEY_SIZE); + if (nwrite < 0) { + *errno_provenance = "update_cache_key:write"; + return -1; + } + +#ifdef HAVE_FDATASYNC + if (fdatasync(cache_fd) < 0) { + *errno_provenance = "update_cache_key:fdatasync"; + return -1; + } +#endif + + return 0; +} + +/* + * Fills the cache key digest. + */ +static void bs_cache_key_digest(struct bs_cache_key *key, + const VALUE input_data) { + if (key->digest_set) + return; + key->digest = fnv1a_64(input_data); + key->digest_set = 1; } /* @@ -393,6 +452,7 @@ open_current_file(char * path, struct bs_cache_key * key, const char ** errno_pr key->ruby_revision = current_ruby_revision; key->size = (uint64_t)statbuf.st_size; key->mtime = (uint64_t)statbuf.st_mtime; + key->digest_set = false; return fd; } @@ -436,7 +496,12 @@ open_cache_file(const char * path, struct bs_cache_key * key, const char ** errn { int fd, res; - fd = open(path, O_RDONLY | O_NOATIME); + if (readonly) { + fd = open(path, O_RDONLY | O_NOATIME); + } else { + fd = open(path, O_RDWR | O_NOATIME); + } + if (fd < 0) { *errno_provenance = "bs_fetch:open_cache_file:open"; return CACHE_MISS; @@ -681,7 +746,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args int res, valid_cache = 0, exception_tag = 0; const char * errno_provenance = NULL; - VALUE input_data; /* data read from source file, e.g. YAML or ruby source */ + VALUE input_data = Qfalse; /* data read from source file, e.g. YAML or ruby source */ VALUE storage_data; /* compiled data, e.g. msgpack / binary iseq */ VALUE output_data; /* return data, e.g. ruby hash or loaded iseq */ @@ -699,20 +764,43 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args cache_fd = open_cache_file(cache_path, &cached_key, &errno_provenance); if (cache_fd == CACHE_MISS || cache_fd == CACHE_STALE) { /* This is ok: valid_cache remains false, we re-populate it. */ - if (RB_UNLIKELY(instrumentation_enabled)) { - rb_funcall(rb_mBootsnap, instrumentation_method, 2, cache_fd == CACHE_MISS ? sym_miss : sym_stale, path_v); - } + bs_instrumentation(cache_fd == CACHE_MISS ? sym_miss : sym_stale, path_v); } else if (cache_fd < 0) { exception_message = rb_str_new_cstr(cache_path); goto fail_errno; } else { /* True if the cache existed and no invalidating changes have occurred since * it was generated. */ - valid_cache = cache_key_equal(¤t_key, &cached_key); - if (RB_UNLIKELY(instrumentation_enabled)) { - if (!valid_cache) { - rb_funcall(rb_mBootsnap, instrumentation_method, 2, sym_stale, path_v); + + switch(cache_key_equal_fast_path(¤t_key, &cached_key)) { + case hit: + valid_cache = true; + break; + case miss: + valid_cache = false; + break; + case stale: + valid_cache = false; + if ((input_data = bs_read_contents(current_fd, current_key.size, + &errno_provenance)) == Qfalse) { + exception_message = path_v; + goto fail_errno; + } + valid_cache = cache_key_equal_slow_path(¤t_key, &cached_key, input_data); + if (valid_cache) { + if (!readonly) { + if (update_cache_key(¤t_key, cache_fd, &errno_provenance)) { + exception_message = path_v; + goto fail_errno; + } + } + bs_instrumentation(sym_revalidated, path_v); } + break; + }; + + if (!valid_cache) { + bs_instrumentation(sym_stale, path_v); } } @@ -726,7 +814,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args else if (res == CACHE_UNCOMPILABLE) { /* If fetch_cached_data returned `Uncompilable` we fallback to `input_to_output` This happens if we have say, an unsafe YAML cache, but try to load it in safe mode */ - if ((input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse){ + if (input_data == Qfalse && (input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse) { exception_message = path_v; goto fail_errno; } @@ -745,7 +833,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args /* Cache is stale, invalid, or missing. Regenerate and write it out. */ /* Read the contents of the source file into a buffer */ - if ((input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse){ + if (input_data == Qfalse && (input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse) { exception_message = path_v; goto fail_errno; } @@ -767,6 +855,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args * We do however ignore any failures to persist the cache, as it's better * to move along, than to interrupt the process. */ + bs_cache_key_digest(¤t_key, input_data); atomic_write_cache_file(cache_path, ¤t_key, storage_data, &errno_provenance); /* Having written the cache, now convert storage_data to output_data */ @@ -822,12 +911,16 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args static VALUE bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler) { + if (readonly) { + return Qfalse; + } + struct bs_cache_key cached_key, current_key; int cache_fd = -1, current_fd = -1; int res, valid_cache = 0, exception_tag = 0; const char * errno_provenance = NULL; - VALUE input_data; /* data read from source file, e.g. YAML or ruby source */ + VALUE input_data = Qfalse; /* data read from source file, e.g. YAML or ruby source */ VALUE storage_data; /* compiled data, e.g. msgpack / binary iseq */ /* Open the source file and generate a cache key for it */ @@ -843,7 +936,28 @@ bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler) } else { /* True if the cache existed and no invalidating changes have occurred since * it was generated. */ - valid_cache = cache_key_equal(¤t_key, &cached_key); + switch(cache_key_equal_fast_path(¤t_key, &cached_key)) { + case hit: + valid_cache = true; + break; + case miss: + valid_cache = false; + break; + case stale: + valid_cache = false; + if ((input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse) { + goto fail; + } + valid_cache = cache_key_equal_slow_path(¤t_key, &cached_key, input_data); + if (valid_cache) { + if (update_cache_key(¤t_key, cache_fd, &errno_provenance)) { + goto fail; + } + + bs_instrumentation(sym_revalidated, path_v); + } + break; + }; } if (valid_cache) { @@ -870,6 +984,7 @@ bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler) if (!RB_TYPE_P(storage_data, T_STRING)) goto fail; /* Write the cache key and storage_data to the cache directory */ + bs_cache_key_digest(¤t_key, input_data); res = atomic_write_cache_file(cache_path, ¤t_key, storage_data, &errno_provenance); if (res < 0) goto fail; diff --git a/ext/bootsnap/extconf.rb b/ext/bootsnap/extconf.rb index 7e4dca5..5d61d38 100644 --- a/ext/bootsnap/extconf.rb +++ b/ext/bootsnap/extconf.rb @@ -3,6 +3,8 @@ require "mkmf" if %w[ruby truffleruby].include?(RUBY_ENGINE) + have_func "fdatasync", "fcntl.h" + unless RUBY_PLATFORM.match?(/mswin|mingw|cygwin/) append_cppflags ["_GNU_SOURCE"] # Needed of O_NOATIME end diff --git a/test/compile_cache_key_format_test.rb b/test/compile_cache_key_format_test.rb index 3817587..5108374 100644 --- a/test/compile_cache_key_format_test.rb +++ b/test/compile_cache_key_format_test.rb @@ -22,7 +22,7 @@ class CompileCacheKeyFormatTest < Minitest::Test def test_key_version key = cache_key_for_file(FILE) - exp = [4].pack("L") + exp = [5].pack("L") assert_equal(exp, key[R[:version]]) end diff --git a/test/compile_cache_test.rb b/test/compile_cache_test.rb index d17452f..5ed37cf 100644 --- a/test/compile_cache_test.rb +++ b/test/compile_cache_test.rb @@ -9,6 +9,7 @@ class CompileCacheTest < Minitest::Test def teardown super Bootsnap::CompileCache::Native.readonly = false + Bootsnap.instrumentation = nil end def test_compile_option_crc32 @@ -158,6 +159,34 @@ def test_dont_store_cache_after_a_stale_when_readonly load(path) end + def test_dont_revalidate_when_readonly + path = Help.set_file("a.rb", "a = a = 3", 100) + load(path) + + entries = Dir["#{Bootsnap::CompileCache::ISeq.cache_dir}/**/*"].select { |f| File.file?(f) } + assert_equal 1, entries.size + cache_entry = entries.first + old_cache_content = File.binread(cache_entry) + + Bootsnap::CompileCache::Native.readonly = true + + output = RubyVM::InstructionSequence.compile_file(path) + Bootsnap::CompileCache::ISeq.expects(:input_to_storage).never + Bootsnap::CompileCache::ISeq.expects(:storage_to_output).once.returns(output) + Bootsnap::CompileCache::ISeq.expects(:input_to_output).never + + FileUtils.touch(path, mtime: File.mtime(path) + 50) + + calls = [] + Bootsnap.instrumentation = ->(event, source_path) { calls << [event, source_path] } + load(path) + + assert_equal [[:revalidated, "a.rb"]], calls + + new_cache_content = File.binread(cache_entry) + assert_equal old_cache_content, new_cache_content, "Cache entry was mutated" + end + def test_invalid_cache_file path = Help.set_file("a.rb", "a = a = 3", 100) cp = Help.cache_path("#{@tmp_dir}-iseq", path) @@ -177,8 +206,6 @@ def test_instrumentation_hit load(file_path) assert_equal [], calls - ensure - Bootsnap.instrumentation = nil end def test_instrumentation_miss @@ -190,8 +217,19 @@ def test_instrumentation_miss load(file_path) assert_equal [[:miss, "a.rb"]], calls - ensure - Bootsnap.instrumentation = nil + end + + def test_instrumentation_revalidate + file_path = Help.set_file("a.rb", "a = a = 3", 100) + load(file_path) + FileUtils.touch("a.rb", mtime: File.mtime("a.rb") + 42) + + calls = [] + Bootsnap.instrumentation = ->(event, path) { calls << [event, path] } + + load(file_path) + + assert_equal [[:revalidated, "a.rb"]], calls end def test_instrumentation_stale @@ -205,7 +243,5 @@ def test_instrumentation_stale load(file_path) assert_equal [[:stale, "a.rb"]], calls - ensure - Bootsnap.instrumentation = nil end end diff --git a/test/load_path_cache/core_ext/kernel_require_test.rb b/test/load_path_cache/core_ext/kernel_require_test.rb index dc120ea..4aa1b7e 100644 --- a/test/load_path_cache/core_ext/kernel_require_test.rb +++ b/test/load_path_cache/core_ext/kernel_require_test.rb @@ -12,7 +12,7 @@ def test_uses_the_same_duck_type_as_require assert_nil LoadPathCache.load_path_cache cache = Tempfile.new("cache") pid = Process.fork do - LoadPathCache.setup(cache_path: cache, development_mode: true, ignore_directories: nil) + LoadPathCache.setup(cache_path: cache.path, development_mode: true, ignore_directories: nil) dir = File.realpath(Dir.mktmpdir) $LOAD_PATH.push(dir) FileUtils.touch("#{dir}/a.rb") @@ -40,7 +40,7 @@ def test_load_static_libaries assert_nil LoadPathCache.load_path_cache cache = Tempfile.new("cache") pid = Process.fork do - LoadPathCache.setup(cache_path: cache, development_mode: false, ignore_directories: nil) + LoadPathCache.setup(cache_path: cache.path, development_mode: false, ignore_directories: nil) require("prism") end _, status = Process.wait2(pid) @@ -53,7 +53,10 @@ def test_load_static_libaries end class KernelLoadTest < Minitest::Test + include TmpdirHelper + def setup + super @initial_dir = Dir.pwd @dir1 = File.realpath(Dir.mktmpdir) FileUtils.touch("#{@dir1}/a.rb") @@ -70,6 +73,7 @@ def teardown Dir.chdir(@initial_dir) FileUtils.rm_rf(@dir1) FileUtils.rm_rf(@dir2) + super end def test_no_exstensions_for_kernel_load