Skip to content

Commit

Permalink
Merge pull request #468 from Shopify/revalidate-mtime
Browse files Browse the repository at this point in the history
Revalidate cache based on source digest
  • Loading branch information
casperisfine authored Jan 30, 2024
2 parents c12002f + df6267f commit 4a91add
Show file tree
Hide file tree
Showing 7 changed files with 197 additions and 37 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Unreleased

* Revalidate stale cache entries by digesting the source content.
This should significantly improve performance in environments where `mtime` isn't preserved (e.g. CI systems doing a git clone, etc).
See #468.
* Open source files and cache entries with `O_NOATIME` when available to reduce disk accesses. See #469.
* `bootsnap precompile --gemfile` now look for `.rb` files in the whole gem and not just the `lib/` directory. See #466.

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ Bootsnap cache misses can be monitored though a callback:
Bootsnap.instrumentation = ->(event, path) { puts "#{event} #{path}" }
```

`event` is either `:miss` or `:stale`. You can also call `Bootsnap.log!` as a shortcut to
`event` is either `:miss`, `:stale` or `:revalidated`. You can also call `Bootsnap.log!` as a shortcut to
log all events to STDERR.

To turn instrumentation back off you can set it to nil:
Expand Down
169 changes: 142 additions & 27 deletions ext/bootsnap/bootsnap.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,10 @@ struct bs_cache_key {
uint32_t ruby_revision;
uint64_t size;
uint64_t mtime;
uint64_t data_size; /* not used for equality */
uint8_t pad[24];
uint64_t data_size; //
uint64_t digest;
uint8_t digest_set;
uint8_t pad[15];
} __attribute__((packed));

/*
Expand All @@ -73,7 +75,7 @@ struct bs_cache_key {
STATIC_ASSERT(sizeof(struct bs_cache_key) == KEY_SIZE);

/* Effectively a schema version. Bumping invalidates all previous caches */
static const uint32_t current_version = 4;
static const uint32_t current_version = 5;

/* hash of e.g. "x86_64-darwin17", invalidating when ruby is recompiled on a
* new OS ABI, etc. */
Expand All @@ -93,6 +95,7 @@ static VALUE rb_cBootsnap_CompileCache_UNCOMPILABLE;
static ID instrumentation_method;
static VALUE sym_miss;
static VALUE sym_stale;
static VALUE sym_revalidated;
static bool instrumentation_enabled = false;
static bool readonly = false;

Expand All @@ -104,9 +107,18 @@ static VALUE bs_rb_fetch(VALUE self, VALUE cachedir_v, VALUE path_v, VALUE handl
static VALUE bs_rb_precompile(VALUE self, VALUE cachedir_v, VALUE path_v, VALUE handler);

/* Helpers */
enum cache_status {
miss,
hit,
stale,
};
static void bs_cache_path(const char * cachedir, const VALUE path, char (* cache_path)[MAX_CACHEPATH_SIZE]);
static int bs_read_key(int fd, struct bs_cache_key * key);
static int cache_key_equal(struct bs_cache_key * k1, struct bs_cache_key * k2);
static enum cache_status cache_key_equal_fast_path(struct bs_cache_key * k1, struct bs_cache_key * k2);
static int cache_key_equal_slow_path(struct bs_cache_key * current_key, struct bs_cache_key * cached_key, const VALUE input_data);
static int update_cache_key(struct bs_cache_key *current_key, int cache_fd, const char ** errno_provenance);

static void bs_cache_key_digest(struct bs_cache_key * key, const VALUE input_data);
static VALUE bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args);
static VALUE bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler);
static int open_current_file(char * path, struct bs_cache_key * key, const char ** errno_provenance);
Expand Down Expand Up @@ -171,6 +183,9 @@ Init_bootsnap(void)
sym_stale = ID2SYM(rb_intern("stale"));
rb_global_variable(&sym_stale);

sym_revalidated = ID2SYM(rb_intern("revalidated"));
rb_global_variable(&sym_revalidated);

rb_define_module_function(rb_mBootsnap, "instrumentation_enabled=", bs_instrumentation_enabled_set, 1);
rb_define_module_function(rb_mBootsnap_CompileCache_Native, "readonly=", bs_readonly_set, 1);
rb_define_module_function(rb_mBootsnap_CompileCache_Native, "coverage_running?", bs_rb_coverage_running, 0);
Expand All @@ -189,6 +204,14 @@ bs_instrumentation_enabled_set(VALUE self, VALUE enabled)
return enabled;
}

static inline void
bs_instrumentation(VALUE event, VALUE path)
{
if (RB_UNLIKELY(instrumentation_enabled)) {
rb_funcall(rb_mBootsnap, instrumentation_method, 2, event, path);
}
}

static VALUE
bs_readonly_set(VALUE self, VALUE enabled)
{
Expand Down Expand Up @@ -294,17 +317,53 @@ bs_cache_path(const char * cachedir, const VALUE path, char (* cache_path)[MAX_C
* The data_size member is not compared, as it serves more of a "header"
* function.
*/
static int
cache_key_equal(struct bs_cache_key * k1, struct bs_cache_key * k2)
static enum cache_status cache_key_equal_fast_path(struct bs_cache_key *k1,
struct bs_cache_key *k2) {
if (k1->version == k2->version &&
k1->ruby_platform == k2->ruby_platform &&
k1->compile_option == k2->compile_option &&
k1->ruby_revision == k2->ruby_revision && k1->size == k2->size) {
return (k1->mtime == k2->mtime) ? hit : stale;
}
return miss;
}

static int cache_key_equal_slow_path(struct bs_cache_key *current_key,
struct bs_cache_key *cached_key,
const VALUE input_data)
{
return (
k1->version == k2->version &&
k1->ruby_platform == k2->ruby_platform &&
k1->compile_option == k2->compile_option &&
k1->ruby_revision == k2->ruby_revision &&
k1->size == k2->size &&
k1->mtime == k2->mtime
);
bs_cache_key_digest(current_key, input_data);
return current_key->digest == cached_key->digest;
}

static int update_cache_key(struct bs_cache_key *current_key, int cache_fd, const char ** errno_provenance)
{
lseek(cache_fd, 0, SEEK_SET);
ssize_t nwrite = write(cache_fd, current_key, KEY_SIZE);
if (nwrite < 0) {
*errno_provenance = "update_cache_key:write";
return -1;
}

#ifdef HAVE_FDATASYNC
if (fdatasync(cache_fd) < 0) {
*errno_provenance = "update_cache_key:fdatasync";
return -1;
}
#endif

return 0;
}

/*
* Fills the cache key digest.
*/
static void bs_cache_key_digest(struct bs_cache_key *key,
const VALUE input_data) {
if (key->digest_set)
return;
key->digest = fnv1a_64(input_data);
key->digest_set = 1;
}

/*
Expand Down Expand Up @@ -393,6 +452,7 @@ open_current_file(char * path, struct bs_cache_key * key, const char ** errno_pr
key->ruby_revision = current_ruby_revision;
key->size = (uint64_t)statbuf.st_size;
key->mtime = (uint64_t)statbuf.st_mtime;
key->digest_set = false;

return fd;
}
Expand Down Expand Up @@ -436,7 +496,12 @@ open_cache_file(const char * path, struct bs_cache_key * key, const char ** errn
{
int fd, res;

fd = open(path, O_RDONLY | O_NOATIME);
if (readonly) {
fd = open(path, O_RDONLY | O_NOATIME);
} else {
fd = open(path, O_RDWR | O_NOATIME);
}

if (fd < 0) {
*errno_provenance = "bs_fetch:open_cache_file:open";
return CACHE_MISS;
Expand Down Expand Up @@ -681,7 +746,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args
int res, valid_cache = 0, exception_tag = 0;
const char * errno_provenance = NULL;

VALUE input_data; /* data read from source file, e.g. YAML or ruby source */
VALUE input_data = Qfalse; /* data read from source file, e.g. YAML or ruby source */
VALUE storage_data; /* compiled data, e.g. msgpack / binary iseq */
VALUE output_data; /* return data, e.g. ruby hash or loaded iseq */

Expand All @@ -699,20 +764,43 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args
cache_fd = open_cache_file(cache_path, &cached_key, &errno_provenance);
if (cache_fd == CACHE_MISS || cache_fd == CACHE_STALE) {
/* This is ok: valid_cache remains false, we re-populate it. */
if (RB_UNLIKELY(instrumentation_enabled)) {
rb_funcall(rb_mBootsnap, instrumentation_method, 2, cache_fd == CACHE_MISS ? sym_miss : sym_stale, path_v);
}
bs_instrumentation(cache_fd == CACHE_MISS ? sym_miss : sym_stale, path_v);
} else if (cache_fd < 0) {
exception_message = rb_str_new_cstr(cache_path);
goto fail_errno;
} else {
/* True if the cache existed and no invalidating changes have occurred since
* it was generated. */
valid_cache = cache_key_equal(&current_key, &cached_key);
if (RB_UNLIKELY(instrumentation_enabled)) {
if (!valid_cache) {
rb_funcall(rb_mBootsnap, instrumentation_method, 2, sym_stale, path_v);

switch(cache_key_equal_fast_path(&current_key, &cached_key)) {
case hit:
valid_cache = true;
break;
case miss:
valid_cache = false;
break;
case stale:
valid_cache = false;
if ((input_data = bs_read_contents(current_fd, current_key.size,
&errno_provenance)) == Qfalse) {
exception_message = path_v;
goto fail_errno;
}
valid_cache = cache_key_equal_slow_path(&current_key, &cached_key, input_data);
if (valid_cache) {
if (!readonly) {
if (update_cache_key(&current_key, cache_fd, &errno_provenance)) {
exception_message = path_v;
goto fail_errno;
}
}
bs_instrumentation(sym_revalidated, path_v);
}
break;
};

if (!valid_cache) {
bs_instrumentation(sym_stale, path_v);
}
}

Expand All @@ -726,7 +814,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args
else if (res == CACHE_UNCOMPILABLE) {
/* If fetch_cached_data returned `Uncompilable` we fallback to `input_to_output`
This happens if we have say, an unsafe YAML cache, but try to load it in safe mode */
if ((input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse){
if (input_data == Qfalse && (input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse) {
exception_message = path_v;
goto fail_errno;
}
Expand All @@ -745,7 +833,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args
/* Cache is stale, invalid, or missing. Regenerate and write it out. */

/* Read the contents of the source file into a buffer */
if ((input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse){
if (input_data == Qfalse && (input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse) {
exception_message = path_v;
goto fail_errno;
}
Expand All @@ -767,6 +855,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args
* We do however ignore any failures to persist the cache, as it's better
* to move along, than to interrupt the process.
*/
bs_cache_key_digest(&current_key, input_data);
atomic_write_cache_file(cache_path, &current_key, storage_data, &errno_provenance);

/* Having written the cache, now convert storage_data to output_data */
Expand Down Expand Up @@ -822,12 +911,16 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args
static VALUE
bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler)
{
if (readonly) {
return Qfalse;
}

struct bs_cache_key cached_key, current_key;
int cache_fd = -1, current_fd = -1;
int res, valid_cache = 0, exception_tag = 0;
const char * errno_provenance = NULL;

VALUE input_data; /* data read from source file, e.g. YAML or ruby source */
VALUE input_data = Qfalse; /* data read from source file, e.g. YAML or ruby source */
VALUE storage_data; /* compiled data, e.g. msgpack / binary iseq */

/* Open the source file and generate a cache key for it */
Expand All @@ -843,7 +936,28 @@ bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler)
} else {
/* True if the cache existed and no invalidating changes have occurred since
* it was generated. */
valid_cache = cache_key_equal(&current_key, &cached_key);
switch(cache_key_equal_fast_path(&current_key, &cached_key)) {
case hit:
valid_cache = true;
break;
case miss:
valid_cache = false;
break;
case stale:
valid_cache = false;
if ((input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse) {
goto fail;
}
valid_cache = cache_key_equal_slow_path(&current_key, &cached_key, input_data);
if (valid_cache) {
if (update_cache_key(&current_key, cache_fd, &errno_provenance)) {
goto fail;
}

bs_instrumentation(sym_revalidated, path_v);
}
break;
};
}

if (valid_cache) {
Expand All @@ -870,6 +984,7 @@ bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler)
if (!RB_TYPE_P(storage_data, T_STRING)) goto fail;

/* Write the cache key and storage_data to the cache directory */
bs_cache_key_digest(&current_key, input_data);
res = atomic_write_cache_file(cache_path, &current_key, storage_data, &errno_provenance);
if (res < 0) goto fail;

Expand Down
2 changes: 2 additions & 0 deletions ext/bootsnap/extconf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
require "mkmf"

if %w[ruby truffleruby].include?(RUBY_ENGINE)
have_func "fdatasync", "fcntl.h"

unless RUBY_PLATFORM.match?(/mswin|mingw|cygwin/)
append_cppflags ["_GNU_SOURCE"] # Needed of O_NOATIME
end
Expand Down
2 changes: 1 addition & 1 deletion test/compile_cache_key_format_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class CompileCacheKeyFormatTest < Minitest::Test

def test_key_version
key = cache_key_for_file(FILE)
exp = [4].pack("L")
exp = [5].pack("L")
assert_equal(exp, key[R[:version]])
end

Expand Down
Loading

0 comments on commit 4a91add

Please sign in to comment.