Skip to content

Commit

Permalink
Unpacker: key_cache option.
Browse files Browse the repository at this point in the history
Heavily inspired from ruby/json#675.

When parsing documents with the same structure repeatedly, a lot
of time can be saved by keeping a small cache of map keys encountered.

Using the existing `bench/bench.rb`, comparing with and without the
cache shows a 30% performance improvement:

```
Calculating -------------------------------------
       unpack-pooled    960.380k (± 1.4%) i/s -      4.865M in   5.066600s
    unpack-key-cache      1.245M (± 1.6%) i/s -      6.232M in   5.009060s

Comparison:
       unpack-pooled:   960379.8 i/s
    unpack-key-cache:  1244517.6 i/s - 1.30x  (± 0.00) faster
```

However, on the same benchmark, but with the cache filled with other
keys, the performance is notably degraded:

```
Calculating -------------------------------------
       unpack-pooled    926.849k (± 2.1%) i/s -      4.639M in   5.007333s
    unpack-key-cache    822.266k (± 2.4%) i/s -      4.113M in   5.004645s

Comparison:
       unpack-pooled:   926849.2 i/s
    unpack-key-cache:   822265.6 i/s - 1.13x  (± 0.00) slower
```

So this feature is powerful but situational.
  • Loading branch information
byroot committed Nov 13, 2024
1 parent 83a2600 commit ed4491f
Show file tree
Hide file tree
Showing 5 changed files with 176 additions and 8 deletions.
2 changes: 2 additions & 0 deletions doclib/msgpack/unpacker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ class Unpacker
# Supported options:
#
# * *:symbolize_keys* deserialize keys of Hash objects as Symbol instead of String
# * *:freeze* freeze the deserialized objects. Can allow string deduplication and some allocation elision.
# * *:key_cache* Enable caching of map keys, this can improve performance significantly if the same map keys are frequently encountered, but also degrade performance if that's not the case.
# * *:allow_unknown_ext* allow to deserialize ext type object with unknown type id as ExtensionValue instance. Otherwise (by default), unpacker throws UnknownExtTypeError.
#
# See also Buffer#initialize for other options.
Expand Down
127 changes: 127 additions & 0 deletions ext/msgpack/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -473,4 +473,131 @@ static inline VALUE msgpack_buffer_read_top_as_symbol(msgpack_buffer_t* b, size_
return rb_str_intern(msgpack_buffer_read_top_as_string(b, length, true, utf8));
}

// Hash keys are likely to be repeated, and are frozen.
// As such we can re-use them if we keep a cache of the ones we've seen so far,
// and save much more expensive lookups into the global fstring table.
// This cache implementation is deliberately simple, as we're optimizing for compactness,
// to be able to fit easily embeded inside msgpack_unpacker_t.
// As such, binary search into a sorted array gives a good tradeoff between compactness and
// performance.
#define MSGPACK_KEY_CACHE_CAPACITY 63

typedef struct msgpack_key_cache_t msgpack_key_cache_t;
struct msgpack_key_cache_t {
int length;
VALUE entries[MSGPACK_KEY_CACHE_CAPACITY];
};

static inline VALUE build_interned_string(const char *str, const long length)
{
# ifdef HAVE_RB_ENC_INTERNED_STR
return rb_enc_interned_str(str, length, rb_utf8_encoding());
# else
VALUE rstring = rb_utf8_str_new(str, length);
return rb_funcall(rb_str_freeze(rstring), s_uminus, 0);
# endif
}

static inline VALUE build_symbol(const char *str, const long length)
{
return rb_str_intern(build_interned_string(str, length));
}

static void rvalue_cache_insert_at(msgpack_key_cache_t *cache, int index, VALUE rstring)
{
MEMMOVE(&cache->entries[index + 1], &cache->entries[index], VALUE, cache->length - index);
cache->length++;
cache->entries[index] = rstring;
}

static inline int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
{
long rstring_length = RSTRING_LEN(rstring);
if (length == rstring_length) {
return memcmp(str, RSTRING_PTR(rstring), length);
} else {
return (int)(length - rstring_length);
}
}

static VALUE rstring_cache_fetch(msgpack_key_cache_t *cache, const char *str, const long length)
{
int low = 0;
int high = cache->length - 1;
int mid = 0;
int last_cmp = 0;

while (low <= high) {
mid = (high + low) >> 1;
VALUE entry = cache->entries[mid];
last_cmp = rstring_cache_cmp(str, length, entry);

if (last_cmp == 0) {
return entry;
} else if (last_cmp > 0) {
low = mid + 1;
} else {
high = mid - 1;
}
}

VALUE rstring = build_interned_string(str, length);

if (cache->length < MSGPACK_KEY_CACHE_CAPACITY) {
if (last_cmp > 0) {
mid += 1;
}

rvalue_cache_insert_at(cache, mid, rstring);
}
return rstring;
}

static VALUE rsymbol_cache_fetch(msgpack_key_cache_t *cache, const char *str, const long length)
{
int low = 0;
int high = cache->length - 1;
int mid = 0;
int last_cmp = 0;

while (low <= high) {
mid = (high + low) >> 1;
VALUE entry = cache->entries[mid];
last_cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));

if (last_cmp == 0) {
return entry;
} else if (last_cmp > 0) {
low = mid + 1;
} else {
high = mid - 1;
}
}

VALUE rsymbol = build_symbol(str, length);

if (cache->length < MSGPACK_KEY_CACHE_CAPACITY) {
if (last_cmp > 0) {
mid += 1;
}

rvalue_cache_insert_at(cache, mid, rsymbol);
}
return rsymbol;
}

static inline VALUE msgpack_buffer_read_top_as_interned_symbol(msgpack_buffer_t* b, msgpack_key_cache_t *cache, size_t length)
{
VALUE result = rsymbol_cache_fetch(cache, b->read_buffer, length);
_msgpack_buffer_consumed(b, length);
return result;
}

static inline VALUE msgpack_buffer_read_top_as_interned_string(msgpack_buffer_t* b, msgpack_key_cache_t *cache, size_t length)
{
VALUE result = rstring_cache_fetch(cache, b->read_buffer, length);
_msgpack_buffer_consumed(b, length);
return result;
}

#endif
34 changes: 30 additions & 4 deletions ext/msgpack/unpacker.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,20 @@ void msgpack_unpacker_mark_stack(msgpack_unpacker_stack_t* stack)
}
}

void msgpack_unpacker_mark_key_cache(msgpack_key_cache_t *cache)
{
int index;
for (index = 0; index < cache->length; index++) {
rb_gc_mark(cache->entries[index]);
}
}

void msgpack_unpacker_mark(msgpack_unpacker_t* uk)
{
rb_gc_mark(uk->last_object);
rb_gc_mark(uk->reading_raw);
msgpack_unpacker_mark_stack(&uk->stack);
msgpack_unpacker_mark_key_cache(&uk->key_cache);
/* See MessagePack_Buffer_wrap */
/* msgpack_buffer_mark(UNPACKER_BUFFER_(uk)); */
rb_gc_mark(uk->buffer_ref);
Expand Down Expand Up @@ -374,15 +383,32 @@ static inline int read_raw_body_begin(msgpack_unpacker_t* uk, int raw_type)
size_t length = uk->reading_raw_remaining;
if(length <= msgpack_buffer_top_readable_size(UNPACKER_BUFFER_(uk))) {
int ret;
if ((uk->optimized_symbol_ext_type && uk->symbol_ext_type == raw_type) || (uk->symbolize_keys && is_reading_map_key(uk))) {
if ((uk->optimized_symbol_ext_type && uk->symbol_ext_type == raw_type)) {
VALUE symbol = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length, raw_type != RAW_TYPE_BINARY);
ret = object_complete_symbol(uk, symbol);
} else if (is_reading_map_key(uk) && raw_type == RAW_TYPE_STRING) {
/* don't use zerocopy for hash keys but get a frozen string directly
* because rb_hash_aset freezes keys and it causes copying */
VALUE key;
if (uk->symbolize_keys) {
if (uk->use_key_cache) {
key = msgpack_buffer_read_top_as_interned_symbol(UNPACKER_BUFFER_(uk), &uk->key_cache, length);
} else {
key = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length, true);
}
ret = object_complete_symbol(uk, key);
} else {
if (uk->use_key_cache) {
key = msgpack_buffer_read_top_as_interned_string(UNPACKER_BUFFER_(uk), &uk->key_cache, length);
} else {
key = msgpack_buffer_read_top_as_string(UNPACKER_BUFFER_(uk), length, true, true);
}

ret = object_complete(uk, key);
}
} else {
bool will_freeze = uk->freeze;
if(raw_type == RAW_TYPE_STRING || raw_type == RAW_TYPE_BINARY) {
/* don't use zerocopy for hash keys but get a frozen string directly
* because rb_hash_aset freezes keys and it causes copying */
will_freeze = will_freeze || is_reading_map_key(uk);
VALUE string = msgpack_buffer_read_top_as_string(UNPACKER_BUFFER_(uk), length, will_freeze, raw_type == RAW_TYPE_STRING);
ret = object_complete(uk, string);
} else {
Expand Down
16 changes: 12 additions & 4 deletions ext/msgpack/unpacker.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ struct msgpack_unpacker_stack_t {
struct msgpack_unpacker_t {
msgpack_buffer_t buffer;
msgpack_unpacker_stack_t stack;
msgpack_key_cache_t key_cache;

VALUE self;
VALUE last_object;
Expand All @@ -66,10 +67,12 @@ struct msgpack_unpacker_t {

/* options */
int symbol_ext_type;
bool symbolize_keys;
bool freeze;
bool allow_unknown_ext;
bool optimized_symbol_ext_type;

bool use_key_cache: 1;
bool symbolize_keys: 1;
bool freeze: 1;
bool allow_unknown_ext: 1;
bool optimized_symbol_ext_type: 1;
};

#define UNPACKER_BUFFER_(uk) (&(uk)->buffer)
Expand Down Expand Up @@ -101,6 +104,11 @@ static inline void msgpack_unpacker_set_symbolized_keys(msgpack_unpacker_t* uk,
uk->symbolize_keys = enable;
}

static inline void msgpack_unpacker_set_key_cache(msgpack_unpacker_t* uk, bool enable)
{
uk->use_key_cache = enable;
}

static inline void msgpack_unpacker_set_freeze(msgpack_unpacker_t* uk, bool enable)
{
uk->freeze = enable;
Expand Down
5 changes: 5 additions & 0 deletions ext/msgpack/unpacker_class.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ static VALUE eUnknownExtTypeError;
static VALUE mTypeError; // obsoleted. only for backward compatibility. See #86.

static VALUE sym_symbolize_keys;
static VALUE sym_key_cache;
static VALUE sym_freeze;
static VALUE sym_allow_unknown_ext;

Expand Down Expand Up @@ -128,6 +129,9 @@ VALUE MessagePack_Unpacker_initialize(int argc, VALUE* argv, VALUE self)
if(options != Qnil) {
VALUE v;

v = rb_hash_aref(options, sym_key_cache);
msgpack_unpacker_set_key_cache(uk, RTEST(v));

v = rb_hash_aref(options, sym_symbolize_keys);
msgpack_unpacker_set_symbolized_keys(uk, RTEST(v));

Expand Down Expand Up @@ -413,6 +417,7 @@ void MessagePack_Unpacker_module_init(VALUE mMessagePack)
eUnknownExtTypeError = rb_define_class_under(mMessagePack, "UnknownExtTypeError", eUnpackError);

sym_symbolize_keys = ID2SYM(rb_intern("symbolize_keys"));
sym_key_cache = ID2SYM(rb_intern("key_cache"));
sym_freeze = ID2SYM(rb_intern("freeze"));
sym_allow_unknown_ext = ID2SYM(rb_intern("allow_unknown_ext"));

Expand Down

0 comments on commit ed4491f

Please sign in to comment.