diff --git a/docs/implementation_details.adoc b/docs/implementation_details.adoc
new file mode 100644
index 00000000..483ced7b
--- /dev/null
+++ b/docs/implementation_details.adoc
@@ -0,0 +1,157 @@
+
+== Implementation Details
+
+This section describes some of the internal implementation details to assist contributors.
+The details here are not required to use the `cib` library.
+
+=== Run Length Encoded Message Indices
+
+Switching to the RLE indices is as simple as converting your `msg::indexed_service` to a
+`msg::rle_indexed_service`.
+
+The initial building of the mapping indices proceeds in the same way as
+for the normal indices, where a series of entries in an index is generated
+and the callbacks that match are encoded into a `stdx::bitset`.
+
+However, once this initial representation is built, we then take this and
+perform additional work (at compile time) to encode the bitsets as RLE
+data, and store in the index just an offset into the blob of RLE data
+rather than the bitset itself.
+
+This is good for message maps that contain a large number of handlers as
+we trade off storage space for some decoding overhead.
+
+Once encoded, the normal operation of the lookup process at run time
+proceeds and a set of candidate matches is collected. These are then
+_intersected_ directly from the RLE data and the final set of callbacks is
+invoked without needing to materialise any of the underlying bitsets.
+
+==== RLE Data Encoding
+
+There are several options for encoding the bitset into an RLE pattern, many of which will result
+in smaller size, but a lot of bit-shifting to extract data. We have chosen to trade off encoded
+size for faster decoding, as it is likely the handling of the RLE data and index lookup will be
+in the critical path for system state changes.
+
+The encoding chosen is simply the number of consecutive bits of `0`​s or `1`​s.
+
+Specifics:
+
+- The encoding runs from the least significant bit to most significant bit
+- The number of consecutive bits is stored as a `std::byte` and ranges `0...255`
+- The first byte of the encoding counts the number of `0` bits
+- If there are more than 255 consecutive identical bits, they are encoded in blocks
+  of at most 255, separated by a `0` byte indicating a zero-length run of the opposite bit.
+
+[ditaa, format="svg", scale=1.5]
+----
+  Bitset            RLE Data
+/-------------+    +---+
+| 0b0000`0000 |--->| 8 |
++-------------/    +---+
+
+/-------------+    +---+---+
+| 0b1111`1111 |--->| 0 | 8 |
++-------------/    +---+---+
+
+/-------------+    +---+---+---+
+| 0b0000`0001 |--->| 0 | 1 | 7 |
++-------------/    +---+---+---+
+
+/-------------+    +---+---+---+---+
+| 0b1000`0011 |--->| 0 | 2 | 5 | 1 |
++-------------/    +---+---+---+---+
+
+/-------------+    +---+---+---+---+
+| 0b1100`1110 |--->| 1 | 3 | 2 | 2 |
++-------------/    +---+---+---+---+
+
+
+/------------------------------+    +---+---+-----+---+-----+---+-----+---+-----+
+| 1000 `0`s and one `1` in LSB |--->| 0 | 1 | 255 | 0 | 255 | 0 | 255 | 0 | 235 |
++------------------------------/    +---+---+-----+---+-----+---+-----+---+-----+
+----
+
+The `msg::rle_indexed_builder` will go through a process to take the indices and
+their bitset data and build a single blob of RLE encoded data for all indices, stored in
+an instance of a `msg::detail::rle_storage`. It also generates a set of
+`msg::detail::rle_index` entries for each of the index entries that maps the original bitmap
+to a location in the shared storage blob.
+
+The `rle_storage` object contains a simple array of all RLE data bytes. The `rle_index`
+contains a simple offset into that array. We compute the smallest size that can contain the
+offset to avoid wasted storage and use that.
+
+NOTE: The specific `rle_storage` and `rle_index`​s are locked together using a unique type
+so that the `rle_index` can not be used with the wrong `rle_storage` object.
+
+When building the shared blob, the encoder will attempt to reduce the storage size by finding
+and reusing repeated patterns in the RLE data.
+
+The final `msg::indexed_handler` contains an instance of the `msg::rle_indices` which contains
+both the storage and the maps referring to all the `rle_index` objects.
+
+This means that the final compile time data generated consists of:
+
+- The Message Map lookups as per the normal implementation, however they store a simple offset
+  rather than a bitset.
+- The blob of all RLE bitset data for all indices in the message handling map
+
+==== Runtime Handling
+
+The `msg::indexed_handler` implementation will delegate the mapping call for an incoming
+message down to the `msg::rle_indices` implementation. It will further call into its
+storage indices and match to the set of `rle_index` values for each mapping index.
+
+This set of `rle_index` values (which are just offsets) are then converted to instances of
+a `msg::detail::rle_decoder` by the `rle_storage`. This converts the offset into a
+pointer to the sequence of `std::byte`​s for the RLE encoding.
+
+All the collected `rle_decoders` from the various maps in the set of indices are then passed
+to an instance of the `msg::detail::rle_intersect` object and returned from the `rle_indices`
+call operator.
+
+The `rle_decoder` provides a single-use enumerator that will step over the groups of
+`0`​s or `1`​s, providing a way to advance through them by arbitrary increments.
+
+The `rle_intersect` implementation wraps the variadic set of `rle_decoder`​s so that
+the caller can iterate through all `1`​s, calling the appropriate callback as it goes.
+
+===== Efficient Iteration of Bits
+
+The `msg::detail::rle_decoder::chunk_enumerator` provides a way to step through the RLE
+data for the encoded bitset an arbitrary number of bits at a time. It does this by exposing
+the current number of bits of consecutive value.
+
+This is presented so that it is possible to efficiently find:
+
+- the longest run of `0`​s
+- or, if none, the shortest run of `1`​s.
+
+Remember that we are trying to compute the intersection of all the encoded bitsets, so
+where all bitsets have a `1`, we call the associated callback, where any of the bitsets
+has a `0`, we skip that callback.
+
+So the `chunk_enumerator` will return a signed 16 bit (at least) value indicating:
+
+- *negative* value - the number of `0`​s
+- *positive* value - the number of `1`​s
+- *zero* when past the end (special case)
+
+The `rle_intersect` will initialise an array of `rle_decoder::chunk_enumerators`
+when it is asked to run a lambda for each `1` bit using the `for_each()` method.
+
+This list is then searched for the _minimum_ value of chunk size. This will either
+be the most negative value, and so the longest run of `0`​s, or the smallest
+number of `1`​s, representing the next set of bits that are set in all bitsets.
+
+The `for_each()` method will then advance past all the `0`​s, or execute the lambda
+for that many set bits, until it has consumed all bits in the encoded bitsets.
+
+This means that the cost of intersection of `N` indices is a number of pointers and
+a small amount of state for tracking the current run of bits and their type for each index.
+
+There is no need to materialise a full bitset at all. This can be quite a memory saving if
+there are a large number of callbacks. The trade-off, of course, is more complex iteration
+of bits to discover the callbacks to run.
+
diff --git a/docs/index.adoc b/docs/index.adoc
index 4e255a29..15ec6dfa 100644
--- a/docs/index.adoc
+++ b/docs/index.adoc
@@ -11,3 +11,4 @@ include::flows.adoc[]
 include::interrupts.adoc[]
 include::match.adoc[]
 include::message.adoc[]
+include::implementation_details.adoc[]
diff --git a/docs/message.adoc b/docs/message.adoc
index 4ef63767..a8d44d27 100644
--- a/docs/message.adoc
+++ b/docs/message.adoc
@@ -181,7 +181,7 @@ cib::service<my_service>->handle(my_message{"my field"_field = 0x80});
 
 Notice in this case that our callback is defined with a matcher that always
 matches, but also that the field in `my_message` has a matcher that requires it
-to equal `0x80`. Therefore handling the following message will not call the
+to equal `0x80`. Therefore, handling the following message will not call the
 callback:
 [source,cpp]
 ----
@@ -242,7 +242,12 @@ minimal effort at runtime.
 For each field in the `msg::index_spec`, we build a map from field values to
 bitsets, where the values in the bitsets represent callback indices.
 
-NOTE: The bitsets may be run-length encoded: this is a work in progress.
+NOTE: The bitsets may be run-length encoded by using the `rle_indexed_service`
+in place of the `indexed_service`. This may be useful if you have limited space
+and/or a large set of possible callbacks.
+See xref:implementation_details.adoc#_run_length_encoded_message_indices[Run Length
+Encoding Implementation Details].
+
 
 Each `indexed_callback` has a matcher that may be an
 xref:match.adoc#_boolean_algebra_with_matchers[arbitrary Boolean matcher
@@ -442,4 +447,4 @@ compile time.
 
 For each callback, we now run the remaining matcher expression to deal with any
 unindexed but constrained fields, and call the callback if it passes. Bob's your
-uncle.
+uncle.
\ No newline at end of file
diff --git a/include/msg/detail/rle_codec.hpp b/include/msg/detail/rle_codec.hpp
index f0fe5019..b54d25f4 100644
--- a/include/msg/detail/rle_codec.hpp
+++ b/include/msg/detail/rle_codec.hpp
@@ -16,29 +16,112 @@
 
 namespace msg::detail {
 
-template <std::size_t N> struct smallest_storage {
-    // select a minimum sized type for indexing into the RLE data blob
-    static CONSTEVAL auto select_index_storage() {
-        if constexpr (N <= std::numeric_limits<std::uint8_t>::max()) {
-            return std::uint8_t{};
-        } else if constexpr (N <= std::numeric_limits<std::uint16_t>::max()) {
-            return std::uint16_t{};
-        } else if constexpr (N <= std::numeric_limits<std::uint32_t>::max()) {
-            return std::uint32_t{};
-        } else {
-            return std::size_t{};
+template <std::size_t N>
+using smallest_storage_type = decltype(stdx::detail::select_storage<N, void>());
+
+// Captures RLE data for decoding and provides a mechanism to
+// get a single-use enumerator to step through bits in the encoded
+// bitset.
+template <typename BitSetType> class rle_decoder {
+  public:
+    using bitset_type = BitSetType;
+    constexpr static auto num_bits = BitSetType::size();
+
+    constexpr explicit rle_decoder(std::byte const *start_rle_data)
+        : rle_data{start_rle_data} {}
+
+    // A type to allow "iteration" over the RLE encoded data in a way that
+    // can be efficiently used to decode the runs of bits for intersection.
+    //
+    // This will return
+    //  - a negative number for the number of consecutive 0's
+    //  - a positive number for the number of consecutive 1's
+    //  - zero if there are no more bits to decode
+    //
+    // Can traverse only a single time.
+    class chunk_enumerator {
+      public:
+        constexpr static auto num_bits = BitSetType::size();
+
+        constexpr chunk_enumerator() : rle_data{nullptr} {}
+
+        constexpr explicit chunk_enumerator(std::byte const *start_rle_data)
+            : rle_data{start_rle_data} {
+            next_chunk();
+        }
+
+        // Get the current chunk of contiguous bits: -ve values are runs of 0s,
+        // +ve values are runs of 1s. Range is -255...255; 0 when finished.
+        [[nodiscard]] constexpr auto bit_chunk() const -> std::int_fast16_t {
+            return bit_value ? current_run : -current_run;
+        }
+
+        // Advance the bit chunk by `bits` number of bits. This might consume
+        // only a portion of the remaining bits in the chunk, skip to the next
+        // chunk or skip over multiple chunks. The current bit position in the
+        // data stream must be provided by caller to avoid reading past end of
+        // RLE data. We rely on the caller to avoid needing multiple bit
+        // counters when there are multiple chunk_enumerators in play.
+        //
+        // Returns the new current_bit (or num_bits if we pass the end of the
+        // data) after the bits are consumed.
+        constexpr auto advance(std::size_t bits, std::size_t current_bit)
+            -> std::size_t {
+            while (bits > 0 && current_bit < num_bits) {
+                // more available than we are consuming?
+                if (bits < current_run) {
+                    current_run -= static_cast<std::uint8_t>(bits);
+                    return current_bit + bits;
+                }
+
+                // consume all the currently available bits
+                // and get the next chunk
+                bits -= current_run;
+                current_bit += current_run;
+                // only load next chunk if we are not at the end
+                if (current_bit < num_bits) {
+                    next_chunk();
+                } else {
+                    // no more bits.
+                    current_run = 0;
+                }
+            }
+
+            return current_bit;
+        }
+
+      private:
+        std::byte const *rle_data;
+        // initial load values so the first next_chunk() call in constructor
+        // starts in the 0's state (pretend we just did some 1s).
+        std::uint8_t current_run{0};
+        bool bit_value{true};
+
+        constexpr void next_chunk() {
+            // zero-length count? (leading 1s, or continuation of a > 255 run)
+            if (*rle_data == std::byte{0}) {
+                // keep same bit_value and skip this encoded byte
+                ++rle_data;
+            } else {
+                // invert bit_value to generate run of opposite bits
+                bit_value = not bit_value;
+            }
+            current_run = static_cast<std::uint8_t>(*rle_data++);
         }
+    };
+
+    [[nodiscard]] constexpr auto make_chunk_enumerator() const
+        -> chunk_enumerator {
+        return chunk_enumerator{rle_data};
     }
 
-    using type = decltype(select_index_storage());
+  private:
+    std::byte const *rle_data;
 };
 
-template <std::size_t N>
-using smallest_storage_type = typename smallest_storage<N>::type;
-
 template <typename BitSetType> struct rle_codec {
     using bitset_type = BitSetType;
-    constexpr static auto num_bits = BitSetType::size();
+    constexpr static auto const num_bits = BitSetType::size();
 
     // assume worst case of each bitmap being alternating bit values
     using max_rle_data_type = stdx::cx_vector<std::byte, num_bits * 2>;
@@ -48,6 +131,7 @@ template <typename BitSetType> struct rle_codec {
         max_rle_data_type data{};
         std::size_t count{0};
         bool last{false};
+
         for (std::size_t bit{0}; bit < num_bits; ++bit) {
             if (bitset[bit] != last) {
                 data.push_back(static_cast<std::byte>(count));
@@ -71,27 +155,140 @@ template <typename BitSetType> struct rle_codec {
     constexpr static auto decode(std::byte const *rle_data) -> bitset_type {
         bitset_type result{};
 
+        auto decoder =
+            rle_decoder<BitSetType>{rle_data}.make_chunk_enumerator();
         std::size_t bit{0};
-        bool bit_val{false};
 
-        // accumulate the correct total number of bits
+        while (bit < decoder.num_bits) {
+            auto chunk_bits = decoder.bit_chunk();
+            if (chunk_bits < 0) {
+                // skip 0's
+                bit =
+                    decoder.advance(static_cast<std::size_t>(-chunk_bits), bit);
+            } else {
+                auto temp_bit = bit;
+                bit =
+                    decoder.advance(static_cast<std::size_t>(chunk_bits), bit);
+
+                // add the 1s
+                while (chunk_bits-- > 0) {
+                    result.set(temp_bit++);
+                }
+            }
+        }
+
+        return result;
+    }
+};
+
+template <typename BitSetType,
+          std::same_as<rle_decoder<BitSetType>>... Decoders>
+    requires(sizeof...(Decoders) > 0)
+struct rle_intersect {
+    using bitset_type = BitSetType;
+    using decoder_type = rle_decoder<BitSetType>;
+    using chunk_type = typename decoder_type::chunk_enumerator;
+    constexpr static auto const num_bits = decoder_type::num_bits;
+    constexpr static auto const num_decoders = sizeof...(Decoders);
+
+    std::array<decoder_type, num_decoders> decoder_list;
+
+    constexpr explicit rle_intersect(Decoders &&...decoders)
+        : decoder_list{std::forward<Decoders>(decoders)...} {}
+
+    // Iterate over the set bits of the intersection, passing each bit number
+    // to f. If f returns true, stop and return true to indicate early exit;
+    // otherwise return false once every bit has been traversed.
+    template <typename F> constexpr auto for_each_until(F &&f) const -> bool {
+        // allocate a set of chunk_enumerators for each decoder
+        // so that we can traverse the bit set intersection
+        stdx::cx_vector<chunk_type, num_decoders> chunks{};
+        for (auto &d : decoder_list) {
+            chunks.push_back(d.make_chunk_enumerator());
+        }
+
+        // advance all chunks by a number of bits and return the new
+        // current bit position
+        auto advance = [&](std::size_t bits,
+                           std::size_t current_bit) -> std::size_t {
+            std::size_t new_current{current_bit};
+            for (auto &c : chunks) {
+                new_current = c.advance(bits, current_bit);
+            }
+            return new_current;
+        };
+
+        // comparison of chunk bit counts
+        auto min_chunk = [](chunk_type const &a, chunk_type const &b) -> bool {
+            return a.bit_chunk() < b.bit_chunk();
+        };
+
+        std::size_t bit{0};
         while (bit < num_bits) {
-            // get the next number of consecutive bits
-            auto cur_bits = static_cast<std::size_t>(*rle_data++);
-            if (bit_val) {
-                // write cur_bits of 1s
-                while (cur_bits-- > 0) {
-                    result.set(bit++);
+            // the min "bit_chunk" item in the chunk list is the smallest run
+            // length chunk value. If that value is -ve this is a run of zeros,
+            // and so we can immediately consume that many bits because
+            // 0 and X = 0. Otherwise, if +ve, it will be the smallest number of
+            // consecutive 1s, and all other chunks will contain more 1s, and so
+            // we can add that many ones to the result.
+            //
+            // NOTE: zero length chunks are ignored as the bits counter will end
+            //       the loop as each rle_decoder finishes at the same time
+            auto min_chunk_bits =
+                (*std::min_element(chunks.begin(), chunks.end(), min_chunk))
+                    .bit_chunk();
+
+            if (min_chunk_bits > 0) {
+                // this will be the minimum number of 1s and all other
+                // chunks must also be 1s.
+                auto temp_bit = bit;
+                bit = advance(static_cast<std::size_t>(min_chunk_bits), bit);
+                while (min_chunk_bits-- > 0) {
+                    // call F, but abort if it indicates abort requested
+                    if (f(temp_bit++)) {
+                        return true; // early abort
+                    }
                 }
             } else {
-                // skip cur_bits of 0s
-                bit += cur_bits;
+                // otherwise it is the longest run of 0s: no need to invoke F
+                bit = advance(static_cast<std::size_t>(-min_chunk_bits), bit);
             }
-            bit_val = !bit_val;
         }
+        return false; // full traversal
+    }
+
+    template <typename F> constexpr auto for_each(F &&f) const -> F {
+        for_each_until([&](auto i) {
+            f(i);
+            return false;
+        });
+        return std::forward<F>(f);
+    }
+
+    template <typename F>
+    friend constexpr auto for_each(F &&f, rle_intersect const &intersect) -> F {
+        return intersect.for_each(std::forward<F>(f));
+    }
+
+    [[nodiscard]] constexpr auto any() const -> bool {
+        // iterate until we find at least a single bit.
+        return for_each_until([](auto /*unused*/) { return true; });
+    }
+
+    [[nodiscard]] constexpr auto none() const -> bool { return not any(); }
+
+    [[nodiscard]] constexpr auto get_bitset() const -> bitset_type {
+        bitset_type result{};
+
+        for_each([&](auto i) { result.set(i); });
 
         return result;
     }
 };
 
+// at least 1 decoder is required
+template <typename Decoder, std::same_as<Decoder>... Others>
+rle_intersect(Decoder d, Others... others)
+    -> rle_intersect<typename Decoder::bitset_type, Decoder, Others...>;
+
 } // namespace msg::detail
diff --git a/include/msg/rle_indexed_builder.hpp b/include/msg/rle_indexed_builder.hpp
index 3de8ffcc..d042ed17 100644
--- a/include/msg/rle_indexed_builder.hpp
+++ b/include/msg/rle_indexed_builder.hpp
@@ -80,10 +80,17 @@ template <typename LockType, std::size_t DataLength> struct rle_storage {
         return codec_type::decode(std::next(data.begin(), idx.offset));
     }
 
+    template <typename FieldType, typename BitSetType, typename OffsetType>
+    constexpr auto
+    decode(rle_index<LockType, FieldType, BitSetType, OffsetType> idx) const
+        -> rle_decoder<BitSetType> {
+        return rle_decoder<BitSetType>{std::next(data.begin(), idx.offset)};
+    }
+
     storage_type data;
 };
 
-// Build the encoded RLE data with a max lenght of MaxDataLen
+// Build the encoded RLE data with a max length of MaxDataLen
 // Take the opportunity to reuse byte sequences where possible.
 template <std::size_t MaxDataLength> struct rle_storage_builder {
     using offset_type = detail::smallest_storage_type<MaxDataLength>;
diff --git a/include/msg/rle_indexed_handler.hpp b/include/msg/rle_indexed_handler.hpp
index d6952f43..bbd42af1 100644
--- a/include/msg/rle_indexed_handler.hpp
+++ b/include/msg/rle_indexed_handler.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <msg/detail/indexed_handler_common.hpp>
+#include <msg/detail/rle_codec.hpp>
 
 #include <stdx/compiler.hpp>
 
@@ -12,10 +13,9 @@ struct rle_indices : IndicesT... {
         : IndicesT{index_args}..., storage{data} {}
 
     constexpr auto operator()(auto const &data) const {
-        // TODO: efficient bitand that doesn't need to materialise full bitset
-
-        // use the index to decode the bitset from storage
-        return (storage.get(this->IndicesT::operator()(data)) & ...);
+        // proxy to allow intersection without materializing a full bitset.
+        return detail::rle_intersect{
+            storage.decode(this->IndicesT::operator()(data))...};
     }
 
     // index entries will map into this storage to decode RLE data
diff --git a/test/msg/detail/rle_codec.cpp b/test/msg/detail/rle_codec.cpp
index c28fb72a..8e7813bd 100644
--- a/test/msg/detail/rle_codec.cpp
+++ b/test/msg/detail/rle_codec.cpp
@@ -1,5 +1,7 @@
 #include <msg/detail/rle_codec.hpp>
 
+#include <stdx/compiler.hpp>
+
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/matchers/catch_matchers_range_equals.hpp>
 
@@ -9,7 +11,7 @@
 using Catch::Matchers::RangeEquals;
 
 namespace {
-auto operator"" _b(unsigned long long int v) -> std::byte {
+CONSTEVAL auto operator"" _b(unsigned long long v) -> std::byte {
     return static_cast<std::byte>(v);
 }
 } // namespace
@@ -24,45 +26,206 @@ TEST_CASE("rle_codec can encode all zeros", "[rle_codec]") {
     using bs = stdx::bitset<10>;
     using codec = rle_codec<bs>;
 
-    CHECK_THAT(codec::encode(bs{}),
-               RangeEquals(std::to_array<std::byte>({10_b})));
+    CHECK_THAT(codec::encode(bs{}), RangeEquals(std::array{10_b}));
 }
 
 TEST_CASE("rle_codec can encode all ones", "[rle_codec]") {
     using bs = stdx::bitset<12>;
     using codec = rle_codec<bs>;
 
-    CHECK_THAT(codec::encode(~bs{}),
-               RangeEquals(std::to_array<std::byte>({0_b, 12_b})));
+    CHECK_THAT(codec::encode(~bs{}), RangeEquals(std::array{0_b, 12_b}));
 }
 
 TEST_CASE("rle_codec can encode all zeros for large bit count", "[rle_codec]") {
     using bs = stdx::bitset<512>;
     using codec = rle_codec<bs>;
 
-    CHECK_THAT(codec::encode(bs{}), RangeEquals(std::to_array<std::byte>(
-                                        {255_b, 0_b, 255_b, 0_b, 2_b})));
+    CHECK_THAT(codec::encode(bs{}),
+               RangeEquals(std::array{255_b, 0_b, 255_b, 0_b, 2_b}));
 }
 
 TEST_CASE("rle_codec can encode all ones for large bit count", "[rle_codec]") {
     using bs = stdx::bitset<512>;
     using codec = rle_codec<bs>;
 
-    CHECK_THAT(codec::encode(~bs{}), RangeEquals(std::to_array<std::byte>(
-                                         {0_b, 255_b, 0_b, 255_b, 0_b, 2_b})));
+    CHECK_THAT(codec::encode(~bs{}),
+               RangeEquals(std::array{0_b, 255_b, 0_b, 255_b, 0_b, 2_b}));
 }
 
 TEST_CASE("rle_codec can encode alternating bits", "[rle_codec]") {
     using bs = stdx::bitset<8>;
     using codec = rle_codec<bs>;
 
-    CHECK_THAT(codec::encode(bs{stdx::place_bits, 0, 2, 4, 6}),
-               RangeEquals(std::to_array<std::byte>(
-                   {0_b, 1_b, 1_b, 1_b, 1_b, 1_b, 1_b, 1_b, 1_b})));
+    CHECK_THAT(
+        codec::encode(bs{stdx::place_bits, 0, 2, 4, 6}),
+        RangeEquals(std::array{0_b, 1_b, 1_b, 1_b, 1_b, 1_b, 1_b, 1_b, 1_b}));
 
     CHECK_THAT(codec::encode(bs{stdx::place_bits, 1, 3, 5, 7}),
-               RangeEquals(std::to_array<std::byte>(
-                   {1_b, 1_b, 1_b, 1_b, 1_b, 1_b, 1_b, 1_b})));
+               RangeEquals(std::array{1_b, 1_b, 1_b, 1_b, 1_b, 1_b, 1_b, 1_b}));
+}
+
+TEST_CASE("rle_decoder can iterate chunks 1", "[rle_codec]") {
+    using bs = stdx::bitset<8>;
+    using decoder = rle_decoder<bs>;
+
+    auto rle = std::array{1_b, 1_b, 1_b, 1_b, 1_b, 1_b, 1_b, 1_b};
+
+    decoder dec{rle.data()};
+    auto chunk = dec.make_chunk_enumerator();
+    std::size_t bit{0};
+
+    REQUIRE(chunk.bit_chunk() == -1);
+
+    bit = chunk.advance(1, bit);
+    REQUIRE(chunk.bit_chunk() == 1);
+    REQUIRE(bit == 1);
+
+    bit = chunk.advance(1, bit);
+    REQUIRE(chunk.bit_chunk() == -1);
+    REQUIRE(bit == 2);
+
+    bit = chunk.advance(1, bit);
+    REQUIRE(chunk.bit_chunk() == 1);
+    REQUIRE(bit == 3);
+
+    bit = chunk.advance(1, bit);
+    REQUIRE(chunk.bit_chunk() == -1);
+    REQUIRE(bit == 4);
+
+    bit = chunk.advance(1, bit);
+    REQUIRE(chunk.bit_chunk() == 1);
+    REQUIRE(bit == 5);
+
+    bit = chunk.advance(1, bit);
+    REQUIRE(chunk.bit_chunk() == -1);
+    REQUIRE(bit == 6);
+
+    bit = chunk.advance(1, bit);
+    REQUIRE(chunk.bit_chunk() == 1);
+    REQUIRE(bit == 7);
+
+    bit = chunk.advance(1, bit);
+    REQUIRE(chunk.bit_chunk() == 0);
+    REQUIRE(bit == 8);
+}
+
+TEST_CASE("rle_decoder can iterate chunks 2", "[rle_codec]") {
+    using bs = stdx::bitset<8>;
+    using decoder = rle_decoder<bs>;
+
+    auto rle = std::array{0_b, 2_b, 2_b, 2_b, 0_b, 2_b};
+    decoder dec{rle.data()};
+    auto chunk = dec.make_chunk_enumerator();
+    std::size_t bit{0};
+
+    REQUIRE(chunk.bit_chunk() == 2);
+
+    bit = chunk.advance(2, bit);
+    REQUIRE(chunk.bit_chunk() == -2);
+    REQUIRE(bit == 2);
+
+    bit = chunk.advance(2, bit);
+    REQUIRE(chunk.bit_chunk() == 2);
+    REQUIRE(bit == 4);
+
+    bit = chunk.advance(2, bit);
+    REQUIRE(chunk.bit_chunk() == 2);
+    REQUIRE(bit == 6);
+
+    bit = chunk.advance(2, bit);
+    REQUIRE(chunk.bit_chunk() == 0);
+    REQUIRE(bit == 8);
+}
+
+TEST_CASE("rle_decoder can iterate chunks 3", "[rle_codec]") {
+    using bs = stdx::bitset<1000>;
+    using decoder = rle_decoder<bs>;
+
+    auto rle = std::array{255_b, 0_b, 255_b, 0_b, 255_b, 235_b};
+    decoder dec{rle.data()};
+    auto chunk = dec.make_chunk_enumerator();
+
+    std::size_t bit{0};
+
+    REQUIRE(chunk.bit_chunk() == -255);
+
+    bit = chunk.advance(510, bit);
+    REQUIRE(chunk.bit_chunk() == -255);
+    REQUIRE(bit == 510);
+
+    bit = chunk.advance(255, bit);
+    REQUIRE(chunk.bit_chunk() == 235);
+    REQUIRE(bit == 765);
+
+    bit = chunk.advance(235, bit);
+    REQUIRE(chunk.bit_chunk() == 0);
+    REQUIRE(bit == 1000);
+}
+
+TEST_CASE("rle_decoder can iterate sub-chunks", "[rle_codec]") {
+    using bs = stdx::bitset<8>;
+    using decoder = rle_decoder<bs>;
+
+    auto rle = std::array{0_b, 2_b, 2_b, 2_b, 0_b, 2_b};
+    decoder dec{rle.data()};
+    auto chunk = dec.make_chunk_enumerator();
+    std::size_t bit{0};
+
+    REQUIRE(chunk.bit_chunk() == 2);
+
+    bit = chunk.advance(1, bit);
+    REQUIRE(chunk.bit_chunk() == 1);
+    REQUIRE(bit == 1);
+
+    bit = chunk.advance(1, bit);
+    REQUIRE(chunk.bit_chunk() == -2);
+    REQUIRE(bit == 2);
+
+    bit = chunk.advance(1, bit);
+    REQUIRE(chunk.bit_chunk() == -1);
+    REQUIRE(bit == 3);
+
+    bit = chunk.advance(1, bit);
+    REQUIRE(chunk.bit_chunk() == 2);
+    REQUIRE(bit == 4);
+
+    bit = chunk.advance(1, bit);
+    REQUIRE(chunk.bit_chunk() == 1);
+    REQUIRE(bit == 5);
+
+    bit = chunk.advance(1, bit);
+    REQUIRE(chunk.bit_chunk() == 2);
+    REQUIRE(bit == 6);
+
+    bit = chunk.advance(1, bit);
+    REQUIRE(chunk.bit_chunk() == 1);
+    REQUIRE(bit == 7);
+
+    bit = chunk.advance(1, bit);
+    // we are now out of bits and should get zero back
+    // and not attempt to read beyond end of rle data
+    REQUIRE(chunk.bit_chunk() == 0);
+    REQUIRE(bit == 8);
+
+    bit = chunk.advance(10, bit);
+    REQUIRE(chunk.bit_chunk() == 0);
+    REQUIRE(bit == 8);
+}
+
+TEST_CASE("rle_decoder stops after num_bits", "[rle_codec]") {
+    using bs = stdx::bitset<8>;
+    using decoder = rle_decoder<bs>;
+
+    // contains extra data which should be ignored
+    auto rle = std::array{0_b, 2_b, 2_b, 2_b, 0_b, 2_b, 255_b, 255_b};
+    decoder dec{rle.data()};
+    auto chunk = dec.make_chunk_enumerator();
+    std::size_t bit{0};
+
+    REQUIRE(chunk.bit_chunk() == 2);
+    bit = chunk.advance(100, bit);
+    REQUIRE(chunk.bit_chunk() == 0);
+    REQUIRE(bit == 8);
 }
 
 TEST_CASE("rle_codec can decode", "[rle_codec]") {
@@ -79,4 +242,100 @@ TEST_CASE("rle_codec can decode", "[rle_codec]") {
     CHECK(codec::decode(codec::encode(bone).cbegin()) == bone);
 }
 
+TEST_CASE("rle_codec can decode multiple zero pads", "[rle_codec]") {
+    using bs = stdx::bitset<2000>;
+    using codec = rle_codec<bs>;
+    auto const b1 = bs{stdx::place_bits, 1, 2, 3, 499, 1999};
+    auto const b2 = bs{stdx::place_bits, 2, 3, 5, 7, 11, 13, 17, 400, 1999};
+    auto const bzero = bs{};
+    auto const bone = ~bs{};
+
+    CHECK(codec::decode(codec::encode(b1).cbegin()) == b1);
+    CHECK(codec::decode(codec::encode(b2).cbegin()) == b2);
+    CHECK(codec::decode(codec::encode(bzero).cbegin()) == bzero);
+    CHECK(codec::decode(codec::encode(bone).cbegin()) == bone);
+}
+
+TEST_CASE("rle_intersect works with 1 rle bitset", "[rle_intersect]") {
+    using bs = stdx::bitset<8>;
+    using codec = rle_codec<bs>;
+    using decoder_t = rle_decoder<bs>;
+
+    auto rle_1 = codec::encode(bs{"00011100"});
+
+    auto dec_1 = decoder_t{rle_1.cbegin()};
+
+    auto intersection =
+        rle_intersect{std::forward<decoder_t>(dec_1)}.get_bitset();
+
+    CHECK(intersection == bs{"00011100"});
+}
+
+TEST_CASE("rle_intersect works with 2 rle bitsets", "[rle_intersect]") {
+    using bs = stdx::bitset<8>;
+    using codec = rle_codec<bs>;
+    using decoder_t = rle_decoder<bs>;
+
+    auto rle_1 = codec::encode(bs{"00011100"});
+    auto rle_2 = codec::encode(bs{"11110101"});
+
+    auto dec_1 = decoder_t{rle_1.cbegin()};
+    auto dec_2 = decoder_t{rle_2.cbegin()};
+
+    auto intersection = rle_intersect{std::forward<decoder_t>(dec_1),
+                                      std::forward<decoder_t>(dec_2)}
+                            .get_bitset();
+
+    CHECK(intersection == bs{"00010100"});
+}
+
+TEST_CASE("rle_intersect works with many rle bitsets", "[rle_intersect]") {
+    using bs = stdx::bitset<32>;
+    using codec = rle_codec<bs>;
+    using decoder_t = rle_decoder<bs>;
+
+    auto rle_1 = codec::encode(bs{"11111111111111110000000000000000"});
+    auto rle_2 = codec::encode(bs{"11111111000000001111111100000000"});
+    auto rle_3 = codec::encode(bs{"11110000111100001111000011110000"});
+    auto rle_4 = codec::encode(bs{"11001100110011001100110011001100"});
+    auto rle_5 = codec::encode(bs{"10101010101010101010101010101010"});
+    auto expected = /* pad  */ bs{"10000000000000000000000000000000"};
+
+    auto dec_1 = decoder_t{rle_1.cbegin()};
+    auto dec_2 = decoder_t{rle_2.cbegin()};
+    auto dec_3 = decoder_t{rle_3.cbegin()};
+    auto dec_4 = decoder_t{rle_4.cbegin()};
+    auto dec_5 = decoder_t{rle_5.cbegin()};
+
+    auto intersection = rle_intersect{std::forward<decoder_t>(dec_1),
+                                      std::forward<decoder_t>(dec_2),
+                                      std::forward<decoder_t>(dec_3),
+                                      std::forward<decoder_t>(dec_4),
+                                      std::forward<decoder_t>(dec_5)}
+                            .get_bitset();
+
+    CHECK(intersection == expected);
+}
+
+TEST_CASE("rle_intersect for_each()", "[rle_intersect]") {
+    using bs = stdx::bitset<8>;
+    using codec = rle_codec<bs>;
+    using decoder_t = rle_decoder<bs>;
+
+    auto rle_1 = codec::encode(bs{"00011100"});
+    auto rle_2 = codec::encode(bs{"11110101"});
+    auto expected = /* pad  */ bs{"00010100"};
+
+    auto dec_1 = decoder_t{rle_1.cbegin()};
+    auto dec_2 = decoder_t{rle_2.cbegin()};
+
+    auto intersection = rle_intersect{std::forward<decoder_t>(dec_1),
+                                      std::forward<decoder_t>(dec_2)};
+
+    bs result{};
+    intersection.for_each([&](auto i) { result.set(i); });
+
+    CHECK(result == expected);
+}
+
 } // namespace msg::detail