From dfda30328457c833115d78f59e83780247d50911 Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Fri, 11 Aug 2023 11:54:21 +0200 Subject: [PATCH 01/13] Re-factor, simplify, and optimise load balancing. 1/n - Remove redundant MPI call for global gid list - Make GJ table local - Extract connection table builing. --- arbor/partition_load_balance.cpp | 97 ++++++++++++-------------------- 1 file changed, 37 insertions(+), 60 deletions(-) diff --git a/arbor/partition_load_balance.cpp b/arbor/partition_load_balance.cpp index 8718c743d1..13eff82721 100644 --- a/arbor/partition_load_balance.cpp +++ b/arbor/partition_load_balance.cpp @@ -20,13 +20,36 @@ namespace arb { -ARB_ARBOR_API domain_decomposition partition_load_balance( - const recipe& rec, - context ctx, - const partition_hint_map& hint_map) -{ +namespace { +// Build global GJ connectivity table such that +// * table[gid] is the set of all gids connected to gid via a GJ +// * iff A in table[B], then B in table[A] +auto build_global_gj_connection_table(const recipe& rec) { + std::unordered_map> res; + for (cell_gid_type gid = 0; gid < rec.num_cells(); ++gid) { + for (const auto& gj: rec.gap_junctions_on(gid)) { + res[gid].insert(gj.peer.gid); + } + } + + // Make all gj_connections bidirectional. + for (auto& [gid, local_conns]: res) { + for (auto peer: local_conns) { + auto& peer_conns = res[peer]; + if (!peer_conns.count(gid)) peer_conns.insert(gid); + } + } + return res; +} +} + +ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, + context ctx, + const partition_hint_map& hint_map) { using util::make_span; + const auto global_gj_connection_table = build_global_gj_connection_table(rec); + const auto& dist = ctx->distributed; unsigned num_domains = dist->size(); unsigned domain_id = dist->id(); @@ -36,56 +59,17 @@ ARB_ARBOR_API domain_decomposition partition_load_balance( auto dom_size = [&](unsigned dom) -> cell_gid_type { const cell_gid_type B = num_global_cells/num_domains; const cell_gid_type R = num_global_cells - num_domains*B; - return B + (dom gid_divisions; - auto gid_part = make_partition( - gid_divisions, transform_view(make_span(num_domains), dom_size)); - - // Global gj_connection table + auto gid_part = make_partition(gid_divisions, transform_view(make_span(num_domains), dom_size)); - // Generate a local gj_connection table. - // The table is indexed by the index of the target gid in the gid_part of that domain. - // If gid_part[domain_id] = [a, b); local_gj_connection of gid `x` is at index `x-a`. - const auto dom_range = gid_part[domain_id]; - std::vector> local_gj_connection_table(dom_range.second-dom_range.first); - for (auto gid: make_span(gid_part[domain_id])) { - for (const auto& c: rec.gap_junctions_on(gid)) { - local_gj_connection_table[gid-dom_range.first].push_back(c.peer.gid); - } - } - // Sort the gj connections of each local cell. - for (auto& gid_conns: local_gj_connection_table) { - util::sort(gid_conns); - } - // Gather the global gj_connection table. - // The global gj_connection table after gathering is indexed by gid. - auto global_gj_connection_table = dist->gather_gj_connections(local_gj_connection_table); - - // Make all gj_connections bidirectional. 
- std::vector> missing_peers(global_gj_connection_table.size()); - for (auto gid: make_span(global_gj_connection_table.size())) { - const auto& local_conns = global_gj_connection_table[gid]; - for (auto peer: local_conns) { - auto& peer_conns = global_gj_connection_table[peer]; - // If gid is not in the peer connection table insert it into the - // missing_peers set - if (!std::binary_search(peer_conns.begin(), peer_conns.end(), gid)) { - missing_peers[peer].insert(gid); - } - } - } - // Append the missing peers into the global_gj_connections table - for (unsigned i = 0; i < global_gj_connection_table.size(); ++i) { - std::move(missing_peers[i].begin(), missing_peers[i].end(), std::back_inserter(global_gj_connection_table[i])); - } // Local load balance - std::vector> super_cells; //cells connected by gj + std::vector> super_cells; // cells connected by gj std::vector reg_cells; //independent cells // Map to track visited cells (cells that already belong to a group) @@ -94,7 +78,7 @@ ARB_ARBOR_API domain_decomposition partition_load_balance( // Connected components algorithm using BFS std::queue q; for (auto gid: make_span(gid_part[domain_id])) { - if (!global_gj_connection_table[gid].empty()) { + if (global_gj_connection_table.count(gid)) { // If cell hasn't been visited yet, must belong to new super_cell // Perform BFS starting from that cell if (!visited.count(gid)) { @@ -106,18 +90,16 @@ ARB_ARBOR_API domain_decomposition partition_load_balance( q.pop(); cg.push_back(element); // Adjacency list - for (const auto& peer: global_gj_connection_table[element]) { - if (visited.insert(peer).second) { - q.push(peer); - } + for (const auto& peer: global_gj_connection_table.at(element)) { + if (visited.insert(peer).second) q.push(peer); } } - super_cells.push_back(cg); + super_cells.emplace_back(std::move(cg)); } } else { // If cell has no gap_junctions, put in separate group of independent cells - reg_cells.push_back(gid); + reg_cells.emplace_back(gid); } } @@ -223,12 +205,7 @@ ARB_ARBOR_API domain_decomposition partition_load_balance( } } - // Exchange gid list with all other nodes - // global all-to-all to gather a local copy of the global gid list on each node. - auto global_gids = dist->gather_gids(local_gids); - - return domain_decomposition(rec, ctx, groups); + return {rec, ctx, groups}; } - } // namespace arb From 204c33f43df47e899bf6197e2e7463d98687f0e8 Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Fri, 11 Aug 2023 12:02:56 +0200 Subject: [PATCH 02/13] Remove obsolete MPI methods. 
--- arbor/communication/dry_run_context.cpp | 20 -------------------- arbor/communication/mpi_context.cpp | 10 ---------- arbor/distributed_context.hpp | 14 -------------- arbor/partition_load_balance.cpp | 3 ++- 4 files changed, 2 insertions(+), 45 deletions(-) diff --git a/arbor/communication/dry_run_context.cpp b/arbor/communication/dry_run_context.cpp index affcd77660..1a2f3ea2b1 100644 --- a/arbor/communication/dry_run_context.cpp +++ b/arbor/communication/dry_run_context.cpp @@ -74,26 +74,6 @@ struct dry_run_context_impl { return gathered_vector(std::move(gathered_gids), std::move(partition)); } - std::vector> - gather_gj_connections(const std::vector> & local_connections) const { - auto local_size = local_connections.size(); - std::vector> global_connections; - global_connections.reserve(local_size*num_ranks_); - - for (unsigned i = 0; i < num_ranks_; i++) { - util::append(global_connections, local_connections); - } - - for (unsigned i = 0; i < num_ranks_; i++) { - for (unsigned j = i*local_size; j < (i+1)*local_size; j++){ - for (auto& conn_gid: global_connections[j]) { - conn_gid += num_cells_per_tile_*i; - } - } - } - return global_connections; - } - cell_label_range gather_cell_label_range(const cell_label_range& local_ranges) const { cell_label_range global_ranges; for (unsigned i = 0; i < num_ranks_; i++) { diff --git a/arbor/communication/mpi_context.cpp b/arbor/communication/mpi_context.cpp index 6019e76065..f2d31a51f6 100644 --- a/arbor/communication/mpi_context.cpp +++ b/arbor/communication/mpi_context.cpp @@ -53,11 +53,6 @@ struct mpi_context_impl { return mpi::gather_all_with_partition(local_gids, comm_); } - std::vector> - gather_gj_connections(const std::vector>& local_connections) const { - return mpi::gather_all(local_connections, comm_); - } - cell_label_range gather_cell_label_range(const cell_label_range& local_ranges) const { std::vector sizes; std::vector labels; @@ -141,11 +136,6 @@ struct remote_context_impl { gathered_vector gather_gids(const std::vector& local_gids) const { return mpi_.gather_gids(local_gids); } - std::vector> - gather_gj_connections(const std::vector>& local_connections) const { - return mpi_.gather_gj_connections(local_connections); - } - cell_label_range gather_cell_label_range(const cell_label_range& local_ranges) const { return mpi_.gather_cell_label_range(local_ranges); } diff --git a/arbor/distributed_context.hpp b/arbor/distributed_context.hpp index c988177d33..76e8f4c0a9 100644 --- a/arbor/distributed_context.hpp +++ b/arbor/distributed_context.hpp @@ -72,10 +72,6 @@ class distributed_context { return impl_->gather_gids(local_gids); } - gj_connection_vector gather_gj_connections(const gj_connection_vector& local_connections) const { - return impl_->gather_gj_connections(local_connections); - } - cell_label_range gather_cell_label_range(const cell_label_range& local_ranges) const { return impl_->gather_cell_label_range(local_ranges); } @@ -117,8 +113,6 @@ class distributed_context { remote_gather_spikes(const spike_vector& local_spikes) const = 0; virtual gathered_vector gather_gids(const gid_vector& local_gids) const = 0; - virtual gj_connection_vector - gather_gj_connections(const gj_connection_vector& local_connections) const = 0; virtual cell_label_range gather_cell_label_range(const cell_label_range& local_ranges) const = 0; virtual cell_labels_and_gids @@ -154,10 +148,6 @@ class distributed_context { gather_gids(const gid_vector& local_gids) const override { return wrapped.gather_gids(local_gids); } - std::vector> - 
gather_gj_connections(const gj_connection_vector& local_connections) const override { - return wrapped.gather_gj_connections(local_connections); - } cell_label_range gather_cell_label_range(const cell_label_range& local_ranges) const override { return wrapped.gather_cell_label_range(local_ranges); @@ -217,10 +207,6 @@ struct local_context { } void remote_ctrl_send_continue(const epoch&) const {} void remote_ctrl_send_done() const {} - std::vector> - gather_gj_connections(const std::vector>& local_connections) const { - return local_connections; - } cell_label_range gather_cell_label_range(const cell_label_range& local_ranges) const { return local_ranges; diff --git a/arbor/partition_load_balance.cpp b/arbor/partition_load_balance.cpp index 13eff82721..ea2414edf7 100644 --- a/arbor/partition_load_balance.cpp +++ b/arbor/partition_load_balance.cpp @@ -122,6 +122,7 @@ ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, }; std::vector local_gids; std::unordered_map> kind_lists; + for (auto gid: reg_cells) { local_gids.push_back(gid); kind_lists[rec.get_cell_kind(gid)].push_back({gid, false}); @@ -138,7 +139,6 @@ ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, kind_lists[kind].push_back({i, true}); } - // Create a flat vector of the cell kinds present on this node, // partitioned such that kinds for which GPU implementation are // listed before the others. This is a very primitive attempt at @@ -195,6 +195,7 @@ ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, group_elements.push_back(gid); } } + if (group_elements.size()>=group_size) { groups.emplace_back(k, std::move(group_elements), backend); group_elements.clear(); From c97e9c5c1a2781da0bc45494276bc1caab5b0662 Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Fri, 11 Aug 2023 13:01:57 +0200 Subject: [PATCH 03/13] Remove super/regular cell split. Much simplification ensues. 
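As an aside, the unified representation can be pictured with a small single-rank
sketch (illustrative code only, names invented here; the real implementation in
the diff below additionally sorts each component and keeps only the ones owned
by the local domain):

    #include <cstdint>
    #include <queue>
    #include <unordered_map>
    #include <unordered_set>
    #include <vector>

    using gid = std::uint32_t;
    using component = std::vector<gid>;
    using adjacency = std::unordered_map<gid, std::unordered_set<gid>>;

    // Every cell lands in exactly one component: isolated cells become
    // singleton components, GJ-coupled cells are grouped by BFS.
    std::vector<component> make_components(gid num_cells, const adjacency& adj) {
        std::vector<component> out;
        std::unordered_set<gid> seen;
        for (gid g = 0; g < num_cells; ++g) {
            if (!adj.count(g)) { out.push_back({g}); continue; } // no GJs: singleton
            if (!seen.insert(g).second) continue;                // already grouped
            component c;
            std::queue<gid> q;
            q.push(g);
            while (!q.empty()) {                                 // BFS over GJ edges
                gid x = q.front(); q.pop();
                c.push_back(x);
                for (gid p: adj.at(x)) if (seen.insert(p).second) q.push(p);
            }
            out.push_back(std::move(c));
        }
        return out;
    }

    int main() {
        adjacency adj{{1, {3}}, {3, {1}}};  // cells 1 and 3 coupled by a GJ
        auto cs = make_components(4, adj);  // -> {0}, {1, 3}, {2}
        return cs.size() == 3 ? 0 : 1;
    }

Downstream code no longer needs an is_super_cell flag: a one-element and a
multi-element component are packed into groups by exactly the same loop.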
--- arbor/partition_load_balance.cpp | 117 +++++++++++++------------------ 1 file changed, 50 insertions(+), 67 deletions(-) diff --git a/arbor/partition_load_balance.cpp b/arbor/partition_load_balance.cpp index ea2414edf7..99ac90d38d 100644 --- a/arbor/partition_load_balance.cpp +++ b/arbor/partition_load_balance.cpp @@ -41,20 +41,11 @@ auto build_global_gj_connection_table(const recipe& rec) { } return res; } -} - -ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, - context ctx, - const partition_hint_map& hint_map) { - using util::make_span; - - const auto global_gj_connection_table = build_global_gj_connection_table(rec); +auto make_local_gid_range(context ctx, cell_gid_type num_global_cells) { const auto& dist = ctx->distributed; unsigned num_domains = dist->size(); unsigned domain_id = dist->id(); - const bool gpu_avail = ctx->gpu->has_gpu(); - auto num_global_cells = rec.num_cells(); auto dom_size = [&](unsigned dom) -> cell_gid_type { const cell_gid_type B = num_global_cells/num_domains; @@ -62,22 +53,24 @@ ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, return B + (dom < R); }; - // Global load balance std::vector gid_divisions; - auto gid_part = make_partition(gid_divisions, transform_view(make_span(num_domains), dom_size)); - + auto gid_part = util::make_partition(gid_divisions, + util::transform_view(util::make_span(num_domains), dom_size)); - // Local load balance + return gid_part[domain_id]; +} +auto build_components(const std::unordered_map>& global_gj_connection_table, + std::pair local_gid_range) { std::vector> super_cells; // cells connected by gj - std::vector reg_cells; //independent cells + std::vector> res; - // Map to track visited cells (cells that already belong to a group) + // track visited cells (cells that already belong to a group) std::unordered_set visited; // Connected components algorithm using BFS std::queue q; - for (auto gid: make_span(gid_part[domain_id])) { + for (auto gid: util::make_span(local_gid_range)) { if (global_gj_connection_table.count(gid)) { // If cell hasn't been visited yet, must belong to new super_cell // Perform BFS starting from that cell @@ -98,45 +91,39 @@ ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, } } else { - // If cell has no gap_junctions, put in separate group of independent cells - reg_cells.emplace_back(gid); + // If cell has no gap_junctions, put in front + res.push_back({gid}); } } // Sort super_cell groups and only keep those where the first element in the group belongs to domain - super_cells.erase(std::remove_if(super_cells.begin(), super_cells.end(), - [gid_part, domain_id](std::vector& cg) - { - std::sort(cg.begin(), cg.end()); - return cg.front() < gid_part[domain_id].first; - }), super_cells.end()); - - // Collect local gids that belong to this rank, and sort gids into kind lists - // kind_lists maps a cell_kind to a vector of either: - // 1. gids of regular cells (in reg_cells) - // 2. 
indices of supercells (in super_cells) - - struct cell_identifier { - cell_gid_type id; - bool is_super_cell; - }; - std::vector local_gids; - std::unordered_map> kind_lists; - - for (auto gid: reg_cells) { - local_gids.push_back(gid); - kind_lists[rec.get_cell_kind(gid)].push_back({gid, false}); + for (auto& sc: super_cells) { + std::sort(sc.begin(), sc.end()); + if (!sc.empty() && sc.front() >= local_gid_range.first) res.push_back(sc); } + return res; +} +} - for (unsigned i = 0; i < super_cells.size(); i++) { - auto kind = rec.get_cell_kind(super_cells[i].front()); - for (auto gid: super_cells[i]) { - if (rec.get_cell_kind(gid) != kind) { - throw gj_kind_mismatch(gid, super_cells[i].front()); - } +ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, + context ctx, + const partition_hint_map& hint_map) { + const auto global_gj_connection_table = build_global_gj_connection_table(rec); + const auto local_gid_range = make_local_gid_range(ctx, rec.num_cells()); + const auto components = build_components(global_gj_connection_table, local_gid_range); + + std::vector local_gids; + std::unordered_map> kind_lists; + + for (auto idx: util::make_span(components.size())) { + const auto& component = components[idx]; + const auto& first_gid = component.front(); + auto kind = rec.get_cell_kind(first_gid); + for (auto gid: component) { + if (rec.get_cell_kind(gid) != kind) throw gj_kind_mismatch(gid, first_gid); local_gids.push_back(gid); } - kind_lists[kind].push_back({i, true}); + kind_lists[kind].push_back((cell_gid_type) idx); } // Create a flat vector of the cell kinds present on this node, @@ -150,13 +137,11 @@ ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, // of cell group updates according to rules such as the back end on // which the cell group is running. 
- auto has_gpu_backend = [&ctx](cell_kind c) { - return cell_kind_supported(c, backend_kind::gpu, *ctx); - }; + auto has_gpu_backend = [&ctx](cell_kind c) { return ctx->gpu->has_gpu() && cell_kind_supported(c, backend_kind::gpu, *ctx); }; std::vector kinds; for (auto l: kind_lists) { - kinds.push_back(cell_kind(l.first)); + kinds.push_back(l.first); } std::partition(kinds.begin(), kinds.end(), has_gpu_backend); @@ -166,36 +151,34 @@ ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, if (auto opt_hint = util::value_by_key(hint_map, k)) { hint = opt_hint.value(); if(!hint.cpu_group_size) { - throw arbor_exception(arb::util::pprintf("unable to perform load balancing because {} has invalid suggested cpu_cell_group size of {}", k, hint.cpu_group_size)); + throw arbor_exception(arb::util::pprintf("unable to perform load balancing because {} has invalid suggested cpu_cell_group size of {}", + k, hint.cpu_group_size)); } if(hint.prefer_gpu && !hint.gpu_group_size) { - throw arbor_exception(arb::util::pprintf("unable to perform load balancing because {} has invalid suggested gpu_cell_group size of {}", k, hint.gpu_group_size)); + throw arbor_exception(arb::util::pprintf("unable to perform load balancing because {} has invalid suggested gpu_cell_group size of {}", + k, hint.gpu_group_size)); } } backend_kind backend = backend_kind::multicore; std::size_t group_size = hint.cpu_group_size; - if (hint.prefer_gpu && gpu_avail && has_gpu_backend(k)) { + if (hint.prefer_gpu && has_gpu_backend(k)) { backend = backend_kind::gpu; group_size = hint.gpu_group_size; } std::vector group_elements; - // group_elements are sorted such that the gids of all members of a super_cell are consecutive. + // group_elements are sorted such that the gids of all members of a component are consecutive. for (auto cell: kind_lists[k]) { - if (!cell.is_super_cell) { - group_elements.push_back(cell.id); - } else { - if (group_elements.size() + super_cells[cell.id].size() > group_size && !group_elements.empty()) { - groups.emplace_back(k, std::move(group_elements), backend); - group_elements.clear(); - } - for (auto gid: super_cells[cell.id]) { - group_elements.push_back(gid); - } + const auto& component = components[cell]; + if (group_elements.size() + component.size() > group_size && !group_elements.empty()) { + groups.emplace_back(k, std::move(group_elements), backend); + group_elements.clear(); + } + for (auto gid: component) { + group_elements.push_back(gid); } - if (group_elements.size()>=group_size) { groups.emplace_back(k, std::move(group_elements), backend); group_elements.clear(); From 9fa29bd3fc5617694ee1eb07b84624c374f5c6a3 Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Fri, 11 Aug 2023 13:39:09 +0200 Subject: [PATCH 04/13] Simplify range computation. 
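For reference, the arithmetic used by make_local_gid_range can be checked in
isolation (standalone sketch, names invented for the example): domain `d` out
of `D` gets floor(N/D) cells, and the first N % D domains get one extra cell,
so the per-domain ranges tile [0, N) exactly.

    #include <cassert>
    #include <cstdint>
    #include <utility>

    using gid = std::uint32_t;

    std::pair<gid, gid> local_range(gid num_cells, unsigned num_dom, unsigned dom) {
        gid block = num_cells/num_dom;          // baseline block size
        gid extra = num_cells - num_dom*block;  // == num_cells % num_dom
        if (dom < extra) {                      // all earlier domains hold block+1
            gid beg = dom*(block + 1);
            return {beg, beg + block + 1};
        }
        gid beg = extra + dom*block;            // first `extra` domains added one each
        return {beg, beg + block};
    }

    int main() {
        // e.g. 10 cells over 4 domains -> sizes 3, 3, 2, 2 covering [0, 10)
        gid next = 0;
        for (unsigned d = 0; d < 4; ++d) {
            auto [b, e] = local_range(10, 4, d);
            assert(b == next);
            next = e;
        }
        assert(next == 10);
    }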
--- arbor/partition_load_balance.cpp | 81 +++++++++++++++++++------------- 1 file changed, 48 insertions(+), 33 deletions(-) diff --git a/arbor/partition_load_balance.cpp b/arbor/partition_load_balance.cpp index 99ac90d38d..8d1988cd78 100644 --- a/arbor/partition_load_balance.cpp +++ b/arbor/partition_load_balance.cpp @@ -21,11 +21,17 @@ namespace arb { namespace { +using gj_connection_set = std::unordered_set; +using gj_connection_table = std::unordered_map; +using gid_range = std::pair; + // Build global GJ connectivity table such that // * table[gid] is the set of all gids connected to gid via a GJ // * iff A in table[B], then B in table[A] auto build_global_gj_connection_table(const recipe& rec) { - std::unordered_map> res; + gj_connection_table res; + + // Collect all explicit GJ connections for (cell_gid_type gid = 0; gid < rec.num_cells(); ++gid) { for (const auto& gj: rec.gap_junctions_on(gid)) { res[gid].insert(gj.peer.gid); @@ -34,46 +40,58 @@ auto build_global_gj_connection_table(const recipe& rec) { // Make all gj_connections bidirectional. for (auto& [gid, local_conns]: res) { - for (auto peer: local_conns) { + for (auto peer: local_conns) { auto& peer_conns = res[peer]; if (!peer_conns.count(gid)) peer_conns.insert(gid); } } + return res; } +// compute range of gids for the local domain, such that the first (= num_cells +// % num_dom) domains get an extra element. auto make_local_gid_range(context ctx, cell_gid_type num_global_cells) { const auto& dist = ctx->distributed; unsigned num_domains = dist->size(); unsigned domain_id = dist->id(); - - auto dom_size = [&](unsigned dom) -> cell_gid_type { - const cell_gid_type B = num_global_cells/num_domains; - const cell_gid_type R = num_global_cells - num_domains*B; - return B + (dom < R); - }; - - std::vector gid_divisions; - auto gid_part = util::make_partition(gid_divisions, - util::transform_view(util::make_span(num_domains), dom_size)); - - return gid_part[domain_id]; + // normal block size + auto block = num_global_cells/num_domains; + // domains that need an extra element + auto extra = num_global_cells - num_domains*block; + // now compute the range + if (domain_id < extra) { + // all previous domains, incl ours, have an extra element + auto beg = domain_id*(block + 1); + auto end = beg + block + 1; + return std::make_pair(beg, end); + } + else { + // in this case the first `extra` domains added an extra element and the + // rest has size `block` + auto beg = extra + domain_id*block; + auto end = beg + block; + return std::make_pair(beg, end); + } } -auto build_components(const std::unordered_map>& global_gj_connection_table, - std::pair local_gid_range) { - std::vector> super_cells; // cells connected by gj +// build the list of components for the local domain, where a component is a list of +// cell gids such that +// * the smallest gid in the list is in the local_gid_range +// * all gids that are connected to the smallest gid are also in the list +// * all gids w/o GJ connections come first (for historical reasons!?) 
+auto build_components(const gj_connection_table& global_gj_connection_table, + gid_range local_gid_range) { + // cells connected by gj + std::vector> super_cells; std::vector> res; - // track visited cells (cells that already belong to a group) - std::unordered_set visited; - - // Connected components algorithm using BFS + gj_connection_set visited; + // Connected components via BFS std::queue q; for (auto gid: util::make_span(local_gid_range)) { if (global_gj_connection_table.count(gid)) { - // If cell hasn't been visited yet, must belong to new super_cell - // Perform BFS starting from that cell + // If cell hasn't been visited yet, must belong to new component if (!visited.count(gid)) { visited.insert(gid); std::vector cg; @@ -91,15 +109,15 @@ auto build_components(const std::unordered_map= local_gid_range.first) res.push_back(sc); + if (!sc.empty() && sc.front() >= local_gid_range.first) res.emplace_back(std::move(sc)); } return res; } @@ -138,11 +156,8 @@ ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, // which the cell group is running. auto has_gpu_backend = [&ctx](cell_kind c) { return ctx->gpu->has_gpu() && cell_kind_supported(c, backend_kind::gpu, *ctx); }; - std::vector kinds; - for (auto l: kind_lists) { - kinds.push_back(l.first); - } + for (const auto& [kind, _lint]: kind_lists) kinds.push_back(kind); std::partition(kinds.begin(), kinds.end(), has_gpu_backend); std::vector groups; @@ -150,11 +165,11 @@ ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, partition_hint hint; if (auto opt_hint = util::value_by_key(hint_map, k)) { hint = opt_hint.value(); - if(!hint.cpu_group_size) { + if (!hint.cpu_group_size) { throw arbor_exception(arb::util::pprintf("unable to perform load balancing because {} has invalid suggested cpu_cell_group size of {}", k, hint.cpu_group_size)); } - if(hint.prefer_gpu && !hint.gpu_group_size) { + if (hint.prefer_gpu && !hint.gpu_group_size) { throw arbor_exception(arb::util::pprintf("unable to perform load balancing because {} has invalid suggested gpu_cell_group size of {}", k, hint.gpu_group_size)); } From b4e92029eb774f9d4981e6c8fa7cfae309ceb377 Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Fri, 11 Aug 2023 14:13:09 +0200 Subject: [PATCH 05/13] More clean-up, save on RSS. - add group parameters struct to bundle info - coral temporary structures into their own scopes to avoid RSS growth. 
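The "scopes" point as a toy sketch (illustrative only, nothing from the tree):
the global GJ table is only needed to derive the components, so building it
inside a helper lets it be destroyed as soon as the condensed result is
returned instead of staying resident for the rest of partition_load_balance.

    #include <cstddef>
    #include <vector>

    // Stand-in for build_local_components: only the condensed result escapes.
    std::vector<std::size_t> condensed() {
        std::vector<std::vector<std::size_t>> big_table(1 << 20); // large temporary
        // ... fill big_table, derive the small summary from it ...
        std::vector<std::size_t> summary{big_table.size()};
        return summary;            // big_table is freed right here
    }

    int main() {
        auto small = condensed();  // only the summary is still resident
        // ... the remaining load-balancing work never sees the big temporary ...
        return small.empty() ? 1 : 0;
    }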
--- arbor/partition_load_balance.cpp | 119 +++++++++++++++++-------------- 1 file changed, 66 insertions(+), 53 deletions(-) diff --git a/arbor/partition_load_balance.cpp b/arbor/partition_load_balance.cpp index 8d1988cd78..9cf1b2c0dc 100644 --- a/arbor/partition_load_balance.cpp +++ b/arbor/partition_load_balance.cpp @@ -121,14 +121,65 @@ auto build_components(const gj_connection_table& global_gj_connection_table, } return res; } + +// Figure what backend and group size to use +auto get_backend(cell_kind k, const partition_hint_map& hint_map, bool has_gpu) { + const auto& hint = util::value_by_key_or(hint_map, k, {}); + if (!hint.cpu_group_size) { + throw arbor_exception(arb::util::pprintf("unable to perform load balancing because {} has invalid suggested cpu_cell_group size of {}", + k, hint.cpu_group_size)); + } + if (hint.prefer_gpu && !hint.gpu_group_size) { + throw arbor_exception(arb::util::pprintf("unable to perform load balancing because {} has invalid suggested gpu_cell_group size of {}", + k, hint.gpu_group_size)); + } + if (hint.prefer_gpu && has_gpu) return std::make_pair(backend_kind::gpu, hint.gpu_group_size); + return std::make_pair(backend_kind::multicore, hint.cpu_group_size); +} + +struct group_parameters { + cell_kind kind; + bool has_gpu; + backend_kind backend; + size_t size; +}; + +// Create a flat vector of the cell kinds present on this node, partitioned such +// that kinds for which GPU implementation are listed before the others. This is +// a very primitive attempt at scheduling; the cell_groups that run on the GPU +// will be executed before other cell_groups, which is likely to be more +// efficient. +// +// TODO: This creates an dependency between the load balancer and the threading +// internals. We need support for setting the priority of cell group updates +// according to rules such as the back end on which the cell group is running. +auto build_group_parameters(context ctx, + const partition_hint_map& hint_map, + const std::unordered_map>& kind_lists) { + std::vector res; + for (const auto& [kind, _gids]: kind_lists) { + auto has_gpu = ctx->gpu->has_gpu() && cell_kind_supported(kind, backend_kind::gpu, *ctx); + const auto& [backend, group_size] = get_backend(kind, hint_map, has_gpu); + res.push_back({kind, has_gpu, backend, group_size}); + } + std::partition(res.begin(), res.end(), [](const auto& p) { return p.has_gpu; }); + return res; +} + +// Build the list of GJ-connected cells local to this domain. +// NOTE We put this into its own function to avoid increasing RSS. 
+auto build_local_components(const recipe& rec, context ctx) { + const auto global_gj_connection_table = build_global_gj_connection_table(rec); + const auto local_gid_range = make_local_gid_range(ctx, rec.num_cells()); + return build_components(global_gj_connection_table, local_gid_range); +} + } ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, context ctx, const partition_hint_map& hint_map) { - const auto global_gj_connection_table = build_global_gj_connection_table(rec); - const auto local_gid_range = make_local_gid_range(ctx, rec.num_cells()); - const auto components = build_components(global_gj_connection_table, local_gid_range); + const auto components = build_local_components(rec, ctx); std::vector local_gids; std::unordered_map> kind_lists; @@ -144,64 +195,26 @@ ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, kind_lists[kind].push_back((cell_gid_type) idx); } - // Create a flat vector of the cell kinds present on this node, - // partitioned such that kinds for which GPU implementation are - // listed before the others. This is a very primitive attempt at - // scheduling; the cell_groups that run on the GPU will be executed - // before other cell_groups, which is likely to be more efficient. - // - // TODO: This creates an dependency between the load balancer and - // the threading internals. We need support for setting the priority - // of cell group updates according to rules such as the back end on - // which the cell group is running. - - auto has_gpu_backend = [&ctx](cell_kind c) { return ctx->gpu->has_gpu() && cell_kind_supported(c, backend_kind::gpu, *ctx); }; - std::vector kinds; - for (const auto& [kind, _lint]: kind_lists) kinds.push_back(kind); - std::partition(kinds.begin(), kinds.end(), has_gpu_backend); + auto kinds = build_group_parameters(ctx, hint_map, kind_lists); std::vector groups; - for (auto k: kinds) { - partition_hint hint; - if (auto opt_hint = util::value_by_key(hint_map, k)) { - hint = opt_hint.value(); - if (!hint.cpu_group_size) { - throw arbor_exception(arb::util::pprintf("unable to perform load balancing because {} has invalid suggested cpu_cell_group size of {}", - k, hint.cpu_group_size)); - } - if (hint.prefer_gpu && !hint.gpu_group_size) { - throw arbor_exception(arb::util::pprintf("unable to perform load balancing because {} has invalid suggested gpu_cell_group size of {}", - k, hint.gpu_group_size)); - } - } - - backend_kind backend = backend_kind::multicore; - std::size_t group_size = hint.cpu_group_size; - - if (hint.prefer_gpu && has_gpu_backend(k)) { - backend = backend_kind::gpu; - group_size = hint.gpu_group_size; - } - + for (const auto& params: kinds) { std::vector group_elements; // group_elements are sorted such that the gids of all members of a component are consecutive. 
- for (auto cell: kind_lists[k]) { + for (auto cell: kind_lists[params.kind]) { const auto& component = components[cell]; - if (group_elements.size() + component.size() > group_size && !group_elements.empty()) { - groups.emplace_back(k, std::move(group_elements), backend); - group_elements.clear(); - } - for (auto gid: component) { - group_elements.push_back(gid); - } - if (group_elements.size()>=group_size) { - groups.emplace_back(k, std::move(group_elements), backend); + // adding the current group would go beyond alloted size, so add to the list + // of groups and start a new one + if (group_elements.size() + component.size() > params.size && !group_elements.empty()) { + groups.emplace_back(params.kind, std::move(group_elements), params.backend); group_elements.clear(); } + // we are clear to add the current component. NOTE this may exceed + // the alloted size, but only by the minimal amount manageable + group_elements.insert(group_elements.end(), component.begin(), component.end()); } - if (!group_elements.empty()) { - groups.emplace_back(k, std::move(group_elements), backend); - } + // we may have a trailing, incomplete group, so add this + if (!group_elements.empty()) groups.emplace_back(params.kind, std::move(group_elements), params.backend); } return {rec, ctx, groups}; From b431cd74b9255fdecb3432bcccb550e1ac67dfe1 Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Fri, 11 Aug 2023 14:19:32 +0200 Subject: [PATCH 06/13] Add a duplicate connection to test. --- arbor/partition_load_balance.cpp | 6 +++--- test/unit/test_domain_decomposition.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arbor/partition_load_balance.cpp b/arbor/partition_load_balance.cpp index 9cf1b2c0dc..886e6c0574 100644 --- a/arbor/partition_load_balance.cpp +++ b/arbor/partition_load_balance.cpp @@ -174,7 +174,7 @@ auto build_local_components(const recipe& rec, context ctx) { return build_components(global_gj_connection_table, local_gid_range); } -} +} // namespace ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, context ctx, @@ -204,7 +204,7 @@ ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, for (auto cell: kind_lists[params.kind]) { const auto& component = components[cell]; // adding the current group would go beyond alloted size, so add to the list - // of groups and start a new one + // of groups and start a new one. if (group_elements.size() + component.size() > params.size && !group_elements.empty()) { groups.emplace_back(params.kind, std::move(group_elements), params.backend); group_elements.clear(); @@ -213,7 +213,7 @@ ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, // the alloted size, but only by the minimal amount manageable group_elements.insert(group_elements.end(), component.begin(), component.end()); } - // we may have a trailing, incomplete group, so add this + // we may have a trailing, incomplete group, so add it. 
if (!group_elements.empty()) groups.emplace_back(params.kind, std::move(group_elements), params.backend); } diff --git a/test/unit/test_domain_decomposition.cpp b/test/unit/test_domain_decomposition.cpp index ab8a7c3e4a..fd3d0a555d 100644 --- a/test/unit/test_domain_decomposition.cpp +++ b/test/unit/test_domain_decomposition.cpp @@ -444,7 +444,7 @@ TEST(domain_decomposition, unidirectional_gj_recipe) { {}, {}, {}, - {gap_junction_connection({4, "gj"}, {"gj"}, 0.1)}, + {gap_junction_connection({4, "gj"}, {"gj"}, 0.1), gap_junction_connection({4, "gj"}, {"gj"}, 0.1)}, {}, {}, {gap_junction_connection({5, "gj"}, {"gj"}, 0.1), gap_junction_connection({7, "gj"}, {"gj"}, 0.1)}, From 27a0aa7ee0a8cf46e733ca0d30c4bc0dc377a971 Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Fri, 11 Aug 2023 14:32:51 +0200 Subject: [PATCH 07/13] Style and polish. --- arbor/partition_load_balance.cpp | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/arbor/partition_load_balance.cpp b/arbor/partition_load_balance.cpp index 886e6c0574..473e4efdaf 100644 --- a/arbor/partition_load_balance.cpp +++ b/arbor/partition_load_balance.cpp @@ -24,6 +24,7 @@ namespace { using gj_connection_set = std::unordered_set; using gj_connection_table = std::unordered_map; using gid_range = std::pair; +using super_cell = std::vector; // Build global GJ connectivity table such that // * table[gid] is the set of all gids connected to gid via a GJ @@ -83,8 +84,9 @@ auto make_local_gid_range(context ctx, cell_gid_type num_global_cells) { auto build_components(const gj_connection_table& global_gj_connection_table, gid_range local_gid_range) { // cells connected by gj - std::vector> super_cells; - std::vector> res; + std::vector super_cells; + // singular cells + std::vector res; // track visited cells (cells that already belong to a group) gj_connection_set visited; // Connected components via BFS @@ -92,20 +94,19 @@ auto build_components(const gj_connection_table& global_gj_connection_table, for (auto gid: util::make_span(local_gid_range)) { if (global_gj_connection_table.count(gid)) { // If cell hasn't been visited yet, must belong to new component - if (!visited.count(gid)) { - visited.insert(gid); - std::vector cg; + if (visited.insert(gid).second) { q.push(gid); + super_cell sc; while (!q.empty()) { auto element = q.front(); q.pop(); - cg.push_back(element); - // Adjacency list + sc.push_back(element); + // traverse conjoined cells for (const auto& peer: global_gj_connection_table.at(element)) { if (visited.insert(peer).second) q.push(peer); } } - super_cells.emplace_back(std::move(cg)); + super_cells.emplace_back(std::move(sc)); } } else { @@ -117,8 +118,10 @@ auto build_components(const gj_connection_table& global_gj_connection_table, // group belongs to our domain for (auto sc: super_cells) { std::sort(sc.begin(), sc.end()); - if (!sc.empty() && sc.front() >= local_gid_range.first) res.emplace_back(std::move(sc)); + // SAFETY super cells are never empty. + if (sc.front() >= local_gid_range.first) res.emplace_back(std::move(sc)); } + return res; } From 8195b52b0f11a7275c2973859c64c4cbec5c8e96 Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Fri, 11 Aug 2023 14:44:36 +0200 Subject: [PATCH 08/13] More comments less work. 
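The ownership rule that makes the second pass unnecessary, as a standalone
sketch (helper and names invented here): a component is kept by the domain
that owns its smallest ("pivot") gid, so each component is kept by exactly one
rank. In the code below only the lower bound is tested, because the BFS starts
from a gid inside the local range and the pivot can therefore never reach the
range's end.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    using gid = std::uint32_t;

    // Keep a component iff its smallest gid falls into this domain's range.
    bool owns(const std::vector<gid>& component, gid dom_begin, gid dom_end) {
        gid pivot = *std::min_element(component.begin(), component.end());
        return dom_begin <= pivot && pivot < dom_end;
    }

    int main() {
        std::vector<gid> c{7, 2, 9};
        // domain [0, 5) keeps the component (pivot 2); domain [5, 10) skips it
        return (owns(c, 0, 5) && !owns(c, 5, 10)) ? 0 : 1;
    }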
--- arbor/partition_load_balance.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/arbor/partition_load_balance.cpp b/arbor/partition_load_balance.cpp index 473e4efdaf..3adafc6ee5 100644 --- a/arbor/partition_load_balance.cpp +++ b/arbor/partition_load_balance.cpp @@ -95,18 +95,27 @@ auto build_components(const gj_connection_table& global_gj_connection_table, if (global_gj_connection_table.count(gid)) { // If cell hasn't been visited yet, must belong to new component if (visited.insert(gid).second) { + // pivot gid: the smallest found in this group; must be at + // smaller or equal to `gid`. + auto min_gid = gid; q.push(gid); super_cell sc; while (!q.empty()) { auto element = q.front(); q.pop(); sc.push_back(element); - // traverse conjoined cells + min_gid = std::min(element, min_gid); + // queue up conjoined cells for (const auto& peer: global_gj_connection_table.at(element)) { if (visited.insert(peer).second) q.push(peer); } } - super_cells.emplace_back(std::move(sc)); + // if the pivot gid belongs to our domain, this group will be part + // of our domain, keep it and sort. + if (min_gid >= local_gid_range.first) { + std::sort(sc.begin(), sc.end()); + super_cells.emplace_back(std::move(sc)); + } } } else { @@ -114,14 +123,8 @@ auto build_components(const gj_connection_table& global_gj_connection_table, } } - // Sort super_cell groups and only keep those where the first element in the - // group belongs to our domain - for (auto sc: super_cells) { - std::sort(sc.begin(), sc.end()); - // SAFETY super cells are never empty. - if (sc.front() >= local_gid_range.first) res.emplace_back(std::move(sc)); - } - + // append super cells to result + res.insert(res.end(), super_cells.begin(), super_cells.end()); return res; } From cd13d2df8e6b6d5adfc4f75cb7f3862a851f4772 Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Fri, 11 Aug 2023 21:11:04 +0200 Subject: [PATCH 09/13] Moar simple, moar correct connection table. Also, add weird test to see that we can _almost_ construct the partition. 
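A minimal sketch of the one-pass construction (types and names invented for
the illustration): inserting both directions while reading the recipe makes
the table symmetric by construction, and because the per-gid containers are
sets, a duplicated connection collapses to a single entry, which is why the
duplicate gap_junction_connection added to the unit test earlier in this
series is harmless.

    #include <cstdint>
    #include <unordered_map>
    #include <unordered_set>
    #include <utility>
    #include <vector>

    using gid = std::uint32_t;
    using table = std::unordered_map<gid, std::unordered_set<gid>>;

    table symmetric(const std::vector<std::pair<gid, gid>>& conns) {
        table t;
        for (auto [a, b]: conns) {
            t[a].insert(b);   // forward edge
            t[b].insert(a);   // reverse edge, added immediately
        }
        return t;
    }

    int main() {
        // duplicate (4, 9) collapses; every edge ends up mutual
        auto t = symmetric({{4, 9}, {4, 9}, {8, 4}});
        return (t.at(4).size() == 2 && t.at(9).count(4) && t.at(8).count(4)) ? 0 : 1;
    }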
--- arbor/domain_decomposition.cpp | 36 +-- arbor/execution_context.hpp | 1 - arbor/partition_load_balance.cpp | 19 +- .../test_domain_decomposition.cpp | 16 +- test/unit/test_domain_decomposition.cpp | 293 ++++++++++++------ 5 files changed, 216 insertions(+), 149 deletions(-) diff --git a/arbor/domain_decomposition.cpp b/arbor/domain_decomposition.cpp index aa22082120..f9c14a03d9 100644 --- a/arbor/domain_decomposition.cpp +++ b/arbor/domain_decomposition.cpp @@ -13,11 +13,9 @@ #include "util/span.hpp" namespace arb { -domain_decomposition::domain_decomposition( - const recipe& rec, - context ctx, - const std::vector& groups) -{ +domain_decomposition::domain_decomposition(const recipe& rec, + context ctx, + const std::vector& groups) { struct partition_gid_domain { partition_gid_domain(const gathered_vector& divs, unsigned domains) { auto rank_part = util::partition_view(divs.partition()); @@ -27,9 +25,7 @@ domain_decomposition::domain_decomposition( } } } - int operator()(cell_gid_type gid) const { - return gid_map.at(gid); - } + int operator()(cell_gid_type gid) const { return gid_map.at(gid); } std::unordered_map gid_map; }; @@ -41,22 +37,14 @@ domain_decomposition::domain_decomposition( std::vector local_gids; for (const auto& g: groups) { - if (g.backend == backend_kind::gpu && !has_gpu) { - throw invalid_backend(domain_id); - } - if (g.backend == backend_kind::gpu && g.kind != cell_kind::cable) { - throw incompatible_backend(domain_id, g.kind); - } + if (g.backend == backend_kind::gpu && !has_gpu) throw invalid_backend(domain_id); + if (g.backend == backend_kind::gpu && g.kind != cell_kind::cable) throw incompatible_backend(domain_id, g.kind); std::unordered_set gid_set(g.gids.begin(), g.gids.end()); for (const auto& gid: g.gids) { - if (gid >= num_global_cells) { - throw out_of_bounds(gid, num_global_cells); - } + if (gid >= num_global_cells) throw out_of_bounds(gid, num_global_cells); for (const auto& gj: rec.gap_junctions_on(gid)) { - if (!gid_set.count(gj.peer.gid)) { - throw invalid_gj_cell_group(gid, gj.peer.gid); - } + if (!gid_set.count(gj.peer.gid)) throw invalid_gj_cell_group(gid, gj.peer.gid); } } local_gids.insert(local_gids.end(), g.gids.begin(), g.gids.end()); @@ -64,16 +52,12 @@ domain_decomposition::domain_decomposition( cell_size_type num_local_cells = local_gids.size(); auto global_gids = dist->gather_gids(local_gids); - if (global_gids.size() != num_global_cells) { - throw invalid_sum_local_cells(global_gids.size(), num_global_cells); - } + if (global_gids.size() != num_global_cells) throw invalid_sum_local_cells(global_gids.size(), num_global_cells); auto global_gid_vals = global_gids.values(); util::sort(global_gid_vals); for (unsigned i = 1; i < global_gid_vals.size(); ++i) { - if (global_gid_vals[i] == global_gid_vals[i-1]) { - throw duplicate_gid(global_gid_vals[i]); - } + if (global_gid_vals[i] == global_gid_vals[i-1]) throw duplicate_gid(global_gid_vals[i]); } num_domains_ = num_domains; diff --git a/arbor/execution_context.hpp b/arbor/execution_context.hpp index fb75f60cbd..b14fe01b34 100644 --- a/arbor/execution_context.hpp +++ b/arbor/execution_context.hpp @@ -34,7 +34,6 @@ struct ARB_ARBOR_API execution_context { template execution_context(const proc_allocation& resources, Comm comm, Comm remote); - }; } // namespace arb diff --git a/arbor/partition_load_balance.cpp b/arbor/partition_load_balance.cpp index 3adafc6ee5..1102291dde 100644 --- a/arbor/partition_load_balance.cpp +++ b/arbor/partition_load_balance.cpp @@ -18,6 +18,8 @@ #include 
"util/span.hpp" #include "util/strprintf.hpp" +#include + namespace arb { namespace { @@ -31,22 +33,14 @@ using super_cell = std::vector; // * iff A in table[B], then B in table[A] auto build_global_gj_connection_table(const recipe& rec) { gj_connection_table res; - - // Collect all explicit GJ connections + // Collect all explicit GJ connections and make them bi-directional for (cell_gid_type gid = 0; gid < rec.num_cells(); ++gid) { for (const auto& gj: rec.gap_junctions_on(gid)) { - res[gid].insert(gj.peer.gid); + auto peer = gj.peer.gid; + res[gid].insert(peer); + res[peer].insert(gid); } } - - // Make all gj_connections bidirectional. - for (auto& [gid, local_conns]: res) { - for (auto peer: local_conns) { - auto& peer_conns = res[peer]; - if (!peer_conns.count(gid)) peer_conns.insert(gid); - } - } - return res; } @@ -122,7 +116,6 @@ auto build_components(const gj_connection_table& global_gj_connection_table, res.push_back({gid}); } } - // append super cells to result res.insert(res.end(), super_cells.begin(), super_cells.end()); return res; diff --git a/test/unit-distributed/test_domain_decomposition.cpp b/test/unit-distributed/test_domain_decomposition.cpp index 40e76c6dc8..517497e90b 100644 --- a/test/unit-distributed/test_domain_decomposition.cpp +++ b/test/unit-distributed/test_domain_decomposition.cpp @@ -390,7 +390,7 @@ TEST(domain_decomposition, symmetric_groups) { std::vector recipes = {gj_symmetric(nranks, true), gj_symmetric(nranks, false)}; for (const auto& R: recipes) { const auto D0 = partition_load_balance(R, ctx); - EXPECT_EQ(6u, D0.num_groups()); + // EXPECT_EQ(6u, D0.num_groups()); unsigned shift = rank * R.num_cells()/nranks; std::vector> expected_groups0 = @@ -403,7 +403,7 @@ TEST(domain_decomposition, symmetric_groups) { }; for (unsigned i = 0; i < 6; i++) { - EXPECT_EQ(expected_groups0[i], D0.group(i).gids); + // EXPECT_EQ(expected_groups0[i], D0.group(i).gids); } unsigned cells_per_rank = R.num_cells()/nranks; @@ -417,33 +417,33 @@ TEST(domain_decomposition, symmetric_groups) { hints[cell_kind::cable].prefer_gpu = false; const auto D1 = partition_load_balance(R, ctx, hints); - EXPECT_EQ(1u, D1.num_groups()); + // EXPECT_EQ(1u, D1.num_groups()); std::vector expected_groups1 = {0 + shift, 3 + shift, 4 + shift, 5 + shift, 8 + shift, 1 + shift, 2 + shift, 6 + shift, 7 + shift, 9 + shift}; - EXPECT_EQ(expected_groups1, D1.group(0).gids); + // EXPECT_EQ(expected_groups1, D1.group(0).gids); for (unsigned i = 0; i < R.num_cells(); i++) { - EXPECT_EQ(i/cells_per_rank, (unsigned) D1.gid_domain(i)); + // EXPECT_EQ(i/cells_per_rank, (unsigned) D1.gid_domain(i)); } hints[cell_kind::cable].cpu_group_size = cells_per_rank/2; hints[cell_kind::cable].prefer_gpu = false; const auto D2 = partition_load_balance(R, ctx, hints); - EXPECT_EQ(2u, D2.num_groups()); + // EXPECT_EQ(2u, D2.num_groups()); std::vector> expected_groups2 = {{0 + shift, 3 + shift, 4 + shift, 5 + shift, 8 + shift}, {1 + shift, 2 + shift, 6 + shift, 7 + shift, 9 + shift}}; for (unsigned i = 0; i < 2u; i++) { - EXPECT_EQ(expected_groups2[i], D2.group(i).gids); + // EXPECT_EQ(expected_groups2[i], D2.group(i).gids); } for (unsigned i = 0; i < R.num_cells(); i++) { - EXPECT_EQ(i/cells_per_rank, (unsigned) D2.gid_domain(i)); + // EXPECT_EQ(i/cells_per_rank, (unsigned) D2.gid_domain(i)); } } } diff --git a/test/unit/test_domain_decomposition.cpp b/test/unit/test_domain_decomposition.cpp index fd3d0a555d..606cb87aa6 100644 --- a/test/unit/test_domain_decomposition.cpp +++ b/test/unit/test_domain_decomposition.cpp @@ 
-7,11 +7,11 @@ #include #include #include - #include #include "util/span.hpp" - +#include "../execution_context.hpp" +#include "../distributed_context.hpp" #include "../common_cells.hpp" #include "../simple_recipes.hpp" @@ -30,122 +30,152 @@ using arb::util::make_span; // partition_load_balance into components that can be tested in isolation. namespace { - // Dummy recipes types for testing. +// Dummy recipes types for testing. - struct dummy_cell {}; - using homo_recipe = homogeneous_recipe; +struct dummy_cell {}; +using homo_recipe = homogeneous_recipe; - // Heterogenous cell population of cable and spike source cells. - // Interleaved so that cells with even gid are cable cells, and odd gid are - // spike source cells. - class hetero_recipe: public recipe { - public: - hetero_recipe(cell_size_type s): size_(s) {} +// Heterogenous cell population of cable and spike source cells. +// Interleaved so that cells with even gid are cable cells, and odd gid are +// spike source cells. +class hetero_recipe: public recipe { +public: + hetero_recipe(cell_size_type s): size_(s) {} - cell_size_type num_cells() const override { - return size_; - } + cell_size_type num_cells() const override { + return size_; + } - util::unique_any get_cell_description(cell_gid_type) const override { - return {}; - } + util::unique_any get_cell_description(cell_gid_type) const override { + return {}; + } - cell_kind get_cell_kind(cell_gid_type gid) const override { - return gid%2? - cell_kind::spike_source: - cell_kind::cable; - } + cell_kind get_cell_kind(cell_gid_type gid) const override { + return gid%2? + cell_kind::spike_source: + cell_kind::cable; + } - private: - cell_size_type size_; - }; +private: + cell_size_type size_; +}; - class gap_recipe: public recipe { - public: - gap_recipe(bool full_connected): fully_connected_(full_connected) {} +class gap_recipe: public recipe { +public: + gap_recipe(bool full_connected): fully_connected_(full_connected) {} - cell_size_type num_cells() const override { - return size_; - } + cell_size_type num_cells() const override { + return size_; + } - arb::util::unique_any get_cell_description(cell_gid_type) const override { - auto c = arb::make_cell_soma_only(false); - c.decorations.place(mlocation{0,1}, junction("gj"), "gj"); - return {arb::cable_cell(c)}; - } + arb::util::unique_any get_cell_description(cell_gid_type) const override { + auto c = arb::make_cell_soma_only(false); + c.decorations.place(mlocation{0,1}, junction("gj"), "gj"); + return {arb::cable_cell(c)}; + } - cell_kind get_cell_kind(cell_gid_type gid) const override { - return cell_kind::cable; - } - std::vector gap_junctions_on(cell_gid_type gid) const override { - switch (gid) { - case 0: return {gap_junction_connection({13, "gj"}, {"gj"}, 0.1)}; - case 2: return {gap_junction_connection({7, "gj"}, {"gj"}, 0.1)}; - case 3: return {gap_junction_connection({8, "gj"}, {"gj"}, 0.1)}; - case 4: { - if (!fully_connected_) return {gap_junction_connection({9, "gj"}, {"gj"}, 0.1)}; - return { - gap_junction_connection({8, "gj"}, {"gj"}, 0.1), - gap_junction_connection({9, "gj"}, {"gj"}, 0.1) - }; - } - case 7: { - if (!fully_connected_) return {}; - return { - gap_junction_connection({2, "gj"}, {"gj"}, 0.1), - gap_junction_connection({11, "gj"}, {"gj"}, 0.1) - }; - } - case 8: { - if (!fully_connected_) return {gap_junction_connection({4, "gj"}, {"gj"}, 0.1)}; - return { - gap_junction_connection({3, "gj"}, {"gj"}, 0.1), - gap_junction_connection({4, "gj"}, {"gj"}, 0.1) - }; - } - case 9: { - if (!fully_connected_) 
return {}; - return {gap_junction_connection({4, "gj"}, {"gj"}, 0.1)}; - } - case 11: return {gap_junction_connection({7, "gj"}, {"gj"}, 0.1)}; - case 13: { - if (!fully_connected_) return {}; - return { gap_junction_connection({0, "gj"}, {"gj"}, 0.1)}; - } - default: return {}; + cell_kind get_cell_kind(cell_gid_type gid) const override { + return cell_kind::cable; + } + std::vector gap_junctions_on(cell_gid_type gid) const override { + switch (gid) { + case 0: return {gap_junction_connection({13, "gj"}, {"gj"}, 0.1)}; + case 2: return {gap_junction_connection({7, "gj"}, {"gj"}, 0.1)}; + case 3: return {gap_junction_connection({8, "gj"}, {"gj"}, 0.1)}; + case 4: { + if (!fully_connected_) return {gap_junction_connection({9, "gj"}, {"gj"}, 0.1)}; + return { + gap_junction_connection({8, "gj"}, {"gj"}, 0.1), + gap_junction_connection({9, "gj"}, {"gj"}, 0.1) + }; + } + case 7: { + if (!fully_connected_) return {}; + return { + gap_junction_connection({2, "gj"}, {"gj"}, 0.1), + gap_junction_connection({11, "gj"}, {"gj"}, 0.1) + }; + } + case 8: { + if (!fully_connected_) return {gap_junction_connection({4, "gj"}, {"gj"}, 0.1)}; + return { + gap_junction_connection({3, "gj"}, {"gj"}, 0.1), + gap_junction_connection({4, "gj"}, {"gj"}, 0.1) + }; + } + case 9: { + if (!fully_connected_) return {}; + return {gap_junction_connection({4, "gj"}, {"gj"}, 0.1)}; + } + case 11: return {gap_junction_connection({7, "gj"}, {"gj"}, 0.1)}; + case 13: { + if (!fully_connected_) return {}; + return { gap_junction_connection({0, "gj"}, {"gj"}, 0.1)}; } + default: return {}; } + } - private: - bool fully_connected_ = true; - cell_size_type size_ = 15; - }; +private: + bool fully_connected_ = true; + cell_size_type size_ = 15; +}; - class custom_gap_recipe: public recipe { - public: - custom_gap_recipe(cell_size_type ncells, std::vector> gj_conns): - size_(ncells), gj_conns_(std::move(gj_conns)){} +class custom_gap_recipe: public recipe { +public: + custom_gap_recipe(cell_size_type ncells, std::vector> gj_conns): + size_(ncells), gj_conns_(std::move(gj_conns)){} - cell_size_type num_cells() const override { - return size_; - } + cell_size_type num_cells() const override { + return size_; + } - arb::util::unique_any get_cell_description(cell_gid_type) const override { - auto c = arb::make_cell_soma_only(false); - c.decorations.place(mlocation{0,1}, junction("gj"), "gj"); - return {arb::cable_cell(c)}; - } + arb::util::unique_any get_cell_description(cell_gid_type) const override { + auto c = arb::make_cell_soma_only(false); + c.decorations.place(mlocation{0,1}, junction("gj"), "gj"); + return {arb::cable_cell(c)}; + } + + cell_kind get_cell_kind(cell_gid_type gid) const override { + return cell_kind::cable; + } + std::vector gap_junctions_on(cell_gid_type gid) const override { + return gj_conns_[gid]; + } +private: + cell_size_type size_ = 7; + std::vector> gj_conns_; +}; + +struct unimplemented: std::runtime_error { + unimplemented(const std::string& f): std::runtime_error{f} {} +}; + +struct dummy_context { + dummy_context(int i, int s): size_{s}, id_{i} {} + + int size_ = 1; + int id_ = 0; + + gathered_vector gather_spikes(const std::vector&) const { throw unimplemented{__FUNCTION__}; } + std::vector remote_gather_spikes(const std::vector&) const { throw unimplemented{__FUNCTION__}; } + gathered_vector gather_gids(const std::vector& local_gids) const { throw unimplemented{__FUNCTION__}; } + void remote_ctrl_send_continue(const epoch&) const {} + void remote_ctrl_send_done() const {} + cell_label_range 
gather_cell_label_range(const cell_label_range& local_ranges) const { throw unimplemented{__FUNCTION__}; } + cell_labels_and_gids gather_cell_labels_and_gids(const cell_labels_and_gids& local_labels_and_gids) const { throw unimplemented{__FUNCTION__}; } + template std::vector gather(T value, int) const { throw unimplemented{__FUNCTION__}; } + + int id() const { return id_; } + int size() const { return size_; } + + template T min(T value) const { return value; } + template T max(T value) const { return value; } + template T sum(T value) const { return value; } + void barrier() const {} + std::string name() const { return "dummy"; } +}; - cell_kind get_cell_kind(cell_gid_type gid) const override { - return cell_kind::cable; - } - std::vector gap_junctions_on(cell_gid_type gid) const override { - return gj_conns_[gid]; - } - private: - cell_size_type size_ = 7; - std::vector> gj_conns_; - }; } // test assumes one domain @@ -575,3 +605,64 @@ TEST(domain_decomposition, invalid) { EXPECT_THROW(domain_decomposition(rec, ctx, groups), invalid_gj_cell_group); } } + +struct gj_symmetric: public recipe { + gj_symmetric(unsigned num_ranks, bool fully_connected): + ncopies_(num_ranks), + fully_connected_(fully_connected) {} + + cell_size_type num_cells_per_rank() const { return size_; } + cell_size_type num_cells() const override { return size_*ncopies_; } + arb::util::unique_any get_cell_description(cell_gid_type) const override { return {}; } + cell_kind get_cell_kind(cell_gid_type gid) const override { return cell_kind::cable; } + + std::vector gap_junctions_on(cell_gid_type gid) const override { + unsigned shift = (gid/size_)*size_; + switch (gid % size_) { + case 1 : { + if (!fully_connected_) return {}; + return {gap_junction_connection({7 + shift, "gj"}, {"gj"}, 0.1)}; + } + case 2 : { + if (!fully_connected_) return {}; + return { + gap_junction_connection({6 + shift, "gj"}, {"gj"}, 0.1), + gap_junction_connection({9 + shift, "gj"}, {"gj"}, 0.1) + }; + } + case 6 : return { + gap_junction_connection({2 + shift, "gj"}, {"gj"}, 0.1), + gap_junction_connection({7 + shift, "gj"}, {"gj"}, 0.1) + }; + case 7 : { + if (!fully_connected_) { + return {gap_junction_connection({1 + shift, "gj"}, {"gj"}, 0.1)}; + } + return { + gap_junction_connection({6 + shift, "gj"}, {"gj"}, 0.1), + gap_junction_connection({1 + shift, "gj"}, {"gj"}, 0.1) + }; + } + case 9 : return { gap_junction_connection({2 + shift, "gj"}, {"gj"}, 0.1)}; + default : return {}; + } + } + + cell_size_type size_ = 10; + unsigned ncopies_; + bool fully_connected_; +}; + +TEST(domain_decomposition, symmetric_groups) { + for (int nranks = 1; nranks < 20; ++nranks) { + for (int rank = 0; rank < nranks; ++rank) { + auto ctx = make_context(); + ctx->distributed = std::make_shared(dummy_context{rank, nranks}); + for (const auto& R: {gj_symmetric(nranks, true), gj_symmetric(nranks, false)}) { + try { + const auto D0 = partition_load_balance(R, {ctx}); + } catch (const unimplemented&) {} + } + } + } +} From 29f2f9843b3b0dedc055e905e94bea1b86e4c13d Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Mon, 14 Aug 2023 08:39:31 +0200 Subject: [PATCH 10/13] Reenable tests. 
--- .../test_domain_decomposition.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/unit-distributed/test_domain_decomposition.cpp b/test/unit-distributed/test_domain_decomposition.cpp index 517497e90b..40e76c6dc8 100644 --- a/test/unit-distributed/test_domain_decomposition.cpp +++ b/test/unit-distributed/test_domain_decomposition.cpp @@ -390,7 +390,7 @@ TEST(domain_decomposition, symmetric_groups) { std::vector recipes = {gj_symmetric(nranks, true), gj_symmetric(nranks, false)}; for (const auto& R: recipes) { const auto D0 = partition_load_balance(R, ctx); - // EXPECT_EQ(6u, D0.num_groups()); + EXPECT_EQ(6u, D0.num_groups()); unsigned shift = rank * R.num_cells()/nranks; std::vector> expected_groups0 = @@ -403,7 +403,7 @@ TEST(domain_decomposition, symmetric_groups) { }; for (unsigned i = 0; i < 6; i++) { - // EXPECT_EQ(expected_groups0[i], D0.group(i).gids); + EXPECT_EQ(expected_groups0[i], D0.group(i).gids); } unsigned cells_per_rank = R.num_cells()/nranks; @@ -417,33 +417,33 @@ TEST(domain_decomposition, symmetric_groups) { hints[cell_kind::cable].prefer_gpu = false; const auto D1 = partition_load_balance(R, ctx, hints); - // EXPECT_EQ(1u, D1.num_groups()); + EXPECT_EQ(1u, D1.num_groups()); std::vector expected_groups1 = {0 + shift, 3 + shift, 4 + shift, 5 + shift, 8 + shift, 1 + shift, 2 + shift, 6 + shift, 7 + shift, 9 + shift}; - // EXPECT_EQ(expected_groups1, D1.group(0).gids); + EXPECT_EQ(expected_groups1, D1.group(0).gids); for (unsigned i = 0; i < R.num_cells(); i++) { - // EXPECT_EQ(i/cells_per_rank, (unsigned) D1.gid_domain(i)); + EXPECT_EQ(i/cells_per_rank, (unsigned) D1.gid_domain(i)); } hints[cell_kind::cable].cpu_group_size = cells_per_rank/2; hints[cell_kind::cable].prefer_gpu = false; const auto D2 = partition_load_balance(R, ctx, hints); - // EXPECT_EQ(2u, D2.num_groups()); + EXPECT_EQ(2u, D2.num_groups()); std::vector> expected_groups2 = {{0 + shift, 3 + shift, 4 + shift, 5 + shift, 8 + shift}, {1 + shift, 2 + shift, 6 + shift, 7 + shift, 9 + shift}}; for (unsigned i = 0; i < 2u; i++) { - // EXPECT_EQ(expected_groups2[i], D2.group(i).gids); + EXPECT_EQ(expected_groups2[i], D2.group(i).gids); } for (unsigned i = 0; i < R.num_cells(); i++) { - // EXPECT_EQ(i/cells_per_rank, (unsigned) D2.gid_domain(i)); + EXPECT_EQ(i/cells_per_rank, (unsigned) D2.gid_domain(i)); } } } From c86036f985a5961cba575c3810582a43deda4a33 Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Wed, 16 Aug 2023 14:32:22 +0200 Subject: [PATCH 11/13] Another slight clean-up. --- arbor/include/arbor/common_types.hpp | 6 +++-- arbor/partition_load_balance.cpp | 23 ++++++++++---------- example/busyring/init-only-2048-complex.json | 4 ++-- example/busyring/ring.cpp | 2 +- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/arbor/include/arbor/common_types.hpp b/arbor/include/arbor/common_types.hpp index 34a4b2dddd..296bbe22d8 100644 --- a/arbor/include/arbor/common_types.hpp +++ b/arbor/include/arbor/common_types.hpp @@ -120,10 +120,12 @@ using probe_tag = int; using sample_size_type = std::int32_t; // Enumeration for execution back-end targets, as specified in domain decompositions. - +// NOTE(important): Given in order of priority, ie we will attempt schedule gpu before +// MC groups, for reasons of effiency. Ugly, but as we do not have more +// backends, this is OK for now. enum class backend_kind { + gpu, // Use gpu back-end when supported by cell_group implementation. 
multicore, // Use multicore back-end for all computation. - gpu // Use gpu back-end when supported by cell_group implementation. }; // Enumeration used to indentify the cell type/kind, used by the model to diff --git a/arbor/partition_load_balance.cpp b/arbor/partition_load_balance.cpp index 1102291dde..2979d1000e 100644 --- a/arbor/partition_load_balance.cpp +++ b/arbor/partition_load_balance.cpp @@ -122,15 +122,16 @@ auto build_components(const gj_connection_table& global_gj_connection_table, } // Figure what backend and group size to use -auto get_backend(cell_kind k, const partition_hint_map& hint_map, bool has_gpu) { - const auto& hint = util::value_by_key_or(hint_map, k, {}); +auto get_backend(context ctx, cell_kind kind, const partition_hint_map& hint_map) { + auto has_gpu = ctx->gpu->has_gpu() && cell_kind_supported(kind, backend_kind::gpu, *ctx); + const auto& hint = util::value_by_key_or(hint_map, kind, {}); if (!hint.cpu_group_size) { throw arbor_exception(arb::util::pprintf("unable to perform load balancing because {} has invalid suggested cpu_cell_group size of {}", - k, hint.cpu_group_size)); + kind, hint.cpu_group_size)); } if (hint.prefer_gpu && !hint.gpu_group_size) { throw arbor_exception(arb::util::pprintf("unable to perform load balancing because {} has invalid suggested gpu_cell_group size of {}", - k, hint.gpu_group_size)); + kind, hint.gpu_group_size)); } if (hint.prefer_gpu && has_gpu) return std::make_pair(backend_kind::gpu, hint.gpu_group_size); return std::make_pair(backend_kind::multicore, hint.cpu_group_size); @@ -138,14 +139,13 @@ auto get_backend(cell_kind k, const partition_hint_map& hint_map, bool has_gpu) struct group_parameters { cell_kind kind; - bool has_gpu; backend_kind backend; size_t size; }; -// Create a flat vector of the cell kinds present on this node, partitioned such -// that kinds for which GPU implementation are listed before the others. This is -// a very primitive attempt at scheduling; the cell_groups that run on the GPU +// Create a flat vector of the cell kinds present on this node, sorted such that +// kinds for which GPU implementation are listed before the others. This is a +// very primitive attempt at scheduling; the cell_groups that run on the GPU // will be executed before other cell_groups, which is likely to be more // efficient. 
// @@ -157,11 +157,10 @@ auto build_group_parameters(context ctx, const std::unordered_map>& kind_lists) { std::vector res; for (const auto& [kind, _gids]: kind_lists) { - auto has_gpu = ctx->gpu->has_gpu() && cell_kind_supported(kind, backend_kind::gpu, *ctx); - const auto& [backend, group_size] = get_backend(kind, hint_map, has_gpu); - res.push_back({kind, has_gpu, backend, group_size}); + const auto& [backend, group_size] = get_backend(ctx, kind, hint_map); + res.push_back({kind, backend, group_size}); } - std::partition(res.begin(), res.end(), [](const auto& p) { return p.has_gpu; }); + util::sort_by(res, [](const auto& p) { return p.kind; }); return res; } diff --git a/example/busyring/init-only-2048-complex.json b/example/busyring/init-only-2048-complex.json index 92c633a3f3..63f9020092 100644 --- a/example/busyring/init-only-2048-complex.json +++ b/example/busyring/init-only-2048-complex.json @@ -1,7 +1,7 @@ { "name": "run_n=2045_d=10-complex=true", - "num-cells": 2048, - "synapses": 10, + "num-cells": 128, + "synapses": 20000, "min-delay": 5, "duration": 0.1, "ring-size": 4, diff --git a/example/busyring/ring.cpp b/example/busyring/ring.cpp index ad5b055655..7c36525523 100644 --- a/example/busyring/ring.cpp +++ b/example/busyring/ring.cpp @@ -186,7 +186,7 @@ int main(int argc, char** argv) { auto params = read_options(argc, argv); arb::proc_allocation resources; - resources.num_threads = arbenv::default_concurrency(); + resources.num_threads = 1; arbenv::default_concurrency(); resources.bind_threads = params.bind_threads; #ifdef ARB_MPI_ENABLED From f4ea31e4d680e31accb94f631161294b17c4e906 Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Thu, 17 Aug 2023 15:10:43 +0200 Subject: [PATCH 12/13] Simplify test. --- arbor/partition_load_balance.cpp | 18 ++++++++---------- test/unit/test_domain_decomposition.cpp | 9 +++++---- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/arbor/partition_load_balance.cpp b/arbor/partition_load_balance.cpp index 2979d1000e..93c5c10a79 100644 --- a/arbor/partition_load_balance.cpp +++ b/arbor/partition_load_balance.cpp @@ -1,18 +1,16 @@ -#include #include #include #include +#include #include #include #include #include -#include #include #include "cell_group_factory.hpp" #include "execution_context.hpp" -#include "gpu_context.hpp" #include "util/maputil.hpp" #include "util/partition.hpp" #include "util/span.hpp" @@ -84,7 +82,7 @@ auto build_components(const gj_connection_table& global_gj_connection_table, // track visited cells (cells that already belong to a group) gj_connection_set visited; // Connected components via BFS - std::queue q; + std::vector q; for (auto gid: util::make_span(local_gid_range)) { if (global_gj_connection_table.count(gid)) { // If cell hasn't been visited yet, must belong to new component @@ -92,16 +90,16 @@ auto build_components(const gj_connection_table& global_gj_connection_table, // pivot gid: the smallest found in this group; must be at // smaller or equal to `gid`. 
auto min_gid = gid; - q.push(gid); + q.push_back(gid); super_cell sc; while (!q.empty()) { - auto element = q.front(); - q.pop(); + auto element = q.back(); + q.pop_back(); sc.push_back(element); min_gid = std::min(element, min_gid); // queue up conjoined cells for (const auto& peer: global_gj_connection_table.at(element)) { - if (visited.insert(peer).second) q.push(peer); + if (visited.insert(peer).second) q.push_back(peer); } } // if the pivot gid belongs to our domain, this group will be part @@ -117,7 +115,8 @@ auto build_components(const gj_connection_table& global_gj_connection_table, } } // append super cells to result - res.insert(res.end(), super_cells.begin(), super_cells.end()); + res.reserve(res.size() + super_cells.size()); + std::move(super_cells.begin(), super_cells.end(), std::back_inserter(res)); return res; } @@ -218,4 +217,3 @@ ARB_ARBOR_API domain_decomposition partition_load_balance(const recipe& rec, return {rec, ctx, groups}; } } // namespace arb - diff --git a/test/unit/test_domain_decomposition.cpp b/test/unit/test_domain_decomposition.cpp index 606cb87aa6..7d6665a9de 100644 --- a/test/unit/test_domain_decomposition.cpp +++ b/test/unit/test_domain_decomposition.cpp @@ -654,14 +654,15 @@ struct gj_symmetric: public recipe { }; TEST(domain_decomposition, symmetric_groups) { + auto ctx = make_context(); for (int nranks = 1; nranks < 20; ++nranks) { for (int rank = 0; rank < nranks; ++rank) { - auto ctx = make_context(); ctx->distributed = std::make_shared(dummy_context{rank, nranks}); for (const auto& R: {gj_symmetric(nranks, true), gj_symmetric(nranks, false)}) { - try { - const auto D0 = partition_load_balance(R, {ctx}); - } catch (const unimplemented&) {} + // NOTE: This is a bit silly, but allows us to test _most_ of + // the invariants without proper MPI support. If we could get `gather_gids` to + // work and return the expected values we could even test all of them. + EXPECT_THROW(partition_load_balance(R, {ctx}), unimplemented); } } } From 89aa98596fa98eaf9395581894868d340fb4be02 Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Tue, 5 Sep 2023 12:42:24 +0200 Subject: [PATCH 13/13] Remove remains of testing. 
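Revert the temporary knobs that were only meant for local testing: restore the
original busyring parameters (2048 cells, 10 synapses), drop the hard-coded
single-thread override in ring.cpp, and remove a stray leftover include from
partition_load_balance.cpp.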
--- arbor/partition_load_balance.cpp | 2 -- example/busyring/init-only-2048-complex.json | 4 ++-- example/busyring/ring.cpp | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arbor/partition_load_balance.cpp b/arbor/partition_load_balance.cpp index 93c5c10a79..550c2144b9 100644 --- a/arbor/partition_load_balance.cpp +++ b/arbor/partition_load_balance.cpp @@ -16,8 +16,6 @@ #include "util/span.hpp" #include "util/strprintf.hpp" -#include - namespace arb { namespace { diff --git a/example/busyring/init-only-2048-complex.json b/example/busyring/init-only-2048-complex.json index 63f9020092..92c633a3f3 100644 --- a/example/busyring/init-only-2048-complex.json +++ b/example/busyring/init-only-2048-complex.json @@ -1,7 +1,7 @@ { "name": "run_n=2045_d=10-complex=true", - "num-cells": 128, - "synapses": 20000, + "num-cells": 2048, + "synapses": 10, "min-delay": 5, "duration": 0.1, "ring-size": 4, diff --git a/example/busyring/ring.cpp b/example/busyring/ring.cpp index 7c36525523..ad5b055655 100644 --- a/example/busyring/ring.cpp +++ b/example/busyring/ring.cpp @@ -186,7 +186,7 @@ int main(int argc, char** argv) { auto params = read_options(argc, argv); arb::proc_allocation resources; - resources.num_threads = 1; arbenv::default_concurrency(); + resources.num_threads = arbenv::default_concurrency(); resources.bind_threads = params.bind_threads; #ifdef ARB_MPI_ENABLED
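A minimal standalone sketch of the ownership rule that build_components (PATCH 12)
relies on, written against plain standard-library containers rather than Arbor's
types; gid_type, gj_table, components_owned_by and the example gids/domain ranges
below are illustrative assumptions, not Arbor API. Each super cell is discovered by
a traversal of the global gap-junction table (the patch uses a plain vector as the
work list), but a domain keeps the group only if it owns the smallest (pivot) gid in
the component, so each group is emitted by exactly one rank.

    #include <algorithm>
    #include <iostream>
    #include <unordered_map>
    #include <unordered_set>
    #include <vector>

    using gid_type = unsigned;
    using gj_table = std::unordered_map<gid_type, std::unordered_set<gid_type>>;

    // Return the super cells owned by the domain holding gids [beg, end):
    // a component is kept only if its smallest (pivot) gid lies in that range.
    std::vector<std::vector<gid_type>> components_owned_by(const gj_table& table,
                                                           gid_type beg, gid_type end) {
        std::vector<std::vector<gid_type>> owned;
        std::unordered_set<gid_type> visited;
        std::vector<gid_type> work;                       // plain vector as work list
        for (gid_type gid = beg; gid < end; ++gid) {
            if (!table.count(gid) || visited.count(gid)) continue;
            gid_type min_gid = gid;
            std::vector<gid_type> component;
            work.push_back(gid);
            visited.insert(gid);
            while (!work.empty()) {
                auto el = work.back();
                work.pop_back();
                component.push_back(el);
                min_gid = std::min(min_gid, el);
                // queue up conjoined cells
                for (auto peer: table.at(el)) {
                    if (visited.insert(peer).second) work.push_back(peer);
                }
            }
            // keep the group only if the pivot gid is local to this domain
            if (min_gid >= beg) owned.push_back(std::move(component));
        }
        return owned;
    }

    int main() {
        gj_table t;
        auto link = [&](gid_type a, gid_type b) { t[a].insert(b); t[b].insert(a); };
        link(1, 7); link(2, 6); link(2, 9); link(6, 7);   // the gj_symmetric pattern
        link(4, 12);                                      // a component spanning two domains
        // Two domains of ten cells each: [0,10) and [10,20).
        std::cout << components_owned_by(t, 0, 10).size()  << "\n";  // 2: {1,2,6,7,9} and {4,12}
        std::cout << components_owned_by(t, 10, 20).size() << "\n";  // 0: pivot of {4,12} is 4, owned by domain 0
    }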