diff --git a/c_code/comms_i.c b/c_code/comms_i.c
index ed04a8c..ab2cb5a 100644
--- a/c_code/comms_i.c
+++ b/c_code/comms_i.c
@@ -78,7 +78,7 @@ void i_processQueue (uint unused0, uint unused1)
 
     uint pkt_type = key & SPINN_TYPE_MASK;
 
-    // check if data packet,
+    // process data packet,
     if (pkt_type == SPINN_DATA_KEY)
     {
       // check packet phase and process accordingly
@@ -96,22 +96,20 @@ void i_processQueue (uint unused0, uint unused1)
       }
     }
 
-    // check if stop packet,
+    // or process stop packet,
     else if (pkt_type == SPINN_STOP_KEY)
     {
-      // stop packet received
       i_stop_packet (key);
     }
 
-    // check if network stop packet,
+    // or process network stop packet,
     else if (pkt_type == SPINN_STPN_KEY)
     {
-      // network stop packet received
       i_net_stop_packet (key);
     }
 
 #ifdef DEBUG
-    // report unknown packet type,
+    // or report unknown packet type,
     else
     {
       stage_done (SPINN_UNXPD_PKT, key);
diff --git a/c_code/comms_s.c b/c_code/comms_s.c
index a927679..7874d00 100644
--- a/c_code/comms_s.c
+++ b/c_code/comms_s.c
@@ -16,7 +16,7 @@
 // ------------------------------------------------------------------------
 // ------------------------------------------------------------------------
 // enqueue received packet
-// (FORWARD, BACKPROP, ldsa, ldst, stop and net_stop types)
+// (FORWARD, BACKPROP, lds, stop and net_stop types)
 // ------------------------------------------------------------------------
 void s_receivePacket (uint key, uint payload)
 {
@@ -77,7 +77,7 @@ void s_processQueue (uint unused0, uint unused1)
 
     uint pkt_type = key & SPINN_TYPE_MASK;
 
-    // check if data packet,
+    // process data packet,
     if (pkt_type == SPINN_DATA_KEY)
     {
       // check packet phase and process accordingly
@@ -95,36 +95,26 @@ void s_processQueue (uint unused0, uint unused1)
       }
     }
 
-    // check for an LDS "accumulation" packet,
+    // or process LDS packet,
     else if (pkt_type == SPINN_LDSA_KEY)
     {
-      // process LDS "accumulation" packet
-      s_ldsa_packet (payload);
+      s_lds_packet (payload);
     }
 
-    // check for LDS "total" packet,
-    else if (pkt_type == SPINN_LDST_KEY)
-    {
-      // process LDS "total" packet
-      s_ldst_packet (payload);
-    }
-
-    // check if stop packet,
+    // or process stop packet,
     else if (pkt_type == SPINN_STOP_KEY)
     {
-      // stop packet received
       s_stop_packet (key);
     }
 
-    // check if network stop packet,
+    // or process network stop packet,
     else if (pkt_type == SPINN_STPN_KEY)
     {
-      // network stop packet received
       s_net_stop_packet (key);
     }
 
 #ifdef DEBUG
-    // report unknown packet type,
+    // or report unknown packet type,
     else
     {
       stage_done (SPINN_UNXPD_PKT, key);
@@ -233,34 +223,34 @@ void s_net_stop_packet (uint key)
 
 
 // ------------------------------------------------------------------------
-// process LDSA packet: accumulate the received partial link delta sums
+// process LDS packet: accumulate the received partial link delta sums
 // ------------------------------------------------------------------------
-void s_ldsa_packet (uint payload)
+void s_lds_packet (uint payload)
 {
 #ifdef DEBUG
-  lda_recv++;
+  lds_recv++;
 #endif
 
   // add the received value to the total so far,
   s_lds_part += (lds_t) payload;
 
   // increment the count of partial link delta sums arrived,
-  s_ldsa_arrived++;
+  s_lds_arrived++;
 
   // check whether all the partial sums have arrived
-  if (s_ldsa_arrived == scfg.ldsa_expected)
+  if (s_lds_arrived == scfg.lds_expected)
   {
-    // send the result to the first s core
-    // to give a total across the whole network
-    if (scfg.is_first_group == 0)
-    {
-      while (!spin1_send_mc_packet (ldstKey, s_lds_part, WITH_PAYLOAD));
+    // broadcast (first subgroup) or relay (all others) lds value
+    while (!spin1_send_mc_packet (ldsKey, s_lds_part, WITH_PAYLOAD));
 
 #ifdef DEBUG
-      pkt_sent++;
-      ldt_sent++;
+    pkt_sent++;
+    lds_sent++;
 #endif
-    }
+
+    // prepare for next epoch
+    s_lds_part = 0;
+    s_lds_arrived = 0;
 
     // access thread semaphore with interrupts disabled
     uint cpsr = spin1_int_disable ();
@@ -293,62 +283,3 @@ void s_ldsa_packet (uint payload)
   }
 }
 // ------------------------------------------------------------------------
-
-
-// ------------------------------------------------------------------------
-// process LDST packet: accumulate the received link delta sum totals
-// ------------------------------------------------------------------------
-void s_ldst_packet (uint payload)
-{
-#ifdef DEBUG
-  ldt_recv++;
-#endif
-
-  // add the received value to the total so far,
-  s_lds_part += (lds_t) payload;
-
-  // increment the count of link delta sums arrived,
-  s_ldst_arrived++;
-
-  // check whether all the partial sums have arrived
-  if (s_ldst_arrived == scfg.ldst_expected)
-  {
-    // send the final value of s_lds_part back to the w cores
-    while (!spin1_send_mc_packet (ldsrKey, s_lds_part, WITH_PAYLOAD));
-
-#ifdef DEBUG
-    pkt_sent++;
-    ldr_sent++;
-#endif
-
-    // access thread semaphore with interrupts disabled
-    uint cpsr = spin1_int_disable ();
-
-#if defined(DEBUG) && defined(DEBUG_THRDS)
-    if (!(sb_thrds_pend & SPINN_THRD_LDST))
-      wrng_cth++;
-#endif
-
-    // check if all other threads done
-    if (sb_thrds_pend == SPINN_THRD_LDST)
-    {
-      // if done initialise semaphore
-      sb_thrds_pend = SPINN_SB_THRDS;
-
-      // restore interrupts after semaphore access,
-      spin1_mode_restore (cpsr);
-
-      // and advance tick
-      sb_advance_tick ();
-    }
-    else
-    {
-      // if not done report processing thread done,
-      sb_thrds_pend &= ~SPINN_THRD_LDST;
-
-      // and restore interrupts after semaphore access
-      spin1_mode_restore (cpsr);
-    }
-  }
-}
-// ------------------------------------------------------------------------
diff --git a/c_code/comms_s.h b/c_code/comms_s.h
index be02cc7..6360d1f 100644
--- a/c_code/comms_s.h
+++ b/c_code/comms_s.h
@@ -7,7 +7,6 @@ void s_processQueue  (uint unused0, uint unused1);
 void s_stop_packet     (uint key);
 void s_net_stop_packet (uint key);
 
-void s_ldsa_packet     (uint payload);
-void s_ldst_packet     (uint payload);
+void s_lds_packet     (uint payload);
 
 #endif
diff --git a/c_code/comms_t.c b/c_code/comms_t.c
index b97359e..7f9d4e8 100644
--- a/c_code/comms_t.c
+++ b/c_code/comms_t.c
@@ -121,19 +121,19 @@ void t_processFWDQueue (uint unused0, uint unused1)
       tf_process (key, payload);
     }
 
-    // process criterion packet,
+    // or process criterion packet,
     else if (pkt_type == SPINN_CRIT_KEY)
     {
       t_criterion_packet (key);
     }
 
-    // process tick stop packet,
+    // or process tick stop packet,
     else if (pkt_type == SPINN_STOP_KEY)
     {
       t_stop_packet (key);
     }
 
-    // process network stop packet,
+    // or process network stop packet,
     else if (pkt_type == SPINN_STPN_KEY)
     {
       t_net_stop_packet (key);
@@ -170,38 +170,48 @@ void t_criterion_packet (uint key)
 #endif
 
   // partial criterion value arrived,
-  tf_crit_prev = key & SPINN_STPD_MASK;
+  tf_crit_prev = tf_crit_prev && (key & SPINN_STPD_MASK);
 
-  // access flag with interrupts disabled,
-  uint cpsr = spin1_int_disable ();
+  // update scoreboard,
+  tf_crit_arrived++;
 
-  // and check if updated criterion value can be forwarded
-  if (tf_crit_rdy)
+  // and check if all criterion packets arrived
+  if (tf_crit_arrived == tcfg.crit_expected)
   {
-    // initialise flag,
-    tf_crit_rdy = tf_init_crit;
+    // initialise scoreboard for next tick,
+    tf_crit_arrived = 0;
 
-    // restore interrupts after flag access,
-    spin1_mode_restore (cpsr);
+    // access flag with interrupts disabled,
+    uint cpsr = spin1_int_disable ();
 
-    // send stop packet,
-    tf_send_stop ();
-
-    // and advance tick if last_output_group
-    //NOTE: last output group does not get a tick stop packet
-    // so it's ready to advance tick
-    if (tcfg.is_last_output_group)
+    // and check if updated criterion value can be forwarded
+    if (tf_crit_rdy)
     {
-      tf_advance_tick ();
+      // initialise flag,
+      tf_crit_rdy = 0;
+
+      // restore interrupts after flag access,
+      spin1_mode_restore (cpsr);
+
+      // send stop packet,
+      tf_send_stop ();
+
+      // and advance tick if last output (tcfg.is_last_output)
+      //NOTE: last output group does not get a tick stop packet
+      // so it's ready to advance tick
+      if (tcfg.is_last_output)
+      {
+        tf_advance_tick ();
+      }
     }
-  }
-  else
-  {
-    // flag ready to forward criterion,
-    tf_crit_rdy = 1;
+    else
+    {
+      // flag ready to forward criterion,
+      tf_crit_rdy = 1;
 
-    // and restore interrupts after flag access
-    spin1_mode_restore (cpsr);
+      // and restore interrupts after flag access
+      spin1_mode_restore (cpsr);
+    }
   }
 }
 // ------------------------------------------------------------------------
@@ -311,7 +321,7 @@ void t_backprop_packet (uint key, uint payload)
   // store received error,
   t_errors[tb_comms][inx] = (error_t) payload;
 
-  // and update scoreboard,
+  // update scoreboard,
   tb_arrived++;
 
   // if all expected errors have arrived may move to next tick
@@ -362,7 +372,12 @@ void tf_send_stop (void)
   // "aggregate" criteria,
   tf_stop_crit = tf_stop_crit && tf_crit_prev;
 
-  if (tcfg.is_last_output_group)
+  // initialise previous value,
+  //TODO: should this be done in critical section?
+  tf_crit_prev = TRUE;
+
+  // make stop decision,
+  if (tcfg.is_last_output)
   {
     tf_group_crit = tf_stop_crit;
 
@@ -386,7 +401,7 @@ void tf_send_stop (void)
 
 #ifdef DEBUG
   pkt_sent++;
-  if (tcfg.is_last_output_group)
+  if (tcfg.is_last_output)
   {
     stp_sent++;
   }
diff --git a/c_code/comms_w.c b/c_code/comms_w.c
index 96b4168..4fe1f0d 100644
--- a/c_code/comms_w.c
+++ b/c_code/comms_w.c
@@ -17,7 +17,7 @@
 // ------------------------------------------------------------------------
 // ------------------------------------------------------------------------
 // initial handling of received packets
-// (FORWARD, BACKPROP, ldsr, stop, net_stop and sync types)
+// (FORWARD, BACKPROP, lds, stop, net_stop and sync types)
 // ------------------------------------------------------------------------
 void w_receivePacket (uint key, uint payload)
 {
@@ -142,10 +142,10 @@ void w_processBKPQueue (uint unused0, uint unused1)
       wb_process (key, payload);
     }
 
-    // process LDS result packet,
-    else if (pkt_type == SPINN_LDSR_KEY)
+    // or process LDS result packet,
+    else if (pkt_type == SPINN_LDSA_KEY)
     {
-      w_ldsr_packet (payload);
+      w_lds_packet (payload);
     }
 
 #ifdef DEBUG
@@ -178,13 +178,6 @@ void w_forward_packet (uint key, uint payload)
   recv_fwd++;
   if (phase == SPINN_BACKPROP)
     wrng_fph++;
-
-  uint blk = (key & SPINN_BLOCK_MASK) >> SPINN_BLOCK_SHIFT;
-  if (blk != wcfg.row_blk)
-  {
-    pkt_fwbk++;
-    return;
-  }
 #endif
 
   // get output index: mask out phase, core and block data,
@@ -361,10 +354,10 @@ void w_sync_packet (void)
 // ------------------------------------------------------------------------
 // process an LDS result packet
 // ------------------------------------------------------------------------
-void w_ldsr_packet (uint payload)
+void w_lds_packet (uint payload)
 {
 #ifdef DEBUG
-  ldr_recv++;
+  lds_recv++;
 #endif
 
   // the final link delta sum for the epoch arrived
@@ -374,12 +367,12 @@ void w_ldsr_packet (uint payload)
   uint cpsr = spin1_int_disable ();
 
 #if defined(DEBUG) && defined(DEBUG_THRDS)
-  if (!(wb_thrds_pend & SPINN_THRD_LDSR))
+  if (!(wb_thrds_pend & SPINN_THRD_LDSA))
     wrng_cth++;
 #endif
 
   // check if all other threads done
-  if (wb_thrds_pend == SPINN_THRD_LDSR)
+  if (wb_thrds_pend == SPINN_THRD_LDSA)
   {
     // initialise semaphore (no link delta summation in next tick),
     wb_thrds_pend = SPINN_WB_THRDS;
@@ -393,7 +386,7 @@ void w_ldsr_packet (uint payload)
   else
   {
     // if not done report processing thread done,
-    wb_thrds_pend &= ~SPINN_THRD_LDSR;
+    wb_thrds_pend &= ~SPINN_THRD_LDSA;
 
     // and restore interrupts after semaphore access
     spin1_mode_restore (cpsr);
diff --git a/c_code/comms_w.h b/c_code/comms_w.h
index cb41af1..c410cba 100644
--- a/c_code/comms_w.h
+++ b/c_code/comms_w.h
@@ -10,7 +10,7 @@ void w_stop_packet     (uint key);
 void w_net_stop_packet (uint key);
 void w_sync_packet     (void);
 
-void w_ldsr_packet (uint payload);
+void w_lds_packet (uint payload);
 
 void store_output    (uint index);
 void restore_outputs (uint tick);
diff --git a/c_code/init_i.c b/c_code/init_i.c
index 2afea6b..9b0a69c 100644
--- a/c_code/init_i.c
+++ b/c_code/init_i.c
@@ -26,6 +26,11 @@ uint cfg_init (void)
   io_printf (IO_BUF, "input\n");
 #endif
 
+#ifdef PROFILE
+  // configure timer 2 for profiling
+  tc[T2_CONTROL] = SPINN_PROFILER_CFG;
+#endif
+
   // read the data specification header
   data_specification_metadata_t * data =
           data_specification_get_data_address();
@@ -148,14 +153,6 @@ uint mem_init (void)
     return (SPINN_MEM_UNAVAIL);
   }
 
-  // allocate memory for BACKPROP keys (one per partition)
-  if ((i_bkpKey = ((uint *)
-         spin1_malloc (icfg.partitions * sizeof (uint)))) == NULL
-     )
-  {
-    return (SPINN_MEM_UNAVAIL);
-  }
-
   // allocate memory for INPUT functions
   for (uint i = 0; i < icfg.num_in_procs; i++)
   {
@@ -271,13 +268,9 @@ void var_init (uint reset_examples)
   i_pkt_queue.tail = 0;
 
   // initialise packet keys
-  //NOTE: colour is initialised to 0.
+  //NOTE: colour is implicitly initialised to 0
   fwdKey = rt[FWD] | SPINN_PHASE_KEY(SPINN_FORWARD);
-
-  for (uint p = 0; p < icfg.partitions; p++)
-  {
-    i_bkpKey[p] = rt[BKPI + p] | SPINN_PHASE_KEY (SPINN_BACKPROP);
-  }
+  bkpKey = rt[BKP] | SPINN_PHASE_KEY (SPINN_BACKPROP);
 
   // if the INPUT INTEGRATOR is used
   // reset the memory of the INTEGRATOR state variables
@@ -316,6 +309,17 @@ void var_init (uint reset_examples)
   tot_tick = 0;  // total number of ticks executed
   // ------------------------------------------------------------------------
 #endif
+
+#ifdef PROFILE
+// ------------------------------------------------------------------------
+// PROFILER variables
+// ------------------------------------------------------------------------
+prf_fwd_min = SPINN_PROFILER_START;  // minimum FORWARD processing time
+prf_fwd_max = 0;                     // maximum FORWARD processing time
+prf_bkp_min = SPINN_PROFILER_START;  // minimum BACKPROP processing time
+prf_bkp_max = 0;                     // maximum BACKPROP processing time
+// ------------------------------------------------------------------------
+#endif
 }
 // ------------------------------------------------------------------------
 
@@ -430,6 +434,17 @@ void stage_done (uint ec, uint key)
   if (wrng_sth) io_printf (IO_BUF, "wrong sth:%d\n", wrng_sth);
 #endif
 
+#ifdef PROFILE
+  // report PROFILER values
+  io_printf (IO_BUF, "min fwd proc:%u\n", prf_fwd_min);
+  io_printf (IO_BUF, "max fwd proc:%u\n", prf_fwd_max);
+  if (xcfg.training)
+  {
+    io_printf (IO_BUF, "min bkp proc:%u\n", prf_bkp_min);
+    io_printf (IO_BUF, "max bkp proc:%u\n", prf_bkp_max);
+  }
+#endif
+
 #ifdef DEBUG
   // close log,
   io_printf (IO_BUF, "stopping stage %u\n", xcfg.stage_id);
diff --git a/c_code/init_s.c b/c_code/init_s.c
index 7ac9678..19f2ed5 100644
--- a/c_code/init_s.c
+++ b/c_code/init_s.c
@@ -25,6 +25,11 @@ uint cfg_init (void)
   io_printf (IO_BUF, "sum\n");
 #endif
 
+#ifdef PROFILE
+  // configure timer 2 for profiling
+  tc[T2_CONTROL] = SPINN_PROFILER_CFG;
+#endif
+
   // read the data specification header
   data_specification_metadata_t * data =
           data_specification_get_data_address();
@@ -89,8 +94,7 @@ uint cfg_init (void)
   io_printf (IO_BUF, "nu: %d\n", scfg.num_units);
   io_printf (IO_BUF, "fe: %d\n", scfg.fwd_expected);
   io_printf (IO_BUF, "be: %d\n", scfg.bkp_expected);
-  io_printf (IO_BUF, "ae: %d\n", scfg.ldsa_expected);
-  io_printf (IO_BUF, "te: %d\n", scfg.ldst_expected);
+  io_printf (IO_BUF, "le: %d\n", scfg.lds_expected);
   io_printf (IO_BUF, "uf: %d\n", xcfg.update_function);
   io_printf (IO_BUF, "fg: %d\n", scfg.is_first_group);
   io_printf (IO_BUF, "fk: 0x%08x\n", rt[FWD]);
@@ -229,8 +233,6 @@ void var_init (uint reset_examples)
   }
   sf_done = 0;
   sb_done = 0;
-  s_ldsa_arrived = 0;
-  s_ldst_arrived = 0;
 
   // initialise thread semaphores
   sf_thrds_pend = SPINN_SF_THRDS;
@@ -239,20 +241,20 @@ void var_init (uint reset_examples)
   // initialise processing thread flag
   s_active = FALSE;
 
-  // initialise partial lds
+  // initialise lds
   s_lds_part = 0;
+  s_lds_arrived = 0;
 
   // initialise packet queue
   s_pkt_queue.head = 0;
   s_pkt_queue.tail = 0;
 
   // initialise packet keys
-  //NOTE: colour is initialised to 0.
-  fwdKey  = rt[FWD] | SPINN_PHASE_KEY (SPINN_FORWARD);
-  bkpKey  = rt[BKP] | SPINN_PHASE_KEY (SPINN_BACKPROP);
-  ldstKey = rt[LDS] | SPINN_LDST_KEY | SPINN_PHASE_KEY (SPINN_BACKPROP);
-  ldsrKey = rt[LDS] | SPINN_LDSR_KEY | SPINN_PHASE_KEY (SPINN_BACKPROP);
-  fdsKey  = rt[FDS] | SPINN_SYNC_KEY | SPINN_PHASE_KEY (SPINN_FORWARD);
+  //NOTE: colour is implicitly initialised to 0
+  fwdKey = rt[FWD] | SPINN_PHASE_KEY (SPINN_FORWARD);
+  bkpKey = rt[BKP] | SPINN_PHASE_KEY (SPINN_BACKPROP);
+  ldsKey = rt[LDS] | SPINN_LDSA_KEY | SPINN_PHASE_KEY (SPINN_BACKPROP);
+  fdsKey = rt[FDS] | SPINN_SYNC_KEY | SPINN_PHASE_KEY (SPINN_FORWARD);
 
 #ifdef DEBUG
   // ------------------------------------------------------------------------
@@ -268,10 +270,8 @@ void var_init (uint reset_examples)
   stp_sent = 0;  // stop packets sent
   stp_recv = 0;  // stop packets received
   stn_recv = 0;  // network_stop packets received
-  lda_recv = 0;  // partial link_delta packets received
-  ldt_sent = 0;  // total link_delta packets sent
-  ldt_recv = 0;  // total link_delta packets received
-  ldr_sent = 0;  // link_delta packets sent
+  lds_recv = 0;  // link_delta packets received
+  lds_sent = 0;  // link_delta packets sent
   wrng_phs = 0;  // packets received in wrong phase
   wrng_pth = 0;  // unexpected processing thread
   wrng_cth = 0;  // unexpected comms thread
@@ -279,6 +279,17 @@ void var_init (uint reset_examples)
   tot_tick = 0;  // total number of ticks executed
   // ------------------------------------------------------------------------
 #endif
+
+#ifdef PROFILE
+// ------------------------------------------------------------------------
+// PROFILER variables
+// ------------------------------------------------------------------------
+prf_fwd_min = SPINN_PROFILER_START;  // minimum FORWARD processing time
+prf_fwd_max = 0;                     // maximum FORWARD processing time
+prf_bkp_min = SPINN_PROFILER_START;  // minimum BACKPROP processing time
+prf_bkp_max = 0;                     // maximum BACKPROP processing time
+// ------------------------------------------------------------------------
+#endif
 }
 // ------------------------------------------------------------------------
 
@@ -392,16 +403,8 @@ void stage_done (uint ec, uint key)
   io_printf (IO_BUF, "total sent:%d\n", pkt_sent);
   io_printf (IO_BUF, "recv: fwd:%d bkp:%d\n", recv_fwd, recv_bkp);
   io_printf (IO_BUF, "sent: fwd:%d bkp:%d\n", sent_fwd, sent_bkp);
-  io_printf (IO_BUF, "ldsa recv:%d\n", lda_recv);
-  if (scfg.is_first_group)
-  {
-    io_printf (IO_BUF, "ldst recv:%d\n", ldt_recv);
-    io_printf (IO_BUF, "ldsr sent:%d\n", ldr_sent);
-  }
-  else
-  {
-    io_printf (IO_BUF, "ldst sent:%d\n", ldt_sent);
-  }
+  io_printf (IO_BUF, "lds sent:%d\n", lds_sent);
+  io_printf (IO_BUF, "lds recv:%d\n", lds_recv);
   io_printf (IO_BUF, "stop recv:%d\n", stp_recv);
   io_printf (IO_BUF, "stpn recv:%d\n", stn_recv);
   io_printf (IO_BUF, "sync sent:%d\n", spk_sent);
@@ -411,6 +414,17 @@ void stage_done (uint ec, uint key)
   if (wrng_sth) io_printf (IO_BUF, "wrong sth:%d\n", wrng_sth);
 #endif
 
+#ifdef PROFILE
+  // report PROFILER values
+  io_printf (IO_BUF, "min fwd proc:%u\n", prf_fwd_min);
+  io_printf (IO_BUF, "max fwd proc:%u\n", prf_fwd_max);
+  if (xcfg.training)
+  {
+    io_printf (IO_BUF, "min bkp proc:%u\n", prf_bkp_min);
+    io_printf (IO_BUF, "max bkp proc:%u\n", prf_bkp_max);
+  }
+#endif
+
 #ifdef DEBUG
   // close log,
   io_printf (IO_BUF, "stopping stage %u\n", xcfg.stage_id);
diff --git a/c_code/init_t.c b/c_code/init_t.c
index 64720f0..a9ce178 100644
--- a/c_code/init_t.c
+++ b/c_code/init_t.c
@@ -27,6 +27,11 @@ uint cfg_init (void)
   io_printf (IO_BUF, "threshold\n");
 #endif
 
+#ifdef PROFILE
+  // configure timer 2 for profiling
+  tc[T2_CONTROL] = SPINN_PROFILER_CFG;
+#endif
+
   // read the data specification header
   data_specification_metadata_t * data =
           data_specification_get_data_address();
@@ -150,8 +155,8 @@ uint cfg_init (void)
 #ifdef DEBUG_CFG
   io_printf (IO_BUF, "og: %d\n", tcfg.output_grp);
   io_printf (IO_BUF, "ig: %d\n", tcfg.input_grp);
+  io_printf (IO_BUF, "ls: %d\n", tcfg.is_last_sgrp);
   io_printf (IO_BUF, "nu: %d\n", tcfg.num_units);
-  io_printf (IO_BUF, "wb: %d\n", tcfg.write_blk);
   io_printf (IO_BUF, "ie: %d\n", tcfg.out_integr_en);
   io_printf (IO_BUF, "dt: %f\n", tcfg.out_integr_dt);
   io_printf (IO_BUF, "np: %d\n", tcfg.num_out_procs);
@@ -165,9 +170,10 @@ uint cfg_init (void)
         tcfg.initOutput, SPINN_ACTIV_SHIFT));
   io_printf (IO_BUF, "gs: %k\n", tcfg.tst_group_criterion);
   io_printf (IO_BUF, "gt: %k\n", tcfg.trn_group_criterion);
+  io_printf (IO_BUF, "ce: %d\n", tcfg.crit_expected);
   io_printf (IO_BUF, "cf: %d\n", tcfg.criterion_function);
-  io_printf (IO_BUF, "fg: %d\n", tcfg.is_first_output_group);
-  io_printf (IO_BUF, "lg: %d\n", tcfg.is_last_output_group);
+  io_printf (IO_BUF, "fo: %d\n", tcfg.is_first_output);
+  io_printf (IO_BUF, "lo: %d\n", tcfg.is_last_output);
   io_printf (IO_BUF, "ef: %d\n", tcfg.error_function);
   io_printf (IO_BUF, "fk: 0x%08x\n", rt[FWD]);
   io_printf (IO_BUF, "bk: 0x%08x\n", rt[BKP]);
@@ -239,14 +245,6 @@ uint mem_init (void)
     return (SPINN_MEM_UNAVAIL);
   }
 
-  // allocate memory for forward keys (one per partition)
-  if ((t_fwdKey = ((uint *)
-         spin1_malloc (tcfg.partitions * sizeof (uint)))) == NULL
-     )
-  {
-    return (SPINN_MEM_UNAVAIL);
-  }
-
   // allocate memory for OUTPUT functions
   for (uint i = 0; i < tcfg.num_out_procs; i++)
   {
@@ -488,7 +486,7 @@ void var_init (uint reset_examples, uint reset_epochs_trained)
   net_stop = 0;
 
   // initialise max and min ticks
-  if (tcfg.is_last_output_group)
+  if (tcfg.is_last_output)
   {
     // get max number of ticks for first event
     if (ev[event_idx].max_time != SPINN_FP_NaN)
@@ -535,19 +533,20 @@ void var_init (uint reset_examples, uint reset_epochs_trained)
   tb_procs = 0;
   tb_comms = 1;
 
-  // initialise received net and error scoreboards
+  // initialise received net, error and criterion scoreboards
   tf_arrived = 0;
   tb_arrived = 0;
+  tf_crit_arrived = 0;
 
   // initialise thread semaphores
   tf_thrds_pend = SPINN_TF_THRDS;
   tb_thrds_pend = SPINN_TB_THRDS;
 
   // initialise recording options
-  t_rec_results = xcfg.rec_results && tcfg.is_last_output_group &&
+  t_rec_results = xcfg.rec_results && tcfg.is_last_output &&
       !xcfg.training && (stage_rec_flags & (1 << SPINN_REC_RESULTS));
 
-  t_rec_tick_data = xcfg.rec_outputs && tcfg.is_first_output_group &&
+  t_rec_tick_data = xcfg.rec_outputs && tcfg.is_first_output &&
       (stage_rec_flags & (1 << SPINN_REC_TICK_DATA));
 
   t_rec_outputs = xcfg.rec_outputs && tcfg.output_grp &&
@@ -580,20 +579,22 @@ void var_init (uint reset_examples, uint reset_epochs_trained)
                - SPINN_SHORT_ACTIV_SHIFT);
     t_max_target = SPINN_SHORT_ACTIV_MIN_POS << (SPINN_ACTIV_SHIFT
                - SPINN_SHORT_ACTIV_SHIFT);
+  }
 
-    // no need to wait for previous value if first group
-    if (tcfg.is_first_output_group)
-    {
-      tf_init_crit = 1;
-      tf_crit_prev = TRUE;
-    }
-    else
-    {
-      tf_init_crit = 0;
-    }
-    tf_crit_rdy = tf_init_crit;
+  // check if expecting a previous criterion value
+  if (tcfg.crit_expected)
+  {
+    tf_crit_init = 0;
+  }
+  else
+  {
+    tf_crit_init = 1;
   }
 
+  // initialise flag and previous value
+  tf_crit_rdy = tf_crit_init;
+  tf_crit_prev = TRUE;
+
   // initialise processing thread flag
   tf_active = FALSE;
 
@@ -602,15 +603,11 @@ void var_init (uint reset_examples, uint reset_epochs_trained)
   t_pkt_queue.tail = 0;
 
   // initialise packet keys
-  //NOTE: colour is initialised to 0
-  for (uint p = 0; p < tcfg.partitions; p++)
-  {
-    t_fwdKey[p] = rt[FWDT + p] | SPINN_PHASE_KEY (SPINN_FORWARD);
-  }
-
+  //NOTE: colour is implicitly initialised to 0
+  fwdKey = rt[FWD] | SPINN_PHASE_KEY (SPINN_FORWARD);
   bkpKey = rt[BKP] | SPINN_PHASE_KEY (SPINN_BACKPROP);
 
-  if (tcfg.is_last_output_group)
+  if (tcfg.is_last_output)
   {
     // tick stop key
     tf_stop_key = rt[STP] | SPINN_STOP_KEY | SPINN_PHASE_KEY (SPINN_FORWARD);
@@ -642,14 +639,29 @@ void var_init (uint reset_examples, uint reset_epochs_trained)
   stn_recv = 0;  // network_stop packets received
   wrng_phs = 0;  // packets received in wrong phase
   tot_tick = 0;  // total number of ticks executed
+  // ------------------------------------------------------------------------
 #endif
 
 #if defined(DEBUG) && defined(DEBUG_THRDS)
+  // ------------------------------------------------------------------------
+  // THREAD DEBUG variables
+  // ------------------------------------------------------------------------
   wrng_pth = 0;  // unexpected processing thread
   wrng_cth = 0;  // unexpected comms thread
   wrng_sth = 0;  // unexpected stop thread
-#endif
   // ------------------------------------------------------------------------
+#endif
+
+#ifdef PROFILE
+// ------------------------------------------------------------------------
+// PROFILER variables
+// ------------------------------------------------------------------------
+prf_fwd_min = SPINN_PROFILER_START;  // minimum FORWARD processing time
+prf_fwd_max = 0;                     // maximum FORWARD processing time
+prf_bkp_min = SPINN_PROFILER_START;  // minimum BACKPROP processing time
+prf_bkp_max = 0;                     // maximum BACKPROP processing time
+// ------------------------------------------------------------------------
+#endif
 }
 // ------------------------------------------------------------------------
 
@@ -777,22 +789,18 @@ void stage_done (uint ec, uint key)
   io_printf (IO_BUF, "total sent:%d\n", pkt_sent);
   io_printf (IO_BUF, "recv: fwd:%d bkp:%d\n", recv_fwd, recv_bkp);
   io_printf (IO_BUF, "sent: fwd:%d bkp:%d\n", sent_fwd, sent_bkp);
-  if (tcfg.is_first_output_group)
+  io_printf (IO_BUF, "crit sent:%d\n", crt_sent);
+  if (tcfg.is_last_sgrp)
   {
-    io_printf (IO_BUF, "criterion recv: first\n");
+    io_printf (IO_BUF, "crit recv:%d\n", crt_recv);
   }
-  else
-  {
-  io_printf (IO_BUF, "criterion recv:%d\n", crt_recv);
-  }
-  if (tcfg.is_last_output_group)
+  if (tcfg.is_last_output)
   {
     io_printf (IO_BUF, "stop sent:%d\n", stp_sent);
     io_printf (IO_BUF, "stpn sent:%d\n", stn_sent);
   }
   else
   {
-    io_printf (IO_BUF, "criterion sent:%d\n", crt_sent);
     io_printf (IO_BUF, "stop recv:%d\n", stp_recv);
     io_printf (IO_BUF, "stpn recv:%d\n", stn_recv);
   }
@@ -805,6 +813,17 @@ void stage_done (uint ec, uint key)
   if (wrng_sth) io_printf (IO_BUF, "wrong sth:%d\n", wrng_sth);
 #endif
 
+#ifdef PROFILE
+  // report PROFILER values
+  io_printf (IO_BUF, "min fwd proc:%u\n", prf_fwd_min);
+  io_printf (IO_BUF, "max fwd proc:%u\n", prf_fwd_max);
+  if (xcfg.training)
+  {
+    io_printf (IO_BUF, "min bkp proc:%u\n", prf_bkp_min);
+    io_printf (IO_BUF, "max bkp proc:%u\n", prf_bkp_max);
+  }
+#endif
+
 #ifdef DEBUG
   // close log,
   io_printf (IO_BUF, "stopping stage %u\n", xcfg.stage_id);
diff --git a/c_code/init_w.c b/c_code/init_w.c
index c30f44c..c32638b 100644
--- a/c_code/init_w.c
+++ b/c_code/init_w.c
@@ -26,6 +26,11 @@ uint cfg_init (void)
   io_printf (IO_BUF, "weight\n");
 #endif
 
+#ifdef PROFILE
+  // configure timer 2 for profiling
+  tc[T2_CONTROL] = SPINN_PROFILER_CFG;
+#endif
+
   // read the data specification header
   data_specification_metadata_t * data =
           data_specification_get_data_address();
@@ -93,8 +98,6 @@ uint cfg_init (void)
 #ifdef DEBUG_CFG
   io_printf (IO_BUF, "nr: %d\n", wcfg.num_rows);
   io_printf (IO_BUF, "nc: %d\n", wcfg.num_cols);
-  io_printf (IO_BUF, "rb: %d\n", wcfg.row_blk);
-  io_printf (IO_BUF, "cb: %d\n", wcfg.col_blk);
   io_printf (IO_BUF, "lr: %k\n", wcfg.learningRate);
   io_printf (IO_BUF, "wd: %k\n", wcfg.weightDecay);
   io_printf (IO_BUF, "mm: %k\n", wcfg.momentum);
@@ -324,12 +327,10 @@ void var_init (uint init_weights, uint reset_examples)
   wb_update_func = w_update_procs[xcfg.update_function];
 
   // initialise packet keys
-  //NOTE: colour is initialised to 0.
-  fwdKey = rt[FWD] | SPINN_PHASE_KEY(SPINN_FORWARD)
-      | SPINN_BLOCK_KEY(wcfg.col_blk);
-  bkpKey = rt[BKP] | SPINN_PHASE_KEY(SPINN_BACKPROP)
-      | SPINN_BLOCK_KEY(wcfg.row_blk);
-  ldsaKey = rt[LDS] | SPINN_LDSA_KEY | SPINN_PHASE_KEY(SPINN_BACKPROP);
+  //NOTE: colour is implicitly initialised to 0
+  fwdKey = rt[FWD] | SPINN_PHASE_KEY(SPINN_FORWARD);
+  bkpKey = rt[BKP] | SPINN_PHASE_KEY(SPINN_BACKPROP);
+  ldsKey = rt[LDS] | SPINN_LDSA_KEY | SPINN_PHASE_KEY(SPINN_BACKPROP);
 
 #ifdef DEBUG
   // ------------------------------------------------------------------------
@@ -347,8 +348,8 @@ void var_init (uint init_weights, uint reset_examples)
   stp_sent = 0;  // stop packets sent
   stp_recv = 0;  // stop packets received
   stn_recv = 0;  // network_stop packets received
-  lda_sent = 0;  // partial link_delta packets sent
-  ldr_recv = 0;  // link_delta packets received
+  lds_sent = 0;  // link_delta packets sent
+  lds_recv = 0;  // link_delta packets received
   wrng_fph = 0;  // FORWARD packets received in wrong phase
   wrng_bph = 0;  // BACKPROP received in wrong phase
   wght_ups = 0;  // number of weight updates done
@@ -358,6 +359,17 @@ void var_init (uint init_weights, uint reset_examples)
   tot_tick = 0;  // total number of ticks executed
   // ------------------------------------------------------------------------
 #endif
+
+#ifdef PROFILE
+// ------------------------------------------------------------------------
+// PROFILER variables
+// ------------------------------------------------------------------------
+prf_fwd_min = SPINN_PROFILER_START;  // minimum FORWARD processing time
+prf_fwd_max = 0;                     // maximum FORWARD processing time
+prf_bkp_min = SPINN_PROFILER_START;  // minimum BACKPROP processing time
+prf_bkp_max = 0;                     // maximum BACKPROP processing time
+// ------------------------------------------------------------------------
+#endif
 }
 // ------------------------------------------------------------------------
 
@@ -472,8 +484,8 @@ void stage_done (uint ec, uint key)
   io_printf (IO_BUF, "recv: fwd:%d bkp:%d\n", recv_fwd, recv_bkp);
   io_printf (IO_BUF, "sent: fwd:%d bkp:%d\n", sent_fwd, sent_bkp);
   io_printf (IO_BUF, "unused recv: fwd:%d bkp:%d\n", pkt_fwbk, pkt_bwbk);
-  io_printf (IO_BUF, "ldsa sent:%d\n", lda_sent);
-  io_printf (IO_BUF, "ldsr recv:%d\n", ldr_recv);
+  io_printf (IO_BUF, "lds sent:%d\n", lds_sent);
+  io_printf (IO_BUF, "lds recv:%d\n", lds_recv);
   io_printf (IO_BUF, "stop recv:%d\n", stp_recv);
   io_printf (IO_BUF, "stpn recv:%d\n", stn_recv);
   io_printf (IO_BUF, "sync recv:%d\n", spk_recv);
@@ -486,6 +498,17 @@ void stage_done (uint ec, uint key)
   io_printf (IO_BUF, "weight updates:%d\n", wght_ups);
 #endif
 
+#ifdef PROFILE
+  // report PROFILER values
+  io_printf (IO_BUF, "min fwd proc:%u\n", prf_fwd_min);
+  io_printf (IO_BUF, "max fwd proc:%u\n", prf_fwd_max);
+  if (xcfg.training)
+  {
+    io_printf (IO_BUF, "min bkp proc:%u\n", prf_bkp_min);
+    io_printf (IO_BUF, "max bkp proc:%u\n", prf_bkp_max);
+  }
+#endif
+
 #ifdef DEBUG
   // close log,
   io_printf (IO_BUF, "stopping stage %u\n", xcfg.stage_id);
diff --git a/c_code/input.c b/c_code/input.c
index 5b01d6f..b821ec6 100644
--- a/c_code/input.c
+++ b/c_code/input.c
@@ -116,7 +116,7 @@ long_delta_t   * i_deltas;          // deltas computed in current tick
 pkt_queue_t      i_pkt_queue;       // queue to hold received packets
 uchar            i_active;          // processing packets from queue?
 
-long_net_t     * i_last_integr_net; //last INTEGRATOR output value
+long_net_t     * i_last_integr_net;   //last INTEGRATOR output value
 long_delta_t   * i_last_integr_delta; //last INTEGRATOR delta value
 
 uint             i_it_idx;          // index into current inputs/targets
@@ -131,10 +131,8 @@ uint             if_thrds_pend;     // thread semaphore
 long_delta_t   * ib_init_delta;     // initial delta value for every tick
 scoreboard_t     ib_done;           // current tick delta computation done
 
-uint           * i_bkpKey;          // i cores have one bkpKey per partition
-
 // history arrays
-long_net_t     * i_net_history;   //sdram pointer where to store input history
+long_net_t     * i_net_history;     //sdram pointer where to store input history
 // ------------------------------------------------------------------------
 
 
@@ -160,6 +158,18 @@ uint tot_tick;  // total number of ticks executed
 #endif
 
 
+#ifdef PROFILE
+// ------------------------------------------------------------------------
+// PROFILER variables
+// ------------------------------------------------------------------------
+uint prf_fwd_min;  // minimum FORWARD processing time
+uint prf_fwd_max;  // maximum FORWARD processing time
+uint prf_bkp_min;  // minimum BACKPROP processing time
+uint prf_bkp_max;  // maximum BACKPROP processing time
+// ------------------------------------------------------------------------
+#endif
+
+
 // ------------------------------------------------------------------------
 // timer callback: check that there has been progress in execution.
 // If no progress has been made terminate with SPINN_TIMEOUT_EXIT code.
diff --git a/c_code/mlp_externs.h b/c_code/mlp_externs.h
index e0edeb8..c58ff76 100644
--- a/c_code/mlp_externs.h
+++ b/c_code/mlp_externs.h
@@ -12,9 +12,7 @@ extern uint coreID;               // 5-bit virtual core ID
 
 extern uint fwdKey;               // packet ID for FORWARD-phase data
 extern uint bkpKey;               // packet ID for BACKPROP-phase data
-extern uint ldsaKey;              // packet ID for link delta summation accumulators
-extern uint ldstKey;              // packet ID for link delta summation totals
-extern uint ldsrKey;              // packet ID for link delta summation reports
+extern uint ldsKey;               // packet ID for link delta summation
 extern uint fdsKey;               // packet ID for FORWARD synchronisation
 
 extern uint32_t stage_step;       // current stage step
@@ -92,19 +90,18 @@ extern activation_t     * w_output_history;
 // ------------------------------------------------------------------------
 // sum core variables
 // ------------------------------------------------------------------------
-extern long_net_t     * s_nets[2];     // unit nets computed in current tick
-extern long_error_t   * s_errors[2];   // errors computed in current tick
-extern pkt_queue_t      s_pkt_queue;   // queue to hold received packets
-extern uchar            s_active;      // processing packets from queue?
-extern lds_t            s_lds_part;    // partial link delta sum
-extern scoreboard_t   * sf_arrived[2]; // keep count of expected net b-d-p
-extern scoreboard_t     sf_done;       // current tick net computation done
-extern uint             sf_thrds_pend; // thread semaphore
-extern scoreboard_t   * sb_arrived[2]; // keep count of expected error b-d-p
-extern scoreboard_t     sb_done;       // current tick error computation done
-extern uint             sb_thrds_pend; // thread semaphore
-extern scoreboard_t     s_ldsa_arrived; // keep count of the number of partial link delta sums
-extern scoreboard_t     s_ldst_arrived; // keep count of the number of link delta sum totals
+extern long_net_t     * s_nets[2];      // unit nets computed in current tick
+extern long_error_t   * s_errors[2];    // errors computed in current tick
+extern pkt_queue_t      s_pkt_queue;    // queue to hold received packets
+extern uchar            s_active;       // processing packets from queue?
+extern lds_t            s_lds_part;     // partial link delta sum
+extern scoreboard_t   * sf_arrived[2];  // keep count of expected net b-d-p
+extern scoreboard_t     sf_done;        // current tick net computation done
+extern uint             sf_thrds_pend;  // thread semaphore
+extern scoreboard_t   * sb_arrived[2];  // keep count of expected error b-d-p
+extern scoreboard_t     sb_done;        // current tick error computation done
+extern uint             sb_thrds_pend;  // thread semaphore
+extern scoreboard_t     s_lds_arrived;  // keep count of received link delta sums
 // ------------------------------------------------------------------------
 
 // ------------------------------------------------------------------------
@@ -129,10 +126,8 @@ extern scoreboard_t     ib_done;       // current tick delta computation done
 extern long_net_t     * i_last_integr_net;   //last INTEGRATOR output value
 extern long_delta_t   * i_last_integr_delta; //last INTEGRATOR delta value
 
-extern uint           * i_bkpKey;      // i cores have one bkpKey per partition
-
 // history arrays
-extern long_net_t      * i_net_history; //sdram pointer where to store input history
+extern long_net_t     * i_net_history; //sdram pointer where to store input history
 // ------------------------------------------------------------------------
 
 // ------------------------------------------------------------------------
@@ -160,7 +155,8 @@ extern uchar            tf_active;     // processing FWD-phase packet queue?
 extern scoreboard_t     tf_arrived;    // keep count of expected nets
 extern uint             tf_thrds_pend; // thread semaphore
 extern uchar            tf_crit_prev;  // criterion value received
-extern uchar            tf_init_crit;  // criterion init value
+extern scoreboard_t     tf_crit_arrived;  // keep count of expected crit pkts
+extern uchar            tf_crit_init;  // criterion init value
 extern uchar            tf_crit_rdy;   // criterion can be forwarded
 extern uchar            tf_stop_crit;  // stop criterion met?
 extern uchar            tf_group_crit;     // stop criterion met for all groups?
@@ -188,18 +184,16 @@ extern uchar            t_rec_results;    // record test results to SDRAM
 extern uchar            t_rec_tick_data;  // record tick data to SDRAM
 extern uchar            t_rec_step_updt;  // update recording step
 
-extern uint           * t_fwdKey;      // t cores have one fwdKey per partition
-
 // history arrays
 extern net_t          * t_net_history;
 extern activation_t   * t_output_history;
 extern long_deriv_t   * t_output_deriv_history;
 // ------------------------------------------------------------------------
 
+#ifdef DEBUG
 // ------------------------------------------------------------------------
 // DEBUG variables
 // ------------------------------------------------------------------------
-#ifdef DEBUG
 extern uint pkt_sent;  // total packets sent
 extern uint sent_fwd;  // packets sent in FORWARD phase
 extern uint sent_bkp;  // packets sent in BACKPROP phase
@@ -216,12 +210,8 @@ extern uint stp_sent;  // stop packets sent
 extern uint stp_recv;  // stop packets received
 extern uint stn_sent;  // network_stop packets sent
 extern uint stn_recv;  // network_stop packets received
-extern uint lda_sent;  // partial link_delta packets sent
-extern uint lda_recv;  // partial link_delta packets received
-extern uint ldt_sent;  // total link_delta packets sent
-extern uint ldt_recv;  // total link_delta packets received
-extern uint ldr_sent;  // link_delta packets sent
-extern uint ldr_recv;  // link_delta packets received
+extern uint lds_sent;  // link_delta packets sent
+extern uint lds_recv;  // link_delta packets received
 extern uint tot_tick;  // total number of ticks executed
 extern uint wght_ups;  // number of weight updates done
 extern uint wrng_phs;  // packets received in wrong phase
@@ -230,7 +220,20 @@ extern uint wrng_bph;  // BACKPROP packets received in wrong phase
 extern uint wrng_pth;  // unexpected processing thread
 extern uint wrng_cth;  // unexpected comms thread
 extern uint wrng_sth;  // unexpected stop thread
+// ------------------------------------------------------------------------
 #endif
+
+
+#ifdef PROFILE
+// ------------------------------------------------------------------------
+// PROFILER variables
+// ------------------------------------------------------------------------
+extern uint prf_fwd_min;  // minimum FORWARD processing time
+extern uint prf_fwd_max;  // maximum FORWARD processing time
+extern uint prf_bkp_min;  // minimum BACKPROP processing time
+extern uint prf_bkp_max;  // maximum BACKPROP processing time
 // ------------------------------------------------------------------------
+#endif
+
 
 #endif
diff --git a/c_code/mlp_params.h b/c_code/mlp_params.h
index b1be49a..c08b327 100644
--- a/c_code/mlp_params.h
+++ b/c_code/mlp_params.h
@@ -12,6 +12,16 @@
 #define SPINN_TIMER_TICK_PERIOD  1000000
 #define SPINN_PRINT_SHIFT        16
 
+
+// ------------------------------------------------------------------------
+// profiler constants
+// ------------------------------------------------------------------------
+// timer2 control value for profiling: enabled, free-running, interrupt
+// disabled, no pre-scale, 32-bit, one-shot (halts at zero, does not wrap)
+#define SPINN_PROFILER_CFG       0x83
+#define SPINN_PROFILER_START     0xffffffff
+
+
 // ------------------------------------------------------------------------
 // neural net constants
 // ------------------------------------------------------------------------
@@ -89,9 +99,7 @@
 // packet type keys
 #define SPINN_DATA_KEY       0x00000000
 #define SPINN_SYNC_KEY       0x00001000
-#define SPINN_LDST_KEY       0x00002000
 #define SPINN_LDSA_KEY       0x00003000
-#define SPINN_LDSR_KEY       0x00004000
 #define SPINN_CRIT_KEY       0x00005000
 #define SPINN_STPN_KEY       0x00006000
 #define SPINN_STOP_KEY       0x00007000
@@ -125,17 +133,6 @@
 // ------------------------------------------------------------------------
 
 
-// ------------------------------------------------------------------------
-// core function types
-// ------------------------------------------------------------------------
-#define SPINN_WEIGHT_PROC    0x0
-#define SPINN_SUM_PROC       0x1
-#define SPINN_THRESHOLD_PROC 0x2
-#define SPINN_INPUT_PROC     0x3
-#define SPINN_UNUSED_PROC    0x4
-// ------------------------------------------------------------------------
-
-
 // ------------------------------------------------------------------------
 // implementation parameters
 // ------------------------------------------------------------------------
@@ -154,8 +151,6 @@
 #define SPINN_THRD_COMS      ((SPINN_THRD_PROC) << 1)
 #define SPINN_THRD_STOP      ((SPINN_THRD_COMS) << 1)
 #define SPINN_THRD_LDSA      ((SPINN_THRD_STOP) << 1)
-#define SPINN_THRD_LDST      ((SPINN_THRD_LDSA) << 1)
-#define SPINN_THRD_LDSR      (SPINN_THRD_LDSA)
 
 #define SPINN_WF_THRDS       (SPINN_THRD_PROC | SPINN_THRD_COMS | SPINN_THRD_STOP)
 #define SPINN_WB_THRDS       (SPINN_THRD_PROC)
@@ -207,12 +202,12 @@
 // ------------------------------------------------------------------------
 // EXIT codes -- error
 // ------------------------------------------------------------------------
-#define SPINN_NO_ERROR         0
-#define SPINN_MEM_UNAVAIL      1
-#define SPINN_QUEUE_FULL       2
-#define SPINN_TIMEOUT_EXIT     3
-#define SPINN_UNXPD_PKT        4
-#define SPINN_CFG_UNAVAIL      5
+#define SPINN_NO_ERROR       0
+#define SPINN_MEM_UNAVAIL    1
+#define SPINN_QUEUE_FULL     2
+#define SPINN_TIMEOUT_EXIT   3
+#define SPINN_UNXPD_PKT      4
+#define SPINN_CFG_UNAVAIL    5
 // ------------------------------------------------------------------------
 
 #endif
diff --git a/c_code/mlp_types.h b/c_code/mlp_types.h
index 97fa723..37355c0 100644
--- a/c_code/mlp_types.h
+++ b/c_code/mlp_types.h
@@ -24,16 +24,12 @@ enum MLPRecordings {
   TICK_DATA    = 2
 };
 
-// t cores can have more than one FWD key (due to partitions)
-// i cores can have more than one BKP key (due to partitions)
 enum MLPKeys {
   FWD  = 0,
   BKP  = 1,
   FDS  = 2,
   STP  = 3,
-  LDS  = 4,
-  FWDT = 5,
-  BKPI = 5
+  LDS  = 4
 };
 
 
@@ -223,7 +219,6 @@ typedef struct network_conf     // MLP network configuration
   uchar net_type;               // type of neural net
   uint  ticks_per_int;          // number of ticks per interval
   uint  global_max_ticks;       // max number of ticks across all the examples
-  uint  num_write_blks;         // number of groups that write outputs
 } network_conf_t;
 // ------------------------------------------------------------------------
 
@@ -232,8 +227,8 @@ typedef struct network_conf     // MLP network configuration
 // weight core configuration
 // ------------------------------------------------------------------------
 // The neural net is represented by a weight matrix.
-// The matrix is divided into num_rblks x num_cblk weight blocks
-// and every weight core computes for one of these blocks.
+// The matrix is divided into a number of weight blocks and each
+// weight core gets assigned one of these blocks for computation.
 // Each block is associated with a single projection, i.e., it contains
 // connection weights associated with a single origin group and a single
 // destination group (which can be the same in recurrent networks).
@@ -245,8 +240,6 @@ typedef struct w_conf             // weight core configuration
 {
   uint         num_rows;          // rows in this core's block
   uint         num_cols;          // columns in this core's block
-  uint         row_blk;           // this core's row block number
-  uint         col_blk;           // this core's column block number
   scoreboard_t sync_expected;     // num of expected sync packets
   activation_t initOutput;        // initial value for unit outputs
   short_fpreal learningRate;      // network learning rate
@@ -267,9 +260,9 @@ typedef struct s_conf               // sum core configuration
   uint         num_units;           // this core's number of units
   scoreboard_t fwd_expected;        // num of expected partial nets
   scoreboard_t bkp_expected;        // num of expected partial errors
-  scoreboard_t ldsa_expected;       // num of expected partial link delta sums
-  scoreboard_t ldst_expected;       // num of expected link delta sum totals
+  scoreboard_t lds_expected;        // num of expected partial link delta sums
   uchar        is_first_group;      // is this the first group in the network?
+  uchar        is_tree_root;        // is this the root of an s_core tree?
 } s_conf_t;
 // ------------------------------------------------------------------------
 
@@ -285,7 +278,6 @@ typedef struct i_conf                // input core configuration
   uchar         output_grp;          // is this an OUTPUT group?
   uchar         input_grp;           // is this an INPUT group?
   uint          num_units;           // this core's number of units
-  uint          partitions;          // this groups's number of partitions
   uint          num_in_procs;        // number of input (net) comp procedures
   uint          procs_list[SPINN_NUM_IN_PROCS];
   uchar         in_integr_en;        // input INTEGRATOR in use
@@ -310,9 +302,8 @@ typedef struct t_conf                  // threshold core configuration
 {
   uchar         output_grp;            // is this an OUTPUT group?
   uchar         input_grp;             // is this an INPUT group?
+  uchar         is_last_sgrp;          // is last subgroup of the group?
   uint          num_units;             // this core's number of units
-  uint          partitions;            // this group's number of partitions
-  uint          write_blk;             // this core's write block
   uchar         hard_clamp_en;         // HARD CLAMP in use
   uchar         out_integr_en;         // output INTEGRATOR in use
   fpreal        out_integr_dt;         // integration time const for input integr
@@ -322,9 +313,10 @@ typedef struct t_conf                  // threshold core configuration
   activation_t  initOutput;            // initial value for unit outputs
   error_t       tst_group_criterion;   // test-mode convergence criterion value
   error_t       trn_group_criterion;   // train-mode convergence criterion value
+  uint          crit_expected;         // num of expected partial crit pkts
   uchar         criterion_function;    // function to eval convergence criterion
-  uchar         is_first_output_group; // is this the first of the output groups
-  uchar         is_last_output_group;  // is this the last of the output groups
+  uchar         is_first_output;       // is this the first output subgroup
+  uchar         is_last_output;        // is this the last output subgroup
   uchar         error_function;        // error function used for BACKPROP
 } t_conf_t;
 // ------------------------------------------------------------------------
diff --git a/c_code/process_i.c b/c_code/process_i.c
index 7a7ccbb..40a1535 100644
--- a/c_code/process_i.c
+++ b/c_code/process_i.c
@@ -27,6 +27,11 @@ void if_process (uint key, uint payload)
     wrng_phs++;
 #endif
 
+#ifdef PROFILE
+  // start profiler,
+  tc[T2_LOAD] = SPINN_PROFILER_START;
+#endif
+
   // get net index: mask out block, phase and colour data,
   uint inx = key & SPINN_NET_MASK;
 
@@ -64,6 +69,13 @@ void if_process (uint key, uint payload)
   // mark net as done,
   if_done++;
 
+#ifdef PROFILE
+  // update profiler values,
+  uint cnt = SPINN_PROFILER_START - tc[T2_COUNT];
+  if (cnt < prf_fwd_min) prf_fwd_min = cnt;
+  if (cnt > prf_fwd_max) prf_fwd_max = cnt;
+#endif
+
   // and check if all nets done
   if (if_done == icfg.num_units)
   {
@@ -114,6 +126,11 @@ void ib_process (uint key, uint payload)
     wrng_phs++;
 #endif
 
+#ifdef PROFILE
+  // start profiler,
+  tc[T2_LOAD] = SPINN_PROFILER_START;
+#endif
+
   // get delta index: mask out block, phase and colour data,
   uint inx = key & SPINN_DELTA_MASK;
 
@@ -145,13 +162,20 @@ void ib_process (uint key, uint payload)
   }
 
   // incorporate delta index to the packet key and send,
-  while (!spin1_send_mc_packet ((i_bkpKey[inx >> SPINN_BLOCK_SHIFT] | inx), delta, WITH_PAYLOAD));
+  while (!spin1_send_mc_packet ((bkpKey | inx), delta, WITH_PAYLOAD));
 
 #ifdef DEBUG
   pkt_sent++;
   sent_bkp++;
 #endif
 
+#ifdef PROFILE
+  // update profiler values,
+  uint cnt = SPINN_PROFILER_START - tc[T2_COUNT];
+  if (cnt < prf_bkp_min) prf_bkp_min = cnt;
+  if (cnt > prf_bkp_max) prf_bkp_max = cnt;
+#endif
+
   // mark delta as done,
   ib_done++;
 
diff --git a/c_code/process_s.c b/c_code/process_s.c
index 6454c45..5255018 100644
--- a/c_code/process_s.c
+++ b/c_code/process_s.c
@@ -27,11 +27,17 @@ void sf_process (uint key, uint payload)
     wrng_phs++;
 #endif
 
+#ifdef PROFILE
+  // start profiler,
+  tc[T2_LOAD] = SPINN_PROFILER_START;
+#endif
+
   // get net index: mask out block and phase data,
   uint inx = key & SPINN_NET_MASK;
 
   // get error colour: mask out block, phase and net index data,
-  uint clr = (key & SPINN_COLOUR_MASK) >> SPINN_COLOUR_SHIFT;
+  uint pkt_clr = key & SPINN_COLOUR_MASK;
+  uint clr = pkt_clr >> SPINN_COLOUR_SHIFT;
 
   // accumulate new net b-d-p,
   s_nets[clr][inx] += (long_net_t) ((net_t) payload);
@@ -39,6 +45,13 @@ void sf_process (uint key, uint payload)
   // mark net b-d-p as arrived,
   sf_arrived[clr][inx]++;
 
+#ifdef PROFILE
+  // update profiler values,
+  uint cnt = SPINN_PROFILER_START - tc[T2_COUNT];
+  if (cnt < prf_fwd_min) prf_fwd_min = cnt;
+  if (cnt > prf_fwd_max) prf_fwd_max = cnt;
+#endif
+
   // and check if dot product complete to compute net
   if (sf_arrived[clr][inx] == scfg.fwd_expected)
   {
@@ -58,8 +71,9 @@ void sf_process (uint key, uint payload)
       net_tmp = (net_t) s_nets[clr][inx];
     }
 
-    // incorporate net index to the packet key and send,
-    while (!spin1_send_mc_packet ((fwdKey | inx), net_tmp, WITH_PAYLOAD));
+    // incorporate colour and net index to the packet key and send,
+    while (!spin1_send_mc_packet ((fwdKey | pkt_clr | inx),
+        net_tmp, WITH_PAYLOAD));
 
 #ifdef DEBUG
     pkt_sent++;
@@ -124,11 +138,17 @@ void sb_process (uint key, uint payload)
     wrng_phs++;
 #endif
 
+#ifdef PROFILE
+  // start profiler,
+  tc[T2_LOAD] = SPINN_PROFILER_START;
+#endif
+
   // get error index: mask out block, phase and colour data,
   uint inx = key & SPINN_ERROR_MASK;
 
   // get error colour: mask out block, phase and net index data,
-  uint clr = (key & SPINN_COLOUR_MASK) >> SPINN_COLOUR_SHIFT;
+  uint pkt_clr = key & SPINN_COLOUR_MASK;
+  uint clr = pkt_clr >> SPINN_COLOUR_SHIFT;
 
   // accumulate new error b-d-p,
   s_errors[clr][inx] += (error_t) payload;
@@ -136,6 +156,13 @@ void sb_process (uint key, uint payload)
   // mark error b-d-p as arrived,
   sb_arrived[clr][inx]++;
 
+#ifdef PROFILE
+  // update profiler values,
+  uint cnt = SPINN_PROFILER_START - tc[T2_COUNT];
+  if (cnt < prf_bkp_min) prf_bkp_min = cnt;
+  if (cnt > prf_bkp_max) prf_bkp_max = cnt;
+#endif
+
   // and check if error complete to send to next stage
   if (sb_arrived[clr][inx] == scfg.bkp_expected)
   {
@@ -160,8 +187,9 @@ void sb_process (uint key, uint payload)
     }
 */
 
-    // incorporate error index to the packet key and send,
-    while (!spin1_send_mc_packet ((bkpKey | inx), error, WITH_PAYLOAD));
+    // incorporate colour and error index to the packet key and send,
+    while (!spin1_send_mc_packet ((bkpKey | pkt_clr | inx),
+        error, WITH_PAYLOAD));
 
 #ifdef DEBUG
     pkt_sent++;
@@ -193,6 +221,8 @@ void sb_process (uint key, uint payload)
       if (sb_thrds_pend == SPINN_THRD_PROC)
       {
         // if done initialise semaphore:
+        sb_thrds_pend = SPINN_SB_THRDS;
+
         // if we are using Doug's Momentum, and we have reached the end of the
         // epoch (i.e. we are on the last example, and are about to move on to
         // the last tick, we need have to wait for the partial link delta sums
@@ -202,16 +232,7 @@ void sb_process (uint key, uint payload)
             && example_cnt == (xcfg.num_examples - 1)
             && tick == SPINN_SB_END_TICK + 1)
         {
-          // if this s core relates to the first group in the network, then we
-          // also need to wait for the link delta sum totals
-          if (scfg.is_first_group)
-          {
-            sb_thrds_pend = SPINN_SB_THRDS | SPINN_THRD_LDSA | SPINN_THRD_LDST;
-          }
-          else
-          {
-            sb_thrds_pend = SPINN_SB_THRDS | SPINN_THRD_LDSA;
-          }
+          sb_thrds_pend = SPINN_SB_THRDS | SPINN_THRD_LDSA;
         }
 
         // restore interrupts after flag access,
@@ -385,14 +406,6 @@ void s_advance_example (void)
 
     // reset example count for next epoch,
     example_cnt = 0;
-
-    // and reset the partial link delta sum
-    if (xcfg.training)
-    {
-      s_lds_part = 0;
-      s_ldsa_arrived = 0;
-      s_ldst_arrived = 0;
-    }
   }
 
   // start from first event for next example,
@@ -400,11 +413,14 @@ void s_advance_example (void)
   num_events = ex[example_inx].num_events;
 
   // and send sync packet to allow next example to start
-  while (!spin1_send_mc_packet (fdsKey, 0, NO_PAYLOAD));
+  if (scfg.is_tree_root)
+  {
+    while (!spin1_send_mc_packet (fdsKey, 0, NO_PAYLOAD));
 
 #ifdef DEBUG
-  pkt_sent++;
-  spk_sent++;
+    pkt_sent++;
+    spk_sent++;
 #endif
+  }
 }
 // ------------------------------------------------------------------------
diff --git a/c_code/process_t.c b/c_code/process_t.c
index a27c892..a98fdda 100644
--- a/c_code/process_t.c
+++ b/c_code/process_t.c
@@ -35,6 +35,11 @@ void tf_process (uint key, uint payload)
     wrng_phs++;
 #endif
 
+#ifdef PROFILE
+  // start profiler
+  tc[T2_LOAD] = SPINN_PROFILER_START;
+#endif
+
   // get net index: mask out block, phase and colour data,
   uint inx = (key & SPINN_NET_MASK);
 
@@ -58,9 +63,8 @@ void tf_process (uint key, uint payload)
   }
 
   // send newly computed output to w cores,
-  while (!spin1_send_mc_packet ((t_fwdKey[inx >> SPINN_BLOCK_SHIFT] | inx),
-                                 (uint) t_outputs[inx],
-                                 WITH_PAYLOAD
+  while (!spin1_send_mc_packet ((fwdKey | inx), (uint) t_outputs[inx],
+                                WITH_PAYLOAD
                                )
         );
 
@@ -76,6 +80,13 @@ void tf_process (uint key, uint payload)
   // mark net as arrived,
   tf_arrived++;
 
+#ifdef PROFILE
+  // update profiler values
+  uint cnt = SPINN_PROFILER_START - tc[T2_COUNT];
+  if (cnt < prf_fwd_min) prf_fwd_min = cnt;
+  if (cnt > prf_fwd_max) prf_fwd_max = cnt;
+#endif
+
   // and check if all nets arrived (i.e., all outputs done)
   if (tf_arrived == tcfg.num_units)
   {
@@ -103,65 +114,37 @@ void tf_process (uint key, uint payload)
     // access thread semaphore and flags with interrupts disabled,
     uint cpsr = spin1_int_disable ();
 
-    // and check if all other threads done
-    if (tcfg.output_grp)
-    {
-      // report processing thread done,
-      //NOTE: tick stop decision cannot have arrived!
-      tf_thrds_pend &= ~SPINN_THRD_PROC;
+    // report processing thread done,
+    //NOTE: tick stop decision cannot have arrived!
+    tf_thrds_pend &= ~SPINN_THRD_PROC;
 
-      // check if criterion value can be forwarded
-      if (tf_crit_rdy)
-      {
-        // initialise semaphore,
-        tf_crit_rdy = tf_init_crit;
+    // check if criterion value can be forwarded
+    if (tf_crit_rdy)
+    {
+      // initialise flag,
+      tf_crit_rdy = tf_crit_init;
 
-        // restore interrupts after flag access,
-        spin1_mode_restore (cpsr);
+      // restore interrupts after flag access,
+      spin1_mode_restore (cpsr);
 
-        // send (criterion/tick stop) packet,
-        tf_send_stop ();
+      // send (criterion/tick stop) packet,
+      tf_send_stop ();
 
-        // and advance tick if last group
-        //NOTE: last group does not get a stop decision
-        if (tcfg.is_last_output_group)
-        {
-          //TODO: check if need to schedule or can simply call
-          tf_advance_tick ();
-        }
-      }
-      else
+      // and advance tick if last group
+      //NOTE: last group does not get a stop decision
+      if (tcfg.is_last_output)
       {
-        // flag that local value is ready,
-        tf_crit_rdy = 1;
-
-        // and restore interrupts after flag access
-        spin1_mode_restore (cpsr);
+        //TODO: check if need to schedule or can simply call
+        tf_advance_tick ();
       }
     }
     else
     {
-      // check if all other threads done
-      if (tf_thrds_pend == SPINN_THRD_PROC)
-      {
-        // initialise semaphore,
-        tf_thrds_pend = SPINN_TF_THRDS;
-
-        // restore interrupts after flag access,
-        spin1_mode_restore (cpsr);
-
-        // and advance tick
-        //TODO: check if need to schedule or can simply call
-        tf_advance_tick ();
-      }
-      else
-      {
-        // if not done report processing thread done,
-        tf_thrds_pend &= ~SPINN_THRD_PROC;
+      // flag that local value is ready,
+      tf_crit_rdy = 1;
 
-        // and restore interrupts after flag access
-        spin1_mode_restore (cpsr);
-      }
+      // and restore interrupts after flag access
+      spin1_mode_restore (cpsr);
     }
   }
 }
@@ -185,6 +168,11 @@ void tb_process (uint unused0, uint unused1)
   //TODO: this needs checking!
   for (uint inx = 0; inx < tcfg.num_units; inx++)
   {
+#ifdef PROFILE
+    // start profiler
+    tc[T2_LOAD] = SPINN_PROFILER_START;
+#endif
+
     if (tcfg.output_grp)
     {
       // output groups:
@@ -220,6 +208,13 @@ void tb_process (uint unused0, uint unused1)
     pkt_sent++;
     sent_bkp++;
 #endif
+
+#ifdef PROFILE
+    // update profiler values
+    uint cnt = SPINN_PROFILER_START - tc[T2_COUNT];
+    if (cnt < prf_bkp_min) prf_bkp_min = cnt;
+    if (cnt > prf_bkp_max) prf_bkp_max = cnt;
+#endif
   }
 
   // access thread semaphore with interrupts disabled
@@ -272,7 +267,7 @@ void tf_advance_tick (void)
   if (tick_stop)
   {
     // update event criterion
-    if (tcfg.is_last_output_group)
+    if (tcfg.is_last_output)
     {
       tf_event_crit = tf_event_crit && tf_group_crit && (ev_tick >= min_ticks);
       max_evt = evt;
@@ -324,7 +319,7 @@ void tb_advance_tick (uint unused0, uint unused1)
     t_switch_to_fw ();
 
     // update example criterion,
-    if (tcfg.is_last_output_group)
+    if (tcfg.is_last_output)
     {
       tf_example_crit = tf_example_crit && tf_event_crit && (max_evt >= num_events - 1);
     }
@@ -409,7 +404,7 @@ void tf_advance_event (void)
       t_it_idx += tcfg.num_units;
 
       // and update number of ticks for new event
-      if (tcfg.is_last_output_group)
+      if (tcfg.is_last_output)
       {
         // maximum
         if (ev[event_idx + evt].max_time != SPINN_FP_NaN)
@@ -469,7 +464,7 @@ void t_advance_example (void)
     epoch++;
 
     // check if stage done,
-    if (tcfg.is_last_output_group)
+    if (tcfg.is_last_output)
     {
       // report network stop decision,
       nsd = (!xcfg.training || (epoch >= xcfg.num_epochs)) ? 1 : tf_example_crit;
@@ -561,7 +556,7 @@ void t_advance_example (void)
   t_init_outputs ();
 
   // and update next event data
-  if (tcfg.is_last_output_group)
+  if (tcfg.is_last_output)
   {
     // update number of ticks for new event,
     // maximum
diff --git a/c_code/process_w.c b/c_code/process_w.c
index 42fc8fa..3e46a7b 100644
--- a/c_code/process_w.c
+++ b/c_code/process_w.c
@@ -32,6 +32,11 @@ void wf_process (uint unused0, uint unused1)
   // compute all net block dot-products and send them for accumulation,
   for (uint j = 0; j < wcfg.num_cols; j++)
   {
+#ifdef PROFILE
+    // start profiler
+    tc[T2_LOAD] = SPINN_PROFILER_START;
+#endif
+
     long_net_t net_part_tmp = 0;
 
     for (uint i = 0; i < wcfg.num_rows; i++)
@@ -60,6 +65,13 @@ void wf_process (uint unused0, uint unused1)
     pkt_sent++;
     sent_fwd++;
 #endif
+
+#ifdef PROFILE
+    // update profiler values
+    uint cnt = SPINN_PROFILER_START - tc[T2_COUNT];
+    if (cnt < prf_fwd_min) prf_fwd_min = cnt;
+    if (cnt > prf_fwd_max) prf_fwd_max = cnt;
+#endif
   }
 
   // access thread semaphore with interrupts disabled
@@ -104,13 +116,11 @@ void wb_process (uint key, uint payload)
   recv_bkp++;
   if (phase == SPINN_FORWARD)
     wrng_bph++;
+#endif
 
-  uint blk = (key & SPINN_BLOCK_MASK) >> SPINN_BLOCK_SHIFT;
-  if (blk != wcfg.col_blk)
-  {
-    pkt_bwbk++;
-    return;
-  }
+#ifdef PROFILE
+  // start profiler
+  tc[T2_LOAD] = SPINN_PROFILER_START;
 #endif
 
   // get delta index: mask out phase and block data,
@@ -208,14 +218,21 @@ void wb_process (uint key, uint payload)
       lds_to_send = (lds_t) link_delta_sum;
 
     // and send partial link delta sum
-    while (!spin1_send_mc_packet (ldsaKey, (uint) lds_to_send, WITH_PAYLOAD));
+    while (!spin1_send_mc_packet (ldsKey, (uint) lds_to_send, WITH_PAYLOAD));
 
 #ifdef DEBUG
     pkt_sent++;
-    lda_sent++;
+    lds_sent++;
 #endif
   }
 
+#ifdef PROFILE
+  // update profiler values
+  uint cnt = SPINN_PROFILER_START - tc[T2_COUNT];
+  if (cnt < prf_bkp_min) prf_bkp_min = cnt;
+  if (cnt > prf_bkp_max) prf_bkp_max = cnt;
+#endif
+
   // if done with all deltas advance tick
   if (wb_arrived == wcfg.num_cols)
   {
@@ -242,7 +259,7 @@ void wb_process (uint key, uint payload)
           && example_cnt == (xcfg.num_examples - 1)
           && tick == SPINN_WB_END_TICK + 1)
       {
-        wb_thrds_pend = SPINN_WB_THRDS | SPINN_THRD_LDSR;
+        wb_thrds_pend = SPINN_WB_THRDS | SPINN_THRD_LDSA;
       }
 
       // restore interrupts after semaphore access,
diff --git a/c_code/sum.c b/c_code/sum.c
index b47a7c6..55a6602 100644
--- a/c_code/sum.c
+++ b/c_code/sum.c
@@ -26,8 +26,7 @@ uint coreID;               // 5-bit virtual core ID
 
 uint fwdKey;               // packet ID for FORWARD-phase data
 uint bkpKey;               // packet ID for BACKPROP-phase data
-uint ldstKey;              // packet ID for link delta summation totals
-uint ldsrKey;              // packet ID for link delta summation reports
+uint ldsKey;               // packet ID for link delta summation
 uint fdsKey;               // packet ID for FORWARD synchronisation
 
 uint32_t stage_step;       // current stage step
@@ -95,8 +94,7 @@ uint             sf_thrds_pend;     // thread semaphore
 scoreboard_t   * sb_arrived[2];     // keep count of expected error b-d-p
 scoreboard_t     sb_done;           // current tick error computation done
 uint             sb_thrds_pend;     // thread semaphore
-scoreboard_t     s_ldsa_arrived;    // keep count of the number of partial link delta sums
-scoreboard_t     s_ldst_arrived;    // keep count of the number of link delta sum totals
+scoreboard_t     s_lds_arrived;     // keep count of the number of partial link delta sums
 // ------------------------------------------------------------------------
 
 
@@ -114,10 +112,8 @@ uint spk_sent;  // sync packets sent
 uint stp_sent;  // stop packets sent
 uint stp_recv;  // stop packets received
 uint stn_recv;  // network_stop packets received
-uint lda_recv;  // partial link_delta packets received
-uint ldt_sent;  // total link_delta packets sent
-uint ldt_recv;  // total link_delta packets received
-uint ldr_sent;  // link_delta packets sent
+uint lds_sent;  // link_delta packets sent
+uint lds_recv;  // link_delta packets received
 uint wrng_phs;  // packets received in wrong phase
 uint wrng_pth;  // unexpected processing thread
 uint wrng_cth;  // unexpected comms thread
@@ -127,6 +123,18 @@ uint tot_tick;  // total number of ticks executed
 #endif
 
 
+#ifdef PROFILE
+// ------------------------------------------------------------------------
+// PROFILER variables
+// ------------------------------------------------------------------------
+uint prf_fwd_min;  // minimum FORWARD processing time
+uint prf_fwd_max;  // maximum FORWARD processing time
+uint prf_bkp_min;  // minimum BACKPROP processing time
+uint prf_bkp_max;  // maximum BACKPROP processing time
+// ------------------------------------------------------------------------
+#endif
+
+
 // ------------------------------------------------------------------------
 // timer callback: check that there has been progress in execution.
 // If no progress has been made terminate with SPINN_TIMEOUT_EXIT code.
diff --git a/c_code/threshold.c b/c_code/threshold.c
index c68ee61..b46e7e9 100644
--- a/c_code/threshold.c
+++ b/c_code/threshold.c
@@ -153,7 +153,8 @@ uchar            tf_active;         // processing FWD-phase packet queue?
 scoreboard_t     tf_arrived;        // keep count of expected nets
 uint             tf_thrds_pend;     // thread semaphore
 uchar            tf_crit_prev;      // criterion value received
-uchar            tf_init_crit;      // criterion init value
+scoreboard_t     tf_crit_arrived;   // keep count of expected crit pkts
+uchar            tf_crit_init;      // criterion init value
 uchar            tf_crit_rdy;       // criterion can be forwarded
 uchar            tf_stop_crit;      // stop criterion met?
 uchar            tf_group_crit;     // stop criterion met for all groups?
@@ -186,8 +187,6 @@ uchar            t_rec_results;     // record test results to SDRAM
 uchar            t_rec_tick_data;   // record tick data to SDRAM
 uchar            t_rec_step_updt;   // update recording step
 
-uint           * t_fwdKey;          // t cores have one fwdKey per partition
-
 // history arrays
 net_t          * t_net_history;
 activation_t   * t_output_history;
@@ -220,6 +219,18 @@ uint tot_tick;  // total number of ticks executed
 #endif
 
 
+#ifdef PROFILE
+// ------------------------------------------------------------------------
+// PROFILER variables
+// ------------------------------------------------------------------------
+uint prf_fwd_min;  // minimum FORWARD processing time
+uint prf_fwd_max;  // maximum FORWARD processing time
+uint prf_bkp_min;  // minimum BACKPROP processing time
+uint prf_bkp_max;  // maximum BACKPROP processing time
+// ------------------------------------------------------------------------
+#endif
+
+
 // ------------------------------------------------------------------------
 // timer callback: check that there has been progress in execution.
 // If no progress has been made terminate with SPINN_TIMEOUT_EXIT code.
diff --git a/c_code/weight.c b/c_code/weight.c
index fc16a2d..3aecf12 100644
--- a/c_code/weight.c
+++ b/c_code/weight.c
@@ -40,7 +40,7 @@ uint coreID;               // 5-bit virtual core ID
 
 uint fwdKey;               // packet ID for FORWARD-phase data
 uint bkpKey;               // packet ID for BACKPROP-phase data
-uint ldsaKey;              // packet ID for link delta summation
+uint ldsKey;               // packet ID for link delta summation
 
 uint32_t stage_step;       // current stage step
 uint32_t stage_num_steps;  // current stage number of steps
@@ -142,8 +142,8 @@ uint spk_recv;  // sync packets received
 uint stp_sent;  // stop packets sent
 uint stp_recv;  // stop packets received
 uint stn_recv;  // network_stop packets received
-uint lda_sent;  // partial link_delta packets sent
-uint ldr_recv;  // link_delta packets received
+uint lds_sent;  // link_delta packets sent
+uint lds_recv;  // link_delta packets received
 uint wrng_fph;  // FORWARD packets received in wrong phase
 uint wrng_bph;  // BACKPROP packets received in wrong phase
 uint wght_ups;  // number of weight updates done
@@ -155,6 +155,18 @@ uint tot_tick;  // total number of ticks executed
 #endif
 
 
+#ifdef PROFILE
+// ------------------------------------------------------------------------
+// PROFILER variables
+// ------------------------------------------------------------------------
+uint prf_fwd_min;  // minimum FORWARD processing time
+uint prf_fwd_max;  // maximum FORWARD processing time
+uint prf_bkp_min;  // minimum BACKPROP processing time
+uint prf_bkp_max;  // maximum BACKPROP processing time
+// ------------------------------------------------------------------------
+#endif
+
+
 // ------------------------------------------------------------------------
 // timer callback: check that there has been progress in execution.
 // If no progress has been made terminate with SPINN_TIMEOUT_EXIT code.
diff --git a/spinn_pdp2/input_vertex.py b/spinn_pdp2/input_vertex.py
index 315210a..a73c7b4 100644
--- a/spinn_pdp2/input_vertex.py
+++ b/spinn_pdp2/input_vertex.py
@@ -37,92 +37,94 @@ class InputVertex(
 
     def __init__(self,
                  network,
-                 group
+                 group,
+                 subgroup
                  ):
 
+        self._network  = network
+        self._group    = group
+        self._subgroup = subgroup
+
         super(InputVertex, self).__init__(
-            label = "i_core{}".format (group.id),
+            label = f"i_core{self.group.id}/{self.subgroup}",
             binary_name = "input.aplx",
             constraints = None)
 
         self._stage = 0
 
         # application-level data
-        self._network = network
-        self._group   = group
-        self._set_cfg = network._ex_set.set_config
-        self._ex_cfg  = network._ex_set.example_config
-        self._ev_cfg  = network._ex_set.event_config
+        self._set_cfg = self.network.ex_set.set_config
+        self._ex_cfg  = self.network.ex_set.example_config
+        self._ev_cfg  = self.network.ex_set.event_config
 
         # application parameters
-        self._in_integr_dt = 1.0 / network.ticks_per_int
-
-        # forward and backprop link partition names
-        self._fwd_link = "fwd_i{}".format (self.group.id)
-        self._bkp_link = []
-        for p in range (self._group.partitions):
-            self._bkp_link.append ("bkp_i{}_{}".format (self.group.id, p))
+        self._in_integr_dt = 1.0 / self.network.ticks_per_int
 
-        # reserve key space for every link
-        self._n_keys = MLPConstants.KEY_SPACE_SIZE
+        # forward and backprop link names
+        self._fwd_link = f"fwd_i{self.group.id}/{self.subgroup}"
+        self._bkp_link = f"bkp_i{self.group.id}/{self.subgroup}"
 
-        # configuration and data files
-        # find out the size of an integer!
-        _data_int = DataType.INT32
+        # input core-specific parameters
+        self._units = self.group.subunits[self.subgroup]
 
+        # configuration and data sizes
         # network configuration structure
-        self._N_NETWORK_CONFIGURATION_BYTES = \
-            len (self._network.network_config)
+        self._NETWORK_CONFIGURATION_BYTES = len (self.network.network_config)
 
         # core configuration structure
-        self._N_CORE_CONFIGURATION_BYTES = \
-            len (self.config)
+        self._CORE_CONFIGURATION_BYTES = len (self.config)
 
         # set configuration structure
-        self._N_EXAMPLE_SET_BYTES = \
-            len (self._set_cfg)
+        self._EXAMPLE_SET_BYTES = len (self._set_cfg)
 
         # list of example configurations
-        self._N_EXAMPLES_BYTES = \
-            len (self._ex_cfg) * len (self._ex_cfg[0])
+        self._EXAMPLES_BYTES = len (self._ex_cfg) * len (self._ex_cfg[0])
 
         # list of event configurations
-        self._N_EVENTS_BYTES = \
-            len (self._ev_cfg) * len (self._ev_cfg[0])
+        self._EVENTS_BYTES = len (self._ev_cfg) * len (self._ev_cfg[0])
 
-        # list of group inputs (empty if not an INPUT group)
-        self._N_INPUTS_BYTES = \
-            len (self._group.inputs) * _data_int.size
+        # list of subgroup inputs (empty if not an INPUT group)
+        if self.group.input_grp:
+            self._INPUTS_BYTES = ((len (self.group.inputs) // self.group.units) *
+                                  self._units * DataType.INT32.size)
+        else:
+            self._INPUTS_BYTES = 0
 
-        # keys are integers
-        # i cores require a different key for every group partition
-        self._N_KEYS_BYTES = _data_int.size * \
-            (MLPConstants.NUM_KEYS_REQ + self.group.partitions)
+        # list of routing keys
+        self._KEYS_BYTES = MLPConstants.NUM_KEYS_REQ * DataType.INT32.size
 
         # stage configuration structure
-        self._N_STAGE_CONFIGURATION_BYTES = len (self._network.stage_config)
+        self._STAGE_CONFIGURATION_BYTES = len (self.network.stage_config)
 
         # reserve SDRAM space used to store historic data
-        self._NET_HISTORY_BYTES = (MLPConstants.LONG_NET_SIZE // 8) * \
-            self.group.units * self._network.global_max_ticks
+        self._NET_HISTORY_BYTES = ((MLPConstants.LONG_NET_SIZE // 8) *
+            self._units * self.network.global_max_ticks)
 
 
         self._sdram_usage = (
-            self._N_NETWORK_CONFIGURATION_BYTES + \
-            self._N_CORE_CONFIGURATION_BYTES + \
-            self._N_EXAMPLE_SET_BYTES + \
-            self._N_EXAMPLES_BYTES + \
-            self._N_EVENTS_BYTES + \
-            self._N_INPUTS_BYTES + \
-            self._N_KEYS_BYTES + \
-            self._N_STAGE_CONFIGURATION_BYTES + \
+            self._NETWORK_CONFIGURATION_BYTES +
+            self._CORE_CONFIGURATION_BYTES +
+            self._EXAMPLE_SET_BYTES +
+            self._EXAMPLES_BYTES +
+            self._EVENTS_BYTES +
+            self._INPUTS_BYTES +
+            self._KEYS_BYTES +
+            self._STAGE_CONFIGURATION_BYTES +
             self._NET_HISTORY_BYTES
         )
 
+    @property
+    def network (self):
+        return self._network
+
     @property
     def group (self):
         return self._group
 
+    @property
+    def subgroup (self):
+        return self._subgroup
+
     @property
     def fwd_link (self):
         return self._fwd_link
@@ -141,7 +143,6 @@ def config (self):
               uchar         output_grp;
               uchar         input_grp;
               uint          num_units;
-              uint          partitions;
               uint          num_in_procs;
               uint          procs_list[SPINN_NUM_IN_PROCS];
               uchar         in_integr_en;
@@ -155,21 +156,21 @@ def config (self):
             explicit padding
         """
         # integration dt is an MLP fixed-point fpreal
-        in_integr_dt = int (self._in_integr_dt * (1 << MLPConstants.FPREAL_SHIFT))
+        in_integr_dt = int (self._in_integr_dt *
+                            (1 << MLPConstants.FPREAL_SHIFT))
 
         # soft_clamp_strength is an MLP fixed-point fpreal
-        soft_clamp_strength = int (self.group.soft_clamp_strength *\
-                           (1 << MLPConstants.FPREAL_SHIFT))
+        soft_clamp_strength = int (self.group.soft_clamp_strength *
+                                   (1 << MLPConstants.FPREAL_SHIFT))
 
         # init output is an MLP fixed-point activation_t
-        init_output = int (self.group.init_output *\
+        init_output = int (self.group.init_output *
                            (1 << MLPConstants.ACTIV_SHIFT))
 
-        return struct.pack ("<2B2x5IB3x4i",
+        return struct.pack ("<2B2x4IB3x4i",
                             self.group.output_grp,
                             self.group.input_grp,
-                            self.group.units,
-                            self.group.partitions,
+                            self._units,
                             self.group.num_in_procs,
                             self.group.in_procs_list[0].value,
                             self.group.in_procs_list[1].value,
@@ -191,7 +192,7 @@ def resources_required (self):
 
     @overrides (AbstractProvidesNKeysForPartition.get_n_keys_for_partition)
     def get_n_keys_for_partition (self, partition, graph_mapper):
-        return self._n_keys
+        return MLPConstants.KEY_SPACE_SIZE
 
 
     @overrides(MachineDataSpecableVertex.generate_machine_data_specification)
@@ -204,17 +205,17 @@ def generate_machine_data_specification(
 
         # Reserve and write the network configuration region
         spec.reserve_memory_region (MLPRegions.NETWORK.value,
-                                    self._N_NETWORK_CONFIGURATION_BYTES)
+                                    self._NETWORK_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.NETWORK.value)
 
         # write the network configuration into spec
-        for c in self._network.network_config:
+        for c in self.network.network_config:
             spec.write_value (c, data_type = DataType.UINT8)
 
         # Reserve and write the core configuration region
         spec.reserve_memory_region (MLPRegions.CORE.value,
-                                    self._N_CORE_CONFIGURATION_BYTES)
+                                    self._CORE_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.CORE.value)
 
@@ -224,7 +225,7 @@ def generate_machine_data_specification(
 
         # Reserve and write the example set region
         spec.reserve_memory_region (MLPRegions.EXAMPLE_SET.value,
-                                    self._N_EXAMPLE_SET_BYTES)
+                                    self._EXAMPLE_SET_BYTES)
 
         spec.switch_write_focus (MLPRegions.EXAMPLE_SET.value)
 
@@ -234,7 +235,7 @@ def generate_machine_data_specification(
 
         # Reserve and write the examples region
         spec.reserve_memory_region (MLPRegions.EXAMPLES.value,
-                                    self._N_EXAMPLES_BYTES)
+                                    self._EXAMPLES_BYTES)
 
         spec.switch_write_focus (MLPRegions.EXAMPLES.value)
 
@@ -245,7 +246,7 @@ def generate_machine_data_specification(
 
         # Reserve and write the events region
         spec.reserve_memory_region (MLPRegions.EVENTS.value,
-                                    self._N_EVENTS_BYTES)
+                                    self._EVENTS_BYTES)
 
         spec.switch_write_focus (MLPRegions.EVENTS.value)
 
@@ -255,25 +256,28 @@ def generate_machine_data_specification(
                 spec.write_value (c, data_type = DataType.UINT8)
 
         # Reserve and write the input data region (if INPUT group)
-        if self._N_INPUTS_BYTES != 0:
+        if self.group.input_grp:
             spec.reserve_memory_region (MLPRegions.INPUTS.value,
-                                        self._N_INPUTS_BYTES)
+                                        self._INPUTS_BYTES)
 
             spec.switch_write_focus (MLPRegions.INPUTS.value)
 
             # write inputs to spec
-            for _i in self._group.inputs:
-                # inputs are MLP fixed-point activation_t
-                #NOTE: check for absent or NaN
-                if (_i is None) or (_i != _i):
-                    _inp = MLPConstants.ACTIV_NaN
-                else:
-                    _inp = int (_i * (1 << MLPConstants.ACTIV_SHIFT))
-                spec.write_value (_inp, data_type = DataType.UINT32)
+            us = self.subgroup * MLPConstants.MAX_SUBGROUP_UNITS
+            for _ in range (len (self.group.inputs) // self.group.units):
+                for i in self.group.inputs[us : us + self._units]:
+                    # inputs are fixed-point activation_t
+                    #NOTE: check for absent or NaN
+                    if (i is None) or (i != i):
+                        inp = MLPConstants.ACTIV_NaN
+                    else:
+                        inp = int (i * (1 << MLPConstants.ACTIV_SHIFT))
+                    spec.write_value (inp, data_type = DataType.UINT32)
+                us += self.group.units
 
         # Reserve and write the routing region
         spec.reserve_memory_region (MLPRegions.ROUTING.value,
-                                    self._N_KEYS_BYTES)
+                                    self._KEYS_BYTES)
 
         spec.switch_write_focus (MLPRegions.ROUTING.value)
 
@@ -281,8 +285,9 @@ def generate_machine_data_specification(
         spec.write_value (routing_info.get_first_key_from_pre_vertex (
             self, self.fwd_link), data_type = DataType.UINT32)
 
-        # write link keys: bkp (padding - keys written below)
-        spec.write_value (0, data_type = DataType.UINT32)
+        # write link keys: bkp
+        spec.write_value (routing_info.get_first_key_from_pre_vertex (
+            self, self.bkp_link), data_type = DataType.UINT32)
 
         # write link keys: fds (padding)
         spec.write_value (0, data_type = DataType.UINT32)
@@ -293,19 +298,14 @@ def generate_machine_data_specification(
         # write link keys: lds (padding)
         spec.write_value (0, data_type = DataType.UINT32)
 
-        # write link keys: bkpi
-        for p in range (self.group.partitions):
-            spec.write_value (routing_info.get_first_key_from_pre_vertex (
-                self, self.bkp_link[p]), data_type = DataType.UINT32)
-
         # Reserve and write the stage configuration region
         spec.reserve_memory_region (MLPRegions.STAGE.value,
-                                    self._N_STAGE_CONFIGURATION_BYTES)
+                                    self._STAGE_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.STAGE.value)
 
         # write the stage configuration into spec
-        for c in self._network.stage_config:
+        for c in self.network.stage_config:
             spec.write_value (c, data_type = DataType.UINT8)
 
         spec.end_specification ()
@@ -315,12 +315,12 @@ def generate_machine_data_specification(
     def regenerate_data_specification(self, spec, placement):
         # Reserve and write the stage configuration region
         spec.reserve_memory_region (MLPRegions.STAGE.value,
-                                    self._N_STAGE_CONFIGURATION_BYTES)
+                                    self._STAGE_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.STAGE.value)
 
         # write the stage configuration into spec
-        for c in self._network.stage_config:
+        for c in self.network.stage_config:
             spec.write_value (c, data_type = DataType.UINT8)
 
         spec.end_specification()
diff --git a/spinn_pdp2/mlp_examples.py b/spinn_pdp2/mlp_examples.py
index 941744b..c670cbe 100644
--- a/spinn_pdp2/mlp_examples.py
+++ b/spinn_pdp2/mlp_examples.py
@@ -556,8 +556,11 @@ def read_Lens_examples_file (self,
         # clean up
         ef.close ()
 
-        # report examples read
-        print (f"{examples_file} contains {len (self.examples)} examples")
+        # report total number of examples read
+        s = '' if len (self.examples) == 1 else 's'
+        print (f"file {examples_file} contains "
+               f"{len (self.examples)} example{s}"
+               )
 
         # mark examples file as loaded
         self.examples_loaded = True
diff --git a/spinn_pdp2/mlp_group.py b/spinn_pdp2/mlp_group.py
index 67cc2cc..3996039 100644
--- a/spinn_pdp2/mlp_group.py
+++ b/spinn_pdp2/mlp_group.py
@@ -25,19 +25,20 @@ def __init__(self,
         self.write_blk    = write_blk
         self.is_first_out = is_first_out
         self.label        = label
-        self.VERBOSE      = VERBOSE
 
-        # number of partitions required for this group
-        self.partitions = (self.units + MLPConstants.MAX_BLK_UNITS - 1)\
-            // MLPConstants.MAX_BLK_UNITS
+        # number of subgroups required for this group
+        self.subgroups = (self.units + MLPConstants.MAX_SUBGROUP_UNITS - 1)\
+            // MLPConstants.MAX_SUBGROUP_UNITS
 
         if VERBOSE:
-            if self.partitions == 1:
-                print (f"creating group {self.label} with 1 partition")
-            else:
-                print (f"creating group {self.label} with "
-                       f"{self.partitions} partitions"
-                       )
+            s = '' if self.subgroups == 1 else 's'
+            print (f"creating group {self.label} with "
+                   f"{self.subgroups} subgroup{s}"
+                   )
+
+        # number of units per subgroup
+        self.subunits = [MLPConstants.MAX_SUBGROUP_UNITS] * (self.subgroups - 1)
+        self.subunits.append (self.units - sum (self.subunits))
 
         # keep track of associated incoming links
         self.links_from = []
@@ -53,9 +54,9 @@ def __init__(self,
 
         # keep track of associated vertices
         self.w_vertices = []
-        self.s_vertex   = None
-        self.i_vertex   = None
-        self.t_vertex   = None
+        self.s_vertex   = []
+        self.i_vertex   = []
+        self.t_vertex   = []
 
         # group function parameters
         self.output_grp = (MLPGroupTypes.OUTPUT in self.type)
diff --git a/spinn_pdp2/mlp_link.py b/spinn_pdp2/mlp_link.py
index 4715258..869b797 100644
--- a/spinn_pdp2/mlp_link.py
+++ b/spinn_pdp2/mlp_link.py
@@ -13,10 +13,8 @@ def __init__(self,
         self.pre_link_group  = pre_link_group
         self.post_link_group = post_link_group
         self.label           = label
-        self.VERBOSE         = VERBOSE
 
-        if VERBOSE:
-            print (f"creating link {self.label}")
+        if VERBOSE: print (f"creating link {self.label}")
 
         # update list of incoming links in the post_link_group
         self.post_link_group.links_from.append (self.pre_link_group)
diff --git a/spinn_pdp2/mlp_network.py b/spinn_pdp2/mlp_network.py
index 3262a13..7dc74fc 100644
--- a/spinn_pdp2/mlp_network.py
+++ b/spinn_pdp2/mlp_network.py
@@ -6,7 +6,7 @@
 from pacman.model.graphs.machine import MachineEdge
 
 from spinn_pdp2.input_vertex     import InputVertex
-from spinn_pdp2.sum_vertex       import SumVertex
+from spinn_pdp2.sum_vertex       import SumVertexTree
 from spinn_pdp2.threshold_vertex import ThresholdVertex
 from spinn_pdp2.weight_vertex    import WeightVertex
 from spinn_pdp2.mlp_types        import MLPGroupTypes, MLPConstants, \
@@ -88,11 +88,8 @@ def __init__(self,
         # initialise machine graph parameters
         self._graph_rdy = False
 
-        # keep track of the number of vertices in the graph
-        self._num_vertices = 0
-
-        # keep track of the number of partitions
-        self.partitions = 0
+        # keep track of the number of subgroups
+        self.subgroups = 0
 
         # keep track of the current execution stage
         self._stage_id = 0
@@ -109,6 +106,10 @@ def net_type (self):
     def training (self):
         return self._training
 
+    @property
+    def ex_set (self):
+        return self._ex_set
+
     @property
     def num_epochs (self):
         return self._num_epochs
@@ -125,6 +126,30 @@ def ticks_per_int (self):
     def global_max_ticks (self):
         return self._global_max_ticks
 
+    @property
+    def train_group_crit (self):
+        return self._train_group_crit
+
+    @property
+    def test_group_crit (self):
+        return self._test_group_crit
+
+    @property
+    def learning_rate (self):
+        return self._learning_rate
+
+    @property
+    def weight_decay (self):
+        return self._weight_decay
+
+    @property
+    def momentum (self):
+        return self._momentum
+
+    @property
+    def update_function (self):
+        return self._update_function
+
     @property
     def rec_test_results (self):
         return self._rec_test_results
@@ -141,10 +166,6 @@ def rec_outputs (self):
     def rec_example_last_tick_only (self):
         return self._rec_example_last_tick_only
 
-    @property
-    def num_write_blocks (self):
-        return self._num_write_blks
-
     @property
     def output_chain (self):
         return self._output_chain
@@ -163,17 +184,15 @@ def network_config (self):
               uchar net_type;
               uint  ticks_per_int;
               uint  global_max_ticks;
-              uint  num_write_blks;
             } network_conf_t;
 
             pack: standard sizes, little-endian byte order,
             explicit padding
         """
-        return struct.pack("<B3x3I",
+        return struct.pack("<B3x2I",
                            self._net_type,
                            self._ticks_per_interval,
                            self._global_max_ticks,
-                           self._num_write_blks
                            )
 
 
@@ -323,10 +342,10 @@ def link (self,
         """
         # machine graph needs rebuilding
         self._graph_rdy = False
-        
+
         # check that enough data is provided
         if (pre_link_group is None) or (post_link_group is None):
-            print ("error: pre- and post-link groups required")
+            print ("error: pre-link and post-link groups required")
             return None
 
         if label is None:
@@ -608,21 +627,13 @@ def write_Lens_output_file (self,
 
         if not self._aborted:
             with open(output_file, 'w') as f:
-                # prepare to retrieve recorded data
-                TICK_DATA_FORMAT = "<4I"
-                TICK_DATA_SIZE = struct.calcsize(TICK_DATA_FORMAT)
-
-                OUT_DATA_FORMATS = []
-                OUT_DATA_SIZES = []       
-                for g in self.output_chain:
-                    OUT_DATA_FORMATS.append ("<{}H".format (g.units))
-                    OUT_DATA_SIZES.append (struct.calcsize("<{}H".format (g.units)))
 
-                # retrieve recorded tick_data from first output group
+                # retrieve recorded tick_data from first subgroup of first output group
                 g = self.out_grps[0]
+                ftv = g.t_vertex[0]
                 try:
-                    rec_tick_data = g.t_vertex.read (
-                        gfe.placements().get_placement_of_vertex (g.t_vertex),
+                    rec_tick_data = ftv.read (
+                        gfe.placements().get_placement_of_vertex (ftv),
                         gfe.buffer_manager(), MLPExtraRecordings.TICK_DATA.value
                         )
                 except Exception as err:
@@ -631,21 +642,24 @@ def write_Lens_output_file (self,
                     print ("--------------------------------------------------\n")
                     return
 
-                TOTAL_TICKS = len (rec_tick_data) // TICK_DATA_SIZE
-
                 # retrieve recorded outputs from every output group
                 rec_outputs = [None] * len (self.out_grps)
                 for g in self.out_grps:
-                    try:
-                        rec_outputs[g.write_blk] = g.t_vertex.read (
-                            gfe.placements().get_placement_of_vertex (g.t_vertex),
-                            gfe.buffer_manager(), MLPVarSizeRecordings.OUTPUTS.value
-                            )
-                    except Exception as err:
-                        print ("\n--------------------------------------------------")
-                        print (f"error: write output file aborted - {err}")
-                        print ("--------------------------------------------------\n")
-                        return
+                    rec_outputs[g.write_blk] = []
+                    # append all subgroups together
+                    for s in range (g.subgroups):
+                        gtv = g.t_vertex[s]
+                        try:
+                            rec_outputs[g.write_blk].append (gtv.read (
+                                gfe.placements().get_placement_of_vertex (gtv),
+                                gfe.buffer_manager(),
+                                MLPVarSizeRecordings.OUTPUTS.value)
+                                )
+                        except Exception as err:
+                            print ("\n--------------------------------------------------")
+                            print (f"error: write output file aborted - {err}")
+                            print ("--------------------------------------------------\n")
+                            return
 
                 # compute total ticks in first example
                 #TODO: need to get actual value from simulation, not max value
@@ -665,6 +679,12 @@ def write_Lens_output_file (self,
                     if ticks_per_example > self.global_max_ticks:
                         ticks_per_example = self.global_max_ticks
 
+                # prepare to unpack the recorded tick data
+                TICK_DATA_FORMAT = "<4I"
+                TICK_DATA_SIZE = struct.calcsize(TICK_DATA_FORMAT)
+
+                TOTAL_TICKS = len (rec_tick_data) // TICK_DATA_SIZE
+
                 # print recorded data in correct order
                 current_epoch = -1
                 for tk in range (TOTAL_TICKS):
@@ -709,27 +729,28 @@ def write_Lens_output_file (self,
                     f.write (f"{tick} {event}\n")
 
                     for g in self.output_chain:
-                        # get group tick outputs
-                        outputs = struct.unpack_from(
-                            OUT_DATA_FORMATS[self.output_chain.index(g)],
-                            rec_outputs[g.write_blk],
-                            tk * OUT_DATA_SIZES[self.output_chain.index(g)]
-                            )
+                        outputs = []
+                        # get tick outputs for each subgroup
+                        for sg, rec_outs in enumerate (rec_outputs[g.write_blk]):
+                            outputs += struct.unpack_from (
+                                f"<{g.subunits[sg]}H",
+                                rec_outs,
+                                tk * struct.calcsize(f"<{g.subunits[sg]}H")
+                                )
 
                         # print outputs
-                        if len (rec_outputs[g.write_blk]):
-                            f.write (f"{g.units} 1\n")
-                            tinx = tgt_inx * g.units
-                            for u in range (g.units):
-                                # outputs are s16.15 fixed-point numbers
-                                out = (1.0 * outputs[u]) / (1.0 * (1 << 15))
-                                t = g.targets[tinx + u]
-                                #NOTE: check for absent or NaN
-                                if (t is None) or (t != t):
-                                    tgt = "-"
-                                else:
-                                    tgt = int(t)
-                                f.write ("{:8.6f} {}\n".format (out, tgt))
+                        f.write (f"{g.units} 1\n")
+                        tinx = tgt_inx * g.units
+                        for u in range (g.units):
+                            # outputs are s16.15 fixed-point numbers
+                            out = (1.0 * outputs[u]) / (1.0 * (1 << 15))
+                            t = g.targets[tinx + u]
+                            #NOTE: check for absent or NaN
+                            if (t is None) or (t != t):
+                                tgt = "-"
+                            else:
+                                tgt = int (t)
+                            f.write ("{:8.6f} {}\n".format (out, tgt))
 
         # recorded data no longer available
         self._rec_data_rdy = False
@@ -760,11 +781,12 @@ def show_test_results (self):
             TEST_RESULTS_FORMAT = "<4I"
             TEST_RESULTS_SIZE = struct.calcsize(TEST_RESULTS_FORMAT)
 
-            # retrieve recorded tick_data from last output group
+            # retrieve recorded test results from last output subgroup
             g = self.out_grps[-1]
+            ltv = g.t_vertex[g.subgroups - 1]
             try:
-                rec_test_results = g.t_vertex.read (
-                    gfe.placements().get_placement_of_vertex (g.t_vertex),
+                rec_test_results = ltv.read (
+                    gfe.placements().get_placement_of_vertex (ltv),
                     gfe.buffer_manager(), MLPConstSizeRecordings.TEST_RESULTS.value
                     )
             except Exception as err:
@@ -800,150 +822,242 @@ def generate_machine_graph (self):
         # path to binary files
         binaries_path = os.path.join(os.path.dirname(__file__), "..", "binaries")
 
-        # setup the machine graph
-        gfe.setup (model_binary_folder = binaries_path)
+        # estimate number of SpiNNaker boards required
+        # number of subgroups
+        for grp in self.groups:
+            self.subgroups += grp.subgroups
 
-        # set the number of write blocks before generating vertices
-        self._num_write_blks = len (self.output_chain)
+        # number of required cores
+        w_cores = self.subgroups * self.subgroups
+        s_cores = self.subgroups * (((self.subgroups - 2) //
+                                    (MLPConstants.MAX_S_CORE_LINKS - 1)) + 1)
+        i_cores = self.subgroups
+        t_cores = self.subgroups
+        cores = w_cores + s_cores + i_cores + t_cores
 
-        # compute number of partitions
-        for grp in self.groups:
-            self.partitions = self.partitions + grp.partitions
+        s = '' if cores == 1 else 's'
+        print (f"need {cores} SpiNNaker core{s}")
+
+        # number of required chips
+        chips = ((cores - 1) // MLPConstants.DEF_SPINN_CORES_PER_CHIP) + 1
+
+        s = '' if chips == 1 else 's'
+        print (f"estimating {chips} SpiNNaker chip{s}")
+
+        # number of required boards
+        boards = ((chips - 1) // MLPConstants.DEF_SPINN_CHIPS_PER_BOARD) + 1
+
+        s = '' if boards == 1 else 's'
+        print (f"requesting {boards} SpiNNaker board{s}")
 
-        # create associated weight, sum, input and threshold
-        # machine vertices for every network group
+        # request a SpiNNaker machine and setup the machine graph
+        try:
+            gfe.setup (model_binary_folder = binaries_path,
+                       n_boards_required = boards
+                       )
+        except Exception as err:
+            print ("\n--------------------------------------------------")
+            print (f"error: {err}")
+            print ("--------------------------------------------------\n")
+            return False
+
+        # create weight, sum, input and threshold
+        # machine vertices associated with every subgroup
         for grp in self.groups:
-            # create one weight core per partition
-            # of every (from_group, group) pair
-            # NOTE: all-zero cores can be optimised out
-            for from_grp in self.groups:
-                for _tp in range (grp.partitions):
-                    for _fp in range (from_grp.partitions):
-                        wv = WeightVertex (self, grp, from_grp, _tp, _fp)
-                        grp.w_vertices.append (wv)
+            for sgrp in range (grp.subgroups):
+                # create one weight core for every
+                # (from_group/from_subgroup, group/subgroup) pair
+                #TODO: all-zero cores can be optimised out
+                wvs = []
+                for from_grp in self.groups:
+                    for from_sgrp in range (from_grp.subgroups):
+                        wv = WeightVertex (self, grp, sgrp,
+                                           from_grp, from_sgrp)
                         gfe.add_machine_vertex_instance (wv)
-                        self._num_vertices += 1
-
-            # create one sum core per group
-            sv = SumVertex (self, grp)
-            grp.s_vertex = sv
-            gfe.add_machine_vertex_instance (sv)
-            self._num_vertices += 1
-
-            # create one input core per group
-            iv = InputVertex (self, grp)
-            grp.i_vertex = iv
-            gfe.add_machine_vertex_instance (iv)
-            self._num_vertices += 1
-
-            # create one threshold core per group
-            tv = ThresholdVertex (self, grp)
-            grp.t_vertex = tv
-            gfe.add_machine_vertex_instance (tv)
-            self._num_vertices += 1
+                        wvs.append (wv)
+                grp.w_vertices.append (wvs)
+
+                # create a sum core tree per subgroup
+                #NOTE: sum vertices are added during tree building
+                svt = SumVertexTree (self, grp, sgrp)
+                grp.s_vertex.append (svt)
+
+                # create one input core per subgroup
+                iv = InputVertex (self, grp, sgrp)
+                grp.i_vertex.append (iv)
+                gfe.add_machine_vertex_instance (iv)
+
+                # create one threshold core per subgroup
+                tv = ThresholdVertex (self, grp, sgrp)
+                grp.t_vertex.append (tv)
+                gfe.add_machine_vertex_instance (tv)
+
+        # groups and subgroups with special functions
+        first_lds_grp = self.groups[0]
+        first_subgroup_svt = first_lds_grp.s_vertex[0]
+
+        last_out_grp = self.output_chain[-1]
+        last_out_subgroup_t_vertex = (
+            last_out_grp.t_vertex[last_out_grp.subgroups - 1]
+            )
 
         # create associated forward, backprop, link delta summation,
-        # synchronisation and stop machine edges for every network group
-        first = self.groups[0]
+        # criterion, stop and sync machine edges for every subgroup
         for grp in self.groups:
-            for w in grp.w_vertices:
-                _frmg = w.from_group
-
-                # create forward w to s links
-                gfe.add_machine_edge_instance (MachineEdge (w, grp.s_vertex),
-                                             w.fwd_link)
-
-                # create forward t to w (multicast) links
-                gfe.add_machine_edge_instance (MachineEdge (_frmg.t_vertex, w),
-                                             _frmg.t_vertex.fwd_link[w.row_blk])
-
-                # create backprop w to s links
-                gfe.add_machine_edge_instance (MachineEdge (w, _frmg.s_vertex),
-                                             w.bkp_link)
-
-                # create backprop i to w (multicast) links
-                gfe.add_machine_edge_instance (MachineEdge (grp.i_vertex, w),
-                                             grp.i_vertex.bkp_link[w.col_blk])
-
-                # create link delta summation w to s links
-                gfe.add_machine_edge_instance (MachineEdge (w, grp.s_vertex),
-                                             w.lds_link)
-
-                # create link delta summation result s (first) to w links
-                gfe.add_machine_edge_instance (MachineEdge (first.s_vertex, w),
-                                             first.s_vertex.lds_link)
-
-                # create example synchronisation s to w (multicast) links
-                gfe.add_machine_edge_instance (MachineEdge (grp.s_vertex, w),
-                                               grp.s_vertex.fds_link)
-
-                if grp != _frmg:
-                    gfe.add_machine_edge_instance (MachineEdge (_frmg.s_vertex, w),
-                                                 _frmg.s_vertex.fds_link)
-
-            # create forward s to i link
-            gfe.add_machine_edge_instance (MachineEdge (grp.s_vertex,
-                                                      grp.i_vertex),
-                                         grp.s_vertex.fwd_link)
-
-            # create backprop s to t link
-            gfe.add_machine_edge_instance (MachineEdge (grp.s_vertex,
-                                                      grp.t_vertex),
-                                         grp.s_vertex.bkp_link)
-
-            # create forward i to t link
-            gfe.add_machine_edge_instance (MachineEdge (grp.i_vertex,
-                                                      grp.t_vertex),
-                                         grp.i_vertex.fwd_link)
-
-            # create backprop t to i link
-            gfe.add_machine_edge_instance (MachineEdge (grp.t_vertex,
-                                                      grp.i_vertex),
-                                         grp.t_vertex.bkp_link)
-
-            # create link delta summation s to s links - all s cores
-            # (except the first) send to the first s core
-            if grp != first:
-                gfe.add_machine_edge_instance (MachineEdge (grp.s_vertex,
-                                                          first.s_vertex),
-                                             grp.s_vertex.lds_link)
-
-            # create stop links, if OUTPUT group
-            if grp in self.output_chain:
-                # if last OUTPUT group broadcast stop decision
-                if grp == self.output_chain[-1]:
-                    for stpg in self.groups:
-                        # create stop links to all w cores
-                        for w in stpg.w_vertices:
-                            gfe.add_machine_edge_instance\
-                              (MachineEdge (grp.t_vertex, w),
-                               grp.t_vertex.stp_link)
-
-                        # create stop links to all s cores
-                        gfe.add_machine_edge_instance\
-                         (MachineEdge (grp.t_vertex, stpg.s_vertex),\
-                          grp.t_vertex.stp_link)
-
-                        # create stop links to all i cores
-                        gfe.add_machine_edge_instance\
-                         (MachineEdge (grp.t_vertex, stpg.i_vertex),\
-                          grp.t_vertex.stp_link)
-
-                        # create stop links to t cores (no link to itself!)
-                        if stpg != grp:
-                            gfe.add_machine_edge_instance\
-                             (MachineEdge (grp.t_vertex, stpg.t_vertex),\
-                              grp.t_vertex.stp_link)
-                else:
-                    # create stop link to next OUTPUT group in chain
-                    _inx  = self.output_chain.index (grp)
-                    _stpg = self.output_chain[_inx + 1]
-                    gfe.add_machine_edge_instance (MachineEdge (grp.t_vertex,
-                                                              _stpg.t_vertex),
-                                                 grp.t_vertex.stp_link)
+            for sgrp in range (grp.subgroups):
+                svt = grp.s_vertex[sgrp]
+                iv  = grp.i_vertex[sgrp]
+                tv  = grp.t_vertex[sgrp]
+
+                for wv in grp.w_vertices[sgrp]:
+                    from_grp  = wv.from_group
+                    from_sgrp = wv.from_subgroup
+
+                    from_svt = from_grp.s_vertex[from_sgrp]
+                    from_tv  = from_grp.t_vertex[from_sgrp]
+
+                    # sum tree leaf to connect to depends on group/subgroup
+                    svt_leaf      = svt.leaf (from_grp, from_sgrp)
+                    from_svt_leaf = from_svt.leaf (grp, sgrp)
+
+                    # forward w to s link
+                    gfe.add_machine_edge_instance (
+                        MachineEdge (wv, svt_leaf),
+                        wv.fwd_link
+                        )
+
+                    # forward t to w (multicast) link
+                    gfe.add_machine_edge_instance (
+                        MachineEdge (from_tv, wv),
+                        from_tv.fwd_link
+                        )
+
+                    # backprop w to s link
+                    gfe.add_machine_edge_instance (
+                        MachineEdge (wv, from_svt_leaf),
+                        wv.bkp_link
+                        )
+
+                    # backprop i to w (multicast) link
+                    gfe.add_machine_edge_instance (
+                        MachineEdge (iv, wv),
+                        iv.bkp_link
+                        )
+
+                    # link delta summation w to s link
+                    gfe.add_machine_edge_instance (
+                        MachineEdge (wv, svt_leaf),
+                        wv.lds_link
+                        )
+
+                    # link delta result (first group) s to w (multicast) link
+                    gfe.add_machine_edge_instance (
+                        MachineEdge (first_subgroup_svt.root, wv),
+                        first_subgroup_svt.root.lds_link
+                        )
+
+                    # stop (last output group/subgroup) t to w (multicast) link
+                    gfe.add_machine_edge_instance (
+                        MachineEdge (last_out_subgroup_t_vertex, wv),
+                        last_out_subgroup_t_vertex.stp_link
+                        )
+
+                    # intra-subgroup sync s to w (multicast) link
+                    gfe.add_machine_edge_instance (
+                        MachineEdge (svt.root, wv),
+                        svt.root.fds_link
+                        )
+
+                    # inter-subgroup sync s to w (multicast) link
+                    #NOTE: avoid duplicates
+                    if grp != from_grp or sgrp != from_sgrp:
+                        gfe.add_machine_edge_instance (
+                            MachineEdge (from_svt.root, wv),
+                            from_svt.root.fds_link
+                            )
+
+                # forward s to i link
+                gfe.add_machine_edge_instance (
+                    MachineEdge (svt.root, iv),
+                    svt.root.fwd_link
+                    )
+
+                # forward i to t link
+                gfe.add_machine_edge_instance (
+                    MachineEdge (iv, tv),
+                    iv.fwd_link
+                    )
+
+                # backprop t to i link
+                gfe.add_machine_edge_instance (
+                    MachineEdge (tv, iv),
+                    tv.bkp_link
+                    )
+
+                # backprop s to t link
+                gfe.add_machine_edge_instance (
+                    MachineEdge (svt.root, tv),
+                    svt.root.bkp_link
+                    )
+
+                # link delta summation s to s link
+                if sgrp != 0:
+                    # first subgroup collects from all other subgroups
+                    gfe.add_machine_edge_instance (
+                        MachineEdge (
+                            svt.root,
+                            grp.s_vertex[0].root
+                            ),
+                        svt.root.lds_link
+                        )
+                elif grp != first_lds_grp:
+                    # first group collects from all other groups
+                    gfe.add_machine_edge_instance (
+                        MachineEdge (
+                            svt.root,
+                            first_subgroup_svt.root
+                            ),
+                        svt.root.lds_link
+                        )
+
+                # t to t criterion link
+                # intra-group criterion link to last subgroup t
+                if sgrp < (grp.subgroups - 1):
+                    gfe.add_machine_edge_instance (
+                        MachineEdge (tv, grp.t_vertex[grp.subgroups - 1]),
+                        tv.stp_link
+                        )
+                elif grp != last_out_grp:
+                    # inter-group criterion link to last output subgroup
+                    gfe.add_machine_edge_instance (
+                        MachineEdge (tv, last_out_subgroup_t_vertex),
+                        tv.stp_link
+                        )
+
+                # stop (last output group/subgroup) t to s (multicast) link
+                for s in svt.vertices:
+                    gfe.add_machine_edge_instance (
+                        MachineEdge (last_out_subgroup_t_vertex, s),
+                        last_out_subgroup_t_vertex.stp_link
+                        )
+
+                # stop (last output group/subgroup) t to i (multicast) link
+                gfe.add_machine_edge_instance (
+                    MachineEdge (last_out_subgroup_t_vertex, iv),
+                    last_out_subgroup_t_vertex.stp_link
+                    )
+
+                # stop (last output group/subgroup) t to t (multicast) link
+                if tv != last_out_subgroup_t_vertex:
+                    gfe.add_machine_edge_instance (
+                        MachineEdge (last_out_subgroup_t_vertex, tv),
+                        last_out_subgroup_t_vertex.stp_link
+                        )
 
         self._graph_rdy = True
 
+        return True
+
 
     def train (self,
                update_function = None,
@@ -966,6 +1080,16 @@ def train (self,
         self._stg_reset = True
 
         self._training = 1
+
+        if self._stg_epochs == None:
+            updates = "default"
+        else:
+            updates = self._stg_epochs
+
+        print ("\n--------------------------------------------------")
+        print (f"stage {self._stage_id} train (updates: {updates})")
+        print ("--------------------------------------------------\n")
+
         self.stage_run ()
 
 
@@ -989,6 +1113,16 @@ def test (self,
         self._stg_reset = reset_examples
 
         self._training = 0
+
+        if self._stg_examples == None:
+            examples = "default"
+        else:
+            examples = self._stg_examples
+
+        print ("\n--------------------------------------------------")
+        print (f"stage {self._stage_id} test (examples: {examples})")
+        print ("--------------------------------------------------\n")
+
         self.stage_run ()
 
 
@@ -997,14 +1131,6 @@ def stage_run (self):
         """
         self._aborted = False
 
-        # check that no group is too big
-        for grp in self.groups:
-            if grp.units > MLPConstants.MAX_GRP_UNITS:
-                print (f"run aborted: group {grp.label} has more than "
-                       f"{MLPConstants.MAX_GRP_UNITS} units.")
-                self._aborted = True
-                return
-
         # cannot run unless weights file exists
         if self._weights_file is None:
             print ("run aborted: weights file not given")
@@ -1040,7 +1166,10 @@ def stage_run (self):
 
         # generate machine graph - if needed
         if not self._graph_rdy:
-            self.generate_machine_graph ()
+            if not self.generate_machine_graph ():
+                print ("run aborted: error generating machine graph")
+                self._aborted = True
+                return
 
         # initialise recorded data flag
         self._rec_data_rdy = False
@@ -1067,7 +1196,9 @@ def pause (self):
         """ pause execution to allow debugging
         """
         # pause until a key is pressed
+        print ("\n--------------------------------------------------")
         input ("network paused: press enter to continue")
+        print ("--------------------------------------------------\n")
 
 
     def end (self):
diff --git a/spinn_pdp2/mlp_types.py b/spinn_pdp2/mlp_types.py
index 583c7bf..53d0a92 100644
--- a/spinn_pdp2/mlp_types.py
+++ b/spinn_pdp2/mlp_types.py
@@ -12,6 +12,12 @@ class MLPUpdateFuncs (Enum):
 class MLPConstants ():
     """ MLP network constants
     """
+    # SpiNNaker machine DEFAULT values
+    #NOTE: leave room for monitor, 2 system-level and 1 blacklisted cores
+    DEF_SPINN_CORES_PER_CHIP = 14
+    #NOTE: leave room for 1 blacklisted chip
+    DEF_SPINN_CHIPS_PER_BOARD = 47
+
     # network parameter CONSTANTS or DEFAULT values
     DEF_LEARNING_RATE = 0.1
     DEF_WEIGHT_DECAY = 0
@@ -27,8 +33,8 @@ class MLPConstants ():
     MAX_IN_PROCS  = 2
     DEF_IN_PROCS  = 0
 
-    MAX_GRP_UNITS = 128
-    MAX_BLK_UNITS = 32
+    MAX_SUBGROUP_UNITS = 32
+    MAX_S_CORE_LINKS = 8
 
     MAX_OUT_PROCS = 5
     DEF_OUT_PROCS = 2
diff --git a/spinn_pdp2/sum_vertex.py b/spinn_pdp2/sum_vertex.py
index cd6265b..044255d 100644
--- a/spinn_pdp2/sum_vertex.py
+++ b/spinn_pdp2/sum_vertex.py
@@ -1,7 +1,10 @@
 import struct
 
+import spinnaker_graph_front_end as gfe
+
 from data_specification.enums.data_type import DataType
 
+from pacman.model.graphs.machine import MachineEdge
 from pacman.model.graphs.machine.machine_vertex import MachineVertex
 from pacman.model.resources.resource_container \
     import ResourceContainer, ConstantSDRAM
@@ -37,87 +40,81 @@ class SumVertex(
 
     def __init__(self,
                  network,
-                 group
+                 group,
+                 subgroup,
+                 index = 0
                  ):
 
+        self._network  = network
+        self._group    = group
+        self._subgroup = subgroup
+        self._index    = index
+
         super(SumVertex, self).__init__(
-            label = "s_core{}".format (group.id),
+            label = f"s_core{self.group.id}/{self.subgroup}/{self.index}",
             binary_name = "sum.aplx",
             constraints = None)
 
         self._stage = 0
 
         # application-level data
-        self._network = network
-        self._group   = group
-        self._set_cfg = network._ex_set.set_config
-        self._ex_cfg  = network._ex_set.example_config
-
-        # check if first group in the network
-        if self.group.id == network.groups[0].id:
-            self._is_first_group = 1
-        else:
-            self._is_first_group = 0
+        self._set_cfg = self.network.ex_set.set_config
+        self._ex_cfg  = self.network.ex_set.example_config
 
-        # forward, backprop, and link delta summation link partition names
-        self._fwd_link = "fwd_s{}".format (self.group.id)
-        self._bkp_link = "bkp_s{}".format (self.group.id)
-        self._lds_link = "lds_s{}".format (self.group.id)
-        self._fds_link = "fds_s{}".format (self.group.id)
+        # forward, backprop, link delta summation and sync link names
+        self._fwd_link = f"fwd_s{self.group.id}/{self.subgroup}"
+        self._bkp_link = f"bkp_s{self.group.id}/{self.subgroup}"
+        self._lds_link = f"lds_s{self.group.id}/{self.subgroup}"
+        self._fds_link = f"fds_s{self.group.id}/{self.subgroup}"
 
         # sum core-specific parameters
         # NOTE: if all-zero w cores are optimised out these need reviewing
-        self._fwd_expect  = network.partitions
-        self._bkp_expect  = network.partitions
-        self._ldsa_expect = network.partitions * self.group.units
-        self._ldst_expect = len (network.groups) - 1
-
-        # weight update function
-        self.update_function = network._update_function
-
-        # reserve key space for every link
-        self._n_keys = MLPConstants.KEY_SPACE_SIZE
-
-        # configuration and data files
-        # find out the size of an integer!
-        _data_int = DataType.INT32
+        self._units = self.group.subunits[self.subgroup]
 
+        # configuration and data sizes
         # network configuration structure
-        self._N_NETWORK_CONFIGURATION_BYTES = \
-            len (self._network.network_config)
+        self._NETWORK_CONFIGURATION_BYTES = len (self.network.network_config)
 
         # core configuration structure
-        self._N_CORE_CONFIGURATION_BYTES = \
-            len (self.config)
+        self._CORE_CONFIGURATION_BYTES = len (self.config)
 
         # set configuration structure
-        self._N_EXAMPLE_SET_BYTES = \
-            len (self._set_cfg)
+        self._EXAMPLE_SET_BYTES = len (self._set_cfg)
 
         # list of example configurations
-        self._N_EXAMPLES_BYTES = \
-            len (self._ex_cfg) * len (self._ex_cfg[0])
+        self._EXAMPLES_BYTES = len (self._ex_cfg) * len (self._ex_cfg[0])
 
-        # keys are integers
-        self._N_KEYS_BYTES = MLPConstants.NUM_KEYS_REQ * _data_int.size
+        # list of routing keys
+        self._KEYS_BYTES = MLPConstants.NUM_KEYS_REQ * (DataType.INT32).size
 
         # stage configuration structure
-        self._N_STAGE_CONFIGURATION_BYTES = \
-            len (self._network.stage_config)
+        self._STAGE_CONFIGURATION_BYTES = len (self.network.stage_config)
 
         self._sdram_usage = (
-            self._N_NETWORK_CONFIGURATION_BYTES + \
-            self._N_CORE_CONFIGURATION_BYTES + \
-            self._N_EXAMPLE_SET_BYTES + \
-            self._N_EXAMPLES_BYTES + \
-            self._N_KEYS_BYTES + \
-            self._N_STAGE_CONFIGURATION_BYTES
+            self._NETWORK_CONFIGURATION_BYTES +
+            self._CORE_CONFIGURATION_BYTES +
+            self._EXAMPLE_SET_BYTES +
+            self._EXAMPLES_BYTES +
+            self._KEYS_BYTES +
+            self._STAGE_CONFIGURATION_BYTES
         )
 
+    @property
+    def network (self):
+        return self._network
+
     @property
     def group (self):
         return self._group
 
+    @property
+    def subgroup (self):
+        return self._subgroup
+
+    @property
+    def index (self):
+        return self._index
+
     @property
     def fwd_link (self):
         return self._fwd_link
@@ -144,22 +141,70 @@ def config (self):
               uint         num_units;
               scoreboard_t fwd_expect;
               scoreboard_t bkp_expect;
-              scoreboard_t ldsa_expect;
-              scoreboard_t ldst_expect;
+              scoreboard_t lds_expect;
               uchar        is_first_group;
+              uchar        is_tree_root;
             } s_conf_t;
 
             pack: standard sizes, little-endian byte order,
             explicit padding
         """
+        # check if first group in the network
+        if self.group == self.network.groups[0]:
+            is_first_group = 1
+        else:
+            is_first_group = 0
+
+        # number of vertices in this SumVertex tree
+        num_vrt = ((self.network.subgroups - 2) //
+                   (MLPConstants.MAX_S_CORE_LINKS - 1)) + 1
 
-        return struct.pack ("<5IB3x",
-                            self.group.units,
-                            self._fwd_expect,
-                            self._bkp_expect,
-                            self._ldsa_expect,
-                            self._ldst_expect,
-                            self._is_first_group
+        lvs = ((num_vrt - 1) * (MLPConstants.MAX_S_CORE_LINKS - 1))
+
+        # number of expected packets
+        if self.index == (num_vrt - 1):
+            # the last vertex in the tree may expect fewer packets
+            #NOTE: this could be the root in a single-vertex tree
+            expected = self.network.subgroups - lvs
+        else:
+            expected = MLPConstants.MAX_S_CORE_LINKS
+
+        # keep track of these on a unit-by-unit basis
+        fwd_expect = expected
+        bkp_expect = expected
+
+        # keep track of the total, not unit-by-unit, count of lds packets
+        k = lvs // MLPConstants.MAX_S_CORE_LINKS
+        if self.index > (num_vrt - 2 - k):
+            # lds packets from w cores only
+            lds_expect = expected * self._units
+        elif self.index == (num_vrt - 2 - k):
+            # lds packets from w cores and other s cores
+            wp = lvs % MLPConstants.MAX_S_CORE_LINKS
+            sp = MLPConstants.MAX_S_CORE_LINKS - wp
+            lds_expect = wp * self._units + sp
+        else:
+            # lds packets from other s cores only
+            lds_expect = MLPConstants.MAX_S_CORE_LINKS
+
+        # first subgroup expects a partial lds from every other subgroup
+        if self.index == 0 and self.subgroup == 0:
+            lds_expect += self.group.subgroups - 1
+
+            # first group expects a partial lds from every other group
+            if is_first_group:
+                lds_expect += len (self.network.groups) - 1
+
+        # is this the root of a SumVertex tree?
+        is_tree_root = self.index == 0
+
+        return struct.pack ("<4I2B2x",
+                            self._units,
+                            fwd_expect,
+                            bkp_expect,
+                            lds_expect,
+                            is_first_group,
+                            is_tree_root
                             )
 
     @property
@@ -173,7 +218,7 @@ def resources_required (self):
 
     @overrides (AbstractProvidesNKeysForPartition.get_n_keys_for_partition)
     def get_n_keys_for_partition (self, partition, graph_mapper):
-        return self._n_keys
+        return MLPConstants.KEY_SPACE_SIZE
 
 
     @overrides(MachineDataSpecableVertex.generate_machine_data_specification)
@@ -186,17 +231,17 @@ def generate_machine_data_specification(
 
         # Reserve and write the network configuration region
         spec.reserve_memory_region (MLPRegions.NETWORK.value,
-                                    self._N_NETWORK_CONFIGURATION_BYTES)
+                                    self._NETWORK_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.NETWORK.value)
 
         # write the network configuration into spec
-        for c in self._network.network_config:
+        for c in self.network.network_config:
             spec.write_value (c, data_type = DataType.UINT8)
 
         # Reserve and write the core configuration region
         spec.reserve_memory_region (MLPRegions.CORE.value,
-                                    self._N_CORE_CONFIGURATION_BYTES)
+                                    self._CORE_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.CORE.value)
 
@@ -206,7 +251,7 @@ def generate_machine_data_specification(
 
         # Reserve and write the example set region
         spec.reserve_memory_region (MLPRegions.EXAMPLE_SET.value,
-                                    self._N_EXAMPLE_SET_BYTES)
+                                    self._EXAMPLE_SET_BYTES)
 
         spec.switch_write_focus (MLPRegions.EXAMPLE_SET.value)
 
@@ -216,7 +261,7 @@ def generate_machine_data_specification(
 
         # Reserve and write the examples region
         spec.reserve_memory_region (MLPRegions.EXAMPLES.value,
-                                    self._N_EXAMPLES_BYTES)
+                                    self._EXAMPLES_BYTES)
 
         spec.switch_write_focus (MLPRegions.EXAMPLES.value)
 
@@ -227,7 +272,7 @@ def generate_machine_data_specification(
 
         # Reserve and write the routing region
         spec.reserve_memory_region (MLPRegions.ROUTING.value,
-                                    self._N_KEYS_BYTES)
+                                    self._KEYS_BYTES)
 
         spec.switch_write_focus (MLPRegions.ROUTING.value)
 
@@ -239,9 +284,12 @@ def generate_machine_data_specification(
         spec.write_value (routing_info.get_first_key_from_pre_vertex (
             self, self.bkp_link), data_type = DataType.UINT32)
 
-        # write link keys: fds
-        spec.write_value (routing_info.get_first_key_from_pre_vertex (
-            self, self.fds_link), data_type = DataType.UINT32)
+        # write link keys: fds (padding if not SumVertex tree root)
+        if (self.index == 0):
+            spec.write_value (routing_info.get_first_key_from_pre_vertex (
+                self, self.fds_link), data_type = DataType.UINT32)
+        else:
+            spec.write_value (0, data_type = DataType.UINT32)
 
         # write link keys: stp (padding)
         spec.write_value (0, data_type = DataType.UINT32)
@@ -252,12 +300,12 @@ def generate_machine_data_specification(
 
         # Reserve and write the stage configuration region
         spec.reserve_memory_region (MLPRegions.STAGE.value,
-                                    self._N_STAGE_CONFIGURATION_BYTES)
+                                    self._STAGE_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.STAGE.value)
 
         # write the stage configuration into spec
-        for c in self._network.stage_config:
+        for c in self.network.stage_config:
             spec.write_value (c, data_type = DataType.UINT8)
 
         spec.end_specification ()
@@ -267,12 +315,12 @@ def generate_machine_data_specification(
     def regenerate_data_specification(self, spec, placement):
         # Reserve and write the stage configuration region
         spec.reserve_memory_region (MLPRegions.STAGE.value,
-                                    self._N_STAGE_CONFIGURATION_BYTES)
+                                    self._STAGE_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.STAGE.value)
 
         # write the stage configuration into spec
-        for c in self._network.stage_config:
+        for c in self.network.stage_config:
             spec.write_value (c, data_type = DataType.UINT8)
 
         spec.end_specification()
@@ -290,3 +338,104 @@ def mark_regions_reloaded(self):
         """
         # prepare for next stage
         self._stage += 1
+
+
+#---------------------------------------------------------------------
+class SumVertexTree(
+        ):
+
+    """ implements a tree of sum vertices: every vertex has MAX_S_CORE_LINKS
+        link slots, used by child tree vertices or by mapped pre-vertices """
+
+    def __init__(self,
+                 network,
+                 group,
+                 subgroup
+                 ):
+
+        max_links = MLPConstants.MAX_S_CORE_LINKS
+
+        # total number of Sum Vertices needed to build the tree
+        num_vrt = ((network.subgroups - 2) // (max_links - 1)) + 1
+
+        # the root vertex (index 0) is used as pre-vertex for outgoing links
+        self._root = SumVertex (network, group, subgroup, 0)
+
+        # add the root to the graph
+        gfe.add_machine_vertex_instance (self.root)
+
+        # and to the list of all tree vertices
+        self._vertices = [self.root]
+
+        # create the rest of the tree, attaching each new vertex to to_vrt
+        free_links = max_links
+        to_vrt = 0
+        for vrt in range (1, num_vrt):
+            # create a SumVertex
+            vt = SumVertex (network, group, subgroup, vrt)
+
+            # add it to the list of vertices
+            self._vertices.append (vt)
+
+            # add it to the graph
+            gfe.add_machine_vertex_instance (vt)
+
+            # link it (fwd, bkp and lds) to its parent, towards the tree root
+            gfe.add_machine_edge_instance (
+                MachineEdge (vt, self.vertices[to_vrt]), vt.fwd_link
+                )
+
+            gfe.add_machine_edge_instance (
+                MachineEdge (vt, self.vertices[to_vrt]), vt.bkp_link
+                )
+
+            gfe.add_machine_edge_instance (
+                MachineEdge (vt, self.vertices[to_vrt]), vt.lds_link
+                )
+
+            # take away one free link from vertex to_vrt
+            free_links -= 1
+
+            # if out of free links use next available vertex
+            if free_links == 0:
+                free_links = max_links
+                to_vrt += 1
+
+        # finally, map every (group id, subgroup) to a vertex with free links
+        # NOTE: continues from the to_vrt/free_links state left by the loop above
+        self._leaf_map = {}
+        for grp in network.groups:
+            for sgrp in range (grp.subgroups):
+                # assign available leaf vertex
+                self._leaf_map[(grp.id, sgrp)] = self.vertices[to_vrt]
+
+                # take away one free link from vertex to_vrt
+                free_links -= 1
+
+                # if out of free links use next available vertex
+                if free_links == 0:
+                    free_links = max_links
+                    to_vrt += 1
+
+
+    def leaf (self, group, subgroup):
+        """ returns the leaf SumVertex to link to
+            from a pre-vertex in group/subgroup
+
+        :param group:    pre-vertex group (looked up by its id)
+        :param subgroup: pre-vertex subgroup number
+
+        :type group:    MLPGroup
+        :type subgroup: integer
+
+        :return: the SumVertex mapped to (group.id, subgroup) in __init__
+        """
+        return self._leaf_map[(group.id, subgroup)]
+
+
+    @property
+    def root (self):
+        return self._root  # the SumVertex created with index 0
+
+    @property
+    def vertices (self):
+        return self._vertices  # all tree vertices, root first, in index order
diff --git a/spinn_pdp2/threshold_vertex.py b/spinn_pdp2/threshold_vertex.py
index ea01205..15999f2 100644
--- a/spinn_pdp2/threshold_vertex.py
+++ b/spinn_pdp2/threshold_vertex.py
@@ -47,130 +47,121 @@ class ThresholdVertex(
 
     def __init__(self,
                  network,
-                 group
+                 group,
+                 subgroup
                  ):
 
-        # place OUTPUT groups "close" to the host
-        if group.output_grp:
-            constraints = [ChipAndCoreConstraint (x = 0, y = 0)]
-        else:
-            constraints = None
+        self._network  = network
+        self._group    = group
+        self._subgroup = subgroup
 
         super(ThresholdVertex, self).__init__(
-            label = "t_core{}".format (group.id),
+            label = f"t_core{self.group.id}/{self.subgroup}",
             binary_name = "threshold.aplx",
-            constraints = constraints)
+            constraints = None)
 
         self._stage = 0
 
         # application-level data
-        self._network = network
-        self._group   = group
-        self._set_cfg = network._ex_set.set_config
-        self._ex_cfg  = network._ex_set.example_config
-        self._ev_cfg  = network._ex_set.event_config
+        self._set_cfg = self.network.ex_set.set_config
+        self._ex_cfg  = self.network.ex_set.example_config
+        self._ev_cfg  = self.network.ex_set.event_config
 
         # application parameters
-        self._out_integr_dt = 1.0 / network.ticks_per_int
+        self._out_integr_dt = 1.0 / self.network.ticks_per_int
 
-        # choose appropriate group criteria
         if self.group.test_group_crit is not None:
             self._tst_group_criterion = self.group.test_group_crit
-        elif network._test_group_crit is not None:
-            self._tst_group_criterion = network._test_group_crit
+        elif self.network.test_group_crit is not None:
+            self._tst_group_criterion = self.network.test_group_crit
         else:
             self._tst_group_criterion = MLPConstants.DEF_GRP_CRIT
 
         if self.group.train_group_crit is not None:
             self._trn_group_criterion = self.group.train_group_crit
-        elif network._train_group_crit is not None:
-            self._trn_group_criterion = network._train_group_crit
+        elif self.network.train_group_crit is not None:
+            self._trn_group_criterion = self.network.train_group_crit
         else:
             self._trn_group_criterion = MLPConstants.DEF_GRP_CRIT
 
-        # check if last output group in daisy chain
-        if self.group == network.output_chain[-1]:
-            self._is_last_output_group = 1
-        else:
-            self._is_last_output_group = 0
+        # forward, backprop and stop link names
+        self._fwd_link = f"fwd_t{self.group.id}/{self.subgroup}"
+        self._bkp_link = f"bkp_t{self.group.id}/{self.subgroup}"
+        self._stp_link = f"stp_t{self.group.id}/{self.subgroup}"
 
-        # forward, backprop and stop link partition names
-        self._fwd_link = []
-        for p in range (self._group.partitions):
-            self._fwd_link.append ("fwd_t{}_{}".format (self.group.id, p))
-        self._bkp_link = "bkp_t{}".format (self.group.id)
-        self._stp_link = "stp_t{}".format (self.group.id)
+        # threshold core-specific parameters
+        self._units = self.group.subunits[self.subgroup]
 
-        # reserve key space for every link
-        self._n_keys = MLPConstants.KEY_SPACE_SIZE
+        # first output subgroup has special functions
+        self._is_first_out = self.group.is_first_out and (self.subgroup == 0)
 
-        # configuration and data files
-        # find out the size of an integer!
-        _data_int = DataType.INT32
+        # last output subgroup has special functions
+        self._is_last_out = ((self.group == self.network.output_chain[-1]) and
+                             (self.subgroup == (self.group.subgroups - 1)))
 
+        # configuration and data sizes
         # network configuration structure
-        self._N_NETWORK_CONFIGURATION_BYTES = \
-            len (self.network.network_config)
+        self._NETWORK_CONFIGURATION_BYTES = len (self.network.network_config)
 
         # core configuration structure
-        self._N_CORE_CONFIGURATION_BYTES = \
-            len (self.config)
+        self._CORE_CONFIGURATION_BYTES = len (self.config)
 
         # set configuration structure
-        self._N_EXAMPLE_SET_BYTES = \
-            len (self._set_cfg)
+        self._EXAMPLE_SET_BYTES = len (self._set_cfg)
 
         # list of example configurations
-        self._N_EXAMPLES_BYTES = \
-            len (self._ex_cfg) * len (self._ex_cfg[0])
+        self._EXAMPLES_BYTES = len (self._ex_cfg) * len (self._ex_cfg[0])
 
         # list of event configurations
-        self._N_EVENTS_BYTES = \
-            len (self._ev_cfg) * len (self._ev_cfg[0])
+        self._EVENTS_BYTES = len (self._ev_cfg) * len (self._ev_cfg[0])
 
-        # list of group inputs (empty if not an INPUT group)
-        self._N_INPUTS_BYTES = \
-            len (self._group.inputs) * _data_int.size
+        # list of subgroup inputs (empty if not an INPUT group)
+        if self.group.input_grp:
+            self._INPUTS_BYTES = ((len (self.group.inputs) // self.group.units) *
+                                  self._units * DataType.INT32.size)
+        else:
+            self._INPUTS_BYTES = 0
 
-        # list of group targets (empty if not an OUTPUT group)
-        self._N_TARGETS_BYTES = \
-            len (self._group.targets) * _data_int.size
+        # list of subgroup targets (empty if not an OUTPUT group)
+        if self.group.output_grp:
+            self._TARGETS_BYTES = ((len (self.group.targets) // self.group.units) *
+                                  self._units * DataType.INT32.size)
+        else:
+            self._TARGETS_BYTES = 0
 
-        # keys are integers
-        # t cores require a different key for every group partition
-        self._N_KEYS_BYTES =  _data_int.size * \
-            (MLPConstants.NUM_KEYS_REQ + self._group.partitions)
+        # list of routing keys
+        self._KEYS_BYTES = MLPConstants.NUM_KEYS_REQ * DataType.INT32.size
 
         # stage configuration structure
-        self._N_STAGE_CONFIGURATION_BYTES = \
-            len (self.network.stage_config)
+        self._STAGE_CONFIGURATION_BYTES = len (self.network.stage_config)
 
         # reserve SDRAM space used to store historic data
-        self._TARGET_HISTORY_BYTES = (MLPConstants.ACTIV_SIZE // 8) * \
-            self.group.units * self.network.global_max_ticks
+        #NOTE: MLPConstants sizes are in bits
+        self._TARGET_HISTORY_BYTES = ((MLPConstants.ACTIV_SIZE // 8) *
+            self._units * self.network.global_max_ticks)
 
-        self._OUT_DERIV_HISTORY_BYTES = (MLPConstants.LONG_DERIV_SIZE // 8) * \
-            self.group.units * self.network.global_max_ticks
+        self._OUT_DERIV_HISTORY_BYTES = ((MLPConstants.LONG_DERIV_SIZE // 8) *
+            self._units * self.network.global_max_ticks)
 
-        self._NET_HISTORY_BYTES = (MLPConstants.NET_SIZE // 8) * \
-            self.group.units * self.network.global_max_ticks
+        self._NET_HISTORY_BYTES = ((MLPConstants.NET_SIZE // 8) *
+            self._units * self.network.global_max_ticks)
 
-        self._OUTPUT_HISTORY_BYTES = (MLPConstants.ACTIV_SIZE // 8) * \
-            self.group.units * self.network.global_max_ticks
+        self._OUTPUT_HISTORY_BYTES = ((MLPConstants.ACTIV_SIZE // 8) *
+            self._units * self.network.global_max_ticks)
 
         # recording info region size
         if self.group.output_grp:
             # number of recording channels
-            NUM_REC_CHANNS = len(MLPVarSizeRecordings) + \
-                len(MLPConstSizeRecordings)
+            NUM_REC_CHANNS = (len(MLPVarSizeRecordings) +
+                              len(MLPConstSizeRecordings))
 
-            # first output group has extra recording channels
-            if self.group.is_first_out:
+            # first output group/subgroup has extra recording channels
+            if self._is_first_out:
                 # number of extra recording channels
                 NUM_REC_CHANNS += len(MLPExtraRecordings)
 
-            self._REC_INFO_BYTES = \
-                recording_utilities.get_recording_header_size(NUM_REC_CHANNS)
+            self._REC_INFO_BYTES = (
+                recording_utilities.get_recording_header_size(NUM_REC_CHANNS))
         else:
             self._REC_INFO_BYTES = 0
 
@@ -178,7 +169,7 @@ def __init__(self,
         if self.group.output_grp:
             # list of variable-size recording channel sizes
             self.VAR_CHANNEL_SIZES = [
-                self.group.units * (BYTES_PER_WORD // 2)  # OUTPUTS
+                self._units * (BYTES_PER_WORD // 2)  # OUTPUTS
                 ]
 
             # list of constant-size recording channel sizes
@@ -187,7 +178,7 @@ def __init__(self,
                 ]
 
             # list of extra recording channel sizes
-            if self.group.is_first_out:
+            if self._is_first_out:
                 # list of extra recording channel sizes
                 self.EXTRA_CHANNEL_SIZES = [
                     4 * BYTES_PER_WORD  # TICK_DATA
@@ -206,15 +197,15 @@ def __init__(self,
         # configuration data plus application core SDRAM usage
         self._sdram_fixed = (
             SYSTEM_BYTES_REQUIREMENT +
-            self._N_NETWORK_CONFIGURATION_BYTES +
-            self._N_CORE_CONFIGURATION_BYTES +
-            self._N_EXAMPLE_SET_BYTES +
-            self._N_EXAMPLES_BYTES +
-            self._N_EVENTS_BYTES +
-            self._N_INPUTS_BYTES +
-            self._N_TARGETS_BYTES +
-            self._N_KEYS_BYTES +
-            self._N_STAGE_CONFIGURATION_BYTES +
+            self._NETWORK_CONFIGURATION_BYTES +
+            self._CORE_CONFIGURATION_BYTES +
+            self._EXAMPLE_SET_BYTES +
+            self._EXAMPLES_BYTES +
+            self._EVENTS_BYTES +
+            self._INPUTS_BYTES +
+            self._TARGETS_BYTES +
+            self._KEYS_BYTES +
+            self._STAGE_CONFIGURATION_BYTES +
             self._TARGET_HISTORY_BYTES +
             self._OUT_DERIV_HISTORY_BYTES +
             self._NET_HISTORY_BYTES +
@@ -236,6 +227,10 @@ def network (self):
     def group (self):
         return self._group
 
+    @property
+    def subgroup (self):
+        return self._subgroup  # subgroup argument stored by __init__
+
     @property
     def fwd_link (self):
         return self._fwd_link
@@ -257,9 +252,8 @@ def config (self):
             {
               uchar         output_grp;
               uchar         input_grp;
+              uchar         is_last_sgrp;
               uint          num_units;
-              uint          partitions;
-              uint          write_blk;
               uchar         hard_clamp_en;
               uchar         out_integr_en;
               fpreal        out_integr_dt;
@@ -269,39 +263,53 @@ def config (self):
               activation_t  initOutput;
               error_t       tst_group_criterion;
               error_t       trn_group_criterion;
+              uint          crit_expected;
               uchar         criterion_function;
-              uchar         is_first_output_group;
-              uchar         is_last_output_group;
+              uchar         is_first_output;
+              uchar         is_last_output;
               uchar         error_function;
             } t_conf_t;
 
             pack: standard sizes, little-endian byte order,
             explicit padding
         """
+        # is this the last subgroup in its group
+        last_sgrp = (self.subgroup == (self.group.subgroups - 1))
+
         # integration dt is an MLP fixed-point fpreal
-        out_integr_dt = int (self._out_integr_dt *\
+        out_integr_dt = int (self._out_integr_dt *
                               (1 << MLPConstants.FPREAL_SHIFT))
 
         # weak_clamp_strength is an MLP fixed-point fpreal
-        weak_clamp_strength = int (self.group.weak_clamp_strength *\
+        weak_clamp_strength = int (self.group.weak_clamp_strength *
                            (1 << MLPConstants.FPREAL_SHIFT))
 
         # init output is an MLP fixed-point activation_t
-        init_output = int (self.group.init_output *\
+        init_output = int (self.group.init_output *
                            (1 << MLPConstants.ACTIV_SHIFT))
 
         # group criteria are MLP fixed-point error_t
-        tst_group_criterion = int (self._tst_group_criterion *\
+        tst_group_criterion = int (self._tst_group_criterion *
                                 (1 << MLPConstants.ERROR_SHIFT))
-        trn_group_criterion = int (self._trn_group_criterion *\
+        trn_group_criterion = int (self._trn_group_criterion *
                                 (1 << MLPConstants.ERROR_SHIFT))
 
-        return struct.pack ("<2B2x3I2B2xi6I4i4B",
+        # criterion packets to be expected
+        if last_sgrp:
+            # expect from every other subgroup
+            crit_expected = self.group.subgroups - 1
+
+            # last group also expects from every other group
+            if self._is_last_out:
+                crit_expected += len (self.network.groups) - 1
+        else:
+            crit_expected = 0
+
+        return struct.pack ("<3BxI2B2xi6I4iI4B",
                             self.group.output_grp,
                             self.group.input_grp,
-                            self.group.units,
-                            self.group.partitions,
-                            self.group.write_blk,
+                            last_sgrp,
+                            self._units,
                             self.group.hard_clamp_en,
                             self.group.out_integr_en,
                             out_integr_dt,
@@ -315,9 +323,10 @@ def config (self):
                             init_output,
                             tst_group_criterion,
                             trn_group_criterion,
+                            crit_expected,
                             self.group.criterion_function.value,
-                            self.group.is_first_out,
-                            self._is_last_output_group,
+                            self._is_first_out,
+                            self._is_last_out,
                             self.group.error_function.value
                             )
 
@@ -337,7 +346,7 @@ def resources_required (self):
 
     @overrides (AbstractProvidesNKeysForPartition.get_n_keys_for_partition)
     def get_n_keys_for_partition (self, partition, graph_mapper):
-        return self._n_keys
+        return MLPConstants.KEY_SPACE_SIZE
 
 
     def read(self, placement, buffer_manager, channel):
@@ -372,7 +381,7 @@ def generate_machine_data_specification(
 
         # reserve and write the network configuration region
         spec.reserve_memory_region (MLPRegions.NETWORK.value,
-                                    self._N_NETWORK_CONFIGURATION_BYTES)
+                                    self._NETWORK_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.NETWORK.value)
 
@@ -382,7 +391,7 @@ def generate_machine_data_specification(
 
         # reserve and write the core configuration region
         spec.reserve_memory_region (MLPRegions.CORE.value,
-                                    self._N_CORE_CONFIGURATION_BYTES)
+                                    self._CORE_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.CORE.value)
 
@@ -392,7 +401,7 @@ def generate_machine_data_specification(
 
         # reserve and write the example set region
         spec.reserve_memory_region (MLPRegions.EXAMPLE_SET.value,
-                                    self._N_EXAMPLE_SET_BYTES)
+                                    self._EXAMPLE_SET_BYTES)
 
         spec.switch_write_focus (MLPRegions.EXAMPLE_SET.value)
 
@@ -402,7 +411,7 @@ def generate_machine_data_specification(
 
         # reserve and write the examples region
         spec.reserve_memory_region (MLPRegions.EXAMPLES.value,
-                                    self._N_EXAMPLES_BYTES)
+                                    self._EXAMPLES_BYTES)
 
         spec.switch_write_focus (MLPRegions.EXAMPLES.value)
 
@@ -413,7 +422,7 @@ def generate_machine_data_specification(
 
         # reserve and write the events region
         spec.reserve_memory_region (MLPRegions.EVENTS.value,
-                                    self._N_EVENTS_BYTES)
+                                    self._EVENTS_BYTES)
 
         spec.switch_write_focus (MLPRegions.EVENTS.value)
 
@@ -423,47 +432,54 @@ def generate_machine_data_specification(
                 spec.write_value (c, data_type = DataType.UINT8)
 
         # reserve and write the input data region (if INPUT group)
-        if self._N_INPUTS_BYTES != 0:
+        if self.group.input_grp:
             spec.reserve_memory_region (MLPRegions.INPUTS.value,
-                                        self._N_INPUTS_BYTES)
+                                        self._INPUTS_BYTES)
 
             spec.switch_write_focus (MLPRegions.INPUTS.value)
 
             # write inputs to spec
-            for _i in self._group.inputs:
-                # inputs are MLP fixed-point activation_t
-                #NOTE: check for absent or NaN
-                if (_i is None) or (_i != _i):
-                    _inp = MLPConstants.ACTIV_NaN
-                else:
-                    _inp = int (_i * (1 << MLPConstants.ACTIV_SHIFT))
-                spec.write_value (_inp, data_type = DataType.UINT32)
+            us = self.subgroup * MLPConstants.MAX_SUBGROUP_UNITS
+            for _ in range (len (self.group.inputs) // self.group.units):
+                for i in self.group.inputs[us : us + self._units]:
+                    # inputs are fixed-point activation_t
+                    #NOTE: check for absent or NaN
+                    if (i is None) or (i != i):
+                        inp = MLPConstants.ACTIV_NaN
+                    else:
+                        inp = int (i * (1 << MLPConstants.ACTIV_SHIFT))
+                    spec.write_value (inp, data_type = DataType.UINT32)
+                us += self.group.units
 
         # reserve and write the target data region
-        if self._N_TARGETS_BYTES != 0:
+        if self.group.output_grp:
             spec.reserve_memory_region (MLPRegions.TARGETS.value,
-                                        self._N_TARGETS_BYTES)
+                                        self._TARGETS_BYTES)
 
             spec.switch_write_focus (MLPRegions.TARGETS.value)
 
             # write targets to spec
-            for _t in self._group.targets:
-                # targets are MLP fixed-point activation_t
-                #NOTE: check for absent or NaN
-                if (_t is None) or (_t != _t):
-                    _tgt = MLPConstants.ACTIV_NaN
-                else:
-                    _tgt = int (_t * (1 << MLPConstants.ACTIV_SHIFT))
-                spec.write_value (_tgt, data_type = DataType.UINT32)
+            us = self.subgroup * MLPConstants.MAX_SUBGROUP_UNITS
+            for _ in range (len (self.group.targets) // self.group.units):
+                for t in self.group.targets[us : us + self._units]:
+                    # targets are fixed-point activation_t
+                    #NOTE: check for absent or NaN
+                    if (t is None) or (t != t):
+                        tgt = MLPConstants.ACTIV_NaN
+                    else:
+                        tgt = int (t * (1 << MLPConstants.ACTIV_SHIFT))
+                    spec.write_value (tgt, data_type = DataType.UINT32)
+                us += self.group.units
 
         # reserve and write the routing region
         spec.reserve_memory_region (MLPRegions.ROUTING.value,
-                                    self._N_KEYS_BYTES)
+                                    self._KEYS_BYTES)
 
         spec.switch_write_focus (MLPRegions.ROUTING.value)
 
-        # write link keys: fwd (padding - keys written below)
-        spec.write_value (0, data_type = DataType.UINT32)
+        # write link keys: fwd
+        spec.write_value (routing_info.get_first_key_from_pre_vertex (
+            self, self.fwd_link), data_type = DataType.UINT32)
 
         # write link keys: bkp
         spec.write_value (routing_info.get_first_key_from_pre_vertex (
@@ -473,24 +489,15 @@ def generate_machine_data_specification(
         spec.write_value (0, data_type = DataType.UINT32)
 
         # write link keys: stp
-        # stop key for OUTPUT groups only
-        if self.group.output_grp:
-            spec.write_value (routing_info.get_first_key_from_pre_vertex (
-                self, self.stp_link), data_type = DataType.UINT32)
-        else:
-            spec.write_value (0, data_type = DataType.UINT32)
+        spec.write_value (routing_info.get_first_key_from_pre_vertex (
+            self, self.stp_link), data_type = DataType.UINT32)
 
         # write link keys: lds (padding)
         spec.write_value (0, data_type = DataType.UINT32)
 
-        # write link keys: fwdt
-        for p in range (self.group.partitions):
-            spec.write_value (routing_info.get_first_key_from_pre_vertex (
-                self, self.fwd_link[p]), data_type = DataType.UINT32)
-
         # reserve and write the stage configuration region
         spec.reserve_memory_region (MLPRegions.STAGE.value,
-                                    self._N_STAGE_CONFIGURATION_BYTES)
+                                    self._STAGE_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.STAGE.value)
 
@@ -504,11 +511,11 @@ def generate_machine_data_specification(
                 region = MLPRegions.REC_INFO.value,
                 size = self._REC_INFO_BYTES
                 )
-    
+
             # write the actual recording channel sizes for a stage
             _sizes = [data_n_steps * sz for sz in self.VAR_CHANNEL_SIZES]
             _sizes.extend([sz for sz in self.CONST_CHANNEL_SIZES])
-            if self.group.is_first_out:
+            if self._is_first_out:
                 _sizes.extend(
                     [data_n_steps * sz for sz in self.EXTRA_CHANNEL_SIZES]
                     )
@@ -525,7 +532,7 @@ def generate_machine_data_specification(
     def regenerate_data_specification(self, spec, placement):
         # reserve and write the stage configuration region
         spec.reserve_memory_region (MLPRegions.STAGE.value,
-                                    self._N_STAGE_CONFIGURATION_BYTES)
+                                    self._STAGE_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.STAGE.value)
 
@@ -557,7 +564,7 @@ def get_recorded_region_ids(self):
             ids.extend([ch.value for ch in MLPConstSizeRecordings])
 
             # first output group has additional recording channels
-            if self.group.is_first_out:
+            if self._is_first_out:
                 ids.extend([ch.value for ch in MLPExtraRecordings])
 
             return ids
diff --git a/spinn_pdp2/weight_vertex.py b/spinn_pdp2/weight_vertex.py
index b0b9aed..aa60daf 100644
--- a/spinn_pdp2/weight_vertex.py
+++ b/spinn_pdp2/weight_vertex.py
@@ -38,130 +38,106 @@ class WeightVertex(
     def __init__(self,
                  network,
                  group,
+                 subgroup,
                  from_group,
-                 col_blk,
-                 row_blk
+                 from_subgroup
                  ):
 
+        self._network       = network
+        self._group         = group
+        self._from_group    = from_group
+        self._subgroup      = subgroup
+        self._from_subgroup = from_subgroup
+
         super(WeightVertex, self).__init__(
-            label = f"w_core{group.id}_{from_group.id}_{row_blk}_{col_blk}",
+            label = (f"w_core{self.group.id}/{self.subgroup}"
+                     f"_{self.from_group.id}/{self.from_subgroup}"),
             binary_name = "weight.aplx",
             constraints = None)
 
         self._stage = 0
 
         # application-level data
-        self._network    = network
-        self._group      = group
-        self._from_group = from_group
-        self._col_blk    = col_blk
-        self._row_blk    = row_blk
-        self._set_cfg    = network._ex_set.set_config
-        self._ex_cfg     = network._ex_set.example_config
-
-        # compute number of rows and columns
-        if self._row_blk != (self.from_group.partitions - 1):
-            self._num_rows = MLPConstants.MAX_BLK_UNITS
-        else:
-            _r = self.from_group.units % MLPConstants.MAX_BLK_UNITS
-            if _r == 0:
-                self._num_rows = MLPConstants.MAX_BLK_UNITS
-            else:
-                self._num_rows = _r
-
-        if self._col_blk != (self.group.partitions - 1):
-            self._num_cols = MLPConstants.MAX_BLK_UNITS
-        else:
-            _r = self.group.units % MLPConstants.MAX_BLK_UNITS
-            if _r == 0:
-                self._num_cols = MLPConstants.MAX_BLK_UNITS
-            else:
-                self._num_cols = _r
-
-        # forward, backprop and link delta summation link partition names
-        self._fwd_link = "fwd_w{}_{}".format (self.group.id,
-                                              self.from_group.id)
-        self._bkp_link = "bkp_w{}_{}".format (self.group.id,
-                                              self.from_group.id)
-        self._lds_link = "lds_w{}_{}".format (self.group.id,
-                                              self.from_group.id)
-
-        # reserve key space for every link
-        self._n_keys = MLPConstants.KEY_SPACE_SIZE
+        self._set_cfg = self.network.ex_set.set_config
+        self._ex_cfg  = self.network.ex_set.example_config
 
-        # choose weight core-specific parameters
+        # application parameters
         if len (self.group.weights[self.from_group]):
             if self.group.learning_rate is not None:
-                self.learning_rate = self.group.learning_rate
-            elif network._learning_rate is not None:
-                self.learning_rate = network._learning_rate
+                self._learning_rate = self.group.learning_rate
+            elif network.learning_rate is not None:
+                self._learning_rate = network.learning_rate
             else:
-                self.learning_rate = MLPConstants.DEF_LEARNING_RATE
+                self._learning_rate = MLPConstants.DEF_LEARNING_RATE
 
             if self.group.weight_decay is not None:
-                self.weight_decay = self.group.weight_decay
-            elif network._weight_decay is not None:
-                self.weight_decay = network._weight_decay
+                self._weight_decay = self.group.weight_decay
+            elif network.weight_decay is not None:
+                self._weight_decay = network.weight_decay
             else:
-                self.weight_decay = MLPConstants.DEF_WEIGHT_DECAY
+                self._weight_decay = MLPConstants.DEF_WEIGHT_DECAY
 
             if self.group.momentum is not None:
-                self.momentum = self.group.momentum
-            elif network._momentum is not None:
-                self.momentum = network._momentum
+                self._momentum = self.group.momentum
+            elif network.momentum is not None:
+                self._momentum = network.momentum
             else:
-                self.momentum = MLPConstants.DEF_MOMENTUM
+                self._momentum = MLPConstants.DEF_MOMENTUM
         else:
-            self.learning_rate = 0
-            self.weight_decay = 0
-            self.momentum = 0
+            self._learning_rate = 0
+            self._weight_decay = 0
+            self._momentum = 0
 
-        # weight update function
-        self.update_function = network._update_function
+        # forward, backprop and link delta summation link names
+        self._fwd_link = (f"fwd_w{self.group.id}/{self.subgroup}"
+                          f"_{self.from_group.id}/{self.from_subgroup}")
 
-        # configuration and data files
-        # find out the size of an integer!
-        _data_int = DataType.INT32
+        self._bkp_link = (f"bkp_w{self.group.id}/{self.subgroup}"
+                          f"_{self.from_group.id}/{self.from_subgroup}")
 
+        self._lds_link = (f"lds_w{self.group.id}/{self.subgroup}"
+                          f"_{self.from_group.id}/{self.from_subgroup}")
+
+        # weight core-specific parameters
+        # weight matrix parameters
+        self._num_rows = self.from_group.subunits[self.from_subgroup]
+        self._num_cols = self.group.subunits[self.subgroup]
+
+        # configuration and data sizes
         # network configuration structure
-        self._N_NETWORK_CONFIGURATION_BYTES = \
-            len (self._network.network_config)
+        self._NETWORK_CONFIGURATION_BYTES = len (self.network.network_config)
 
         # core configuration structure
-        self._N_CORE_CONFIGURATION_BYTES = \
-            len (self.config)
+        self._CORE_CONFIGURATION_BYTES = len (self.config)
 
         # set configuration structure
-        self._N_EXAMPLE_SET_BYTES = \
-            len (self._set_cfg)
+        self._EXAMPLE_SET_BYTES = len (self._set_cfg)
 
         # list of example configurations
-        self._N_EXAMPLES_BYTES = \
-            len (self._ex_cfg) * len (self._ex_cfg[0])
+        self._EXAMPLES_BYTES = len (self._ex_cfg) * len (self._ex_cfg[0])
 
         # each weight is an integer
-        self._N_WEIGHTS_BYTES = \
-            self.group.units * self.from_group.units * _data_int.size
+        self._WEIGHTS_BYTES = (self._num_rows *
+                               self._num_cols * DataType.INT32.size)
 
-        # keys are integers
-        self._N_KEYS_BYTES = MLPConstants.NUM_KEYS_REQ * _data_int.size
+        # list of routing keys
+        self._KEYS_BYTES = MLPConstants.NUM_KEYS_REQ * DataType.INT32.size
 
         # stage configuration structure
-        self._N_STAGE_CONFIGURATION_BYTES = \
-            len (self._network.stage_config)
+        self._STAGE_CONFIGURATION_BYTES = len (self.network.stage_config)
 
         # reserve SDRAM space used to store historic data
-        self._OUTPUT_HISTORY_BYTES = (MLPConstants.ACTIV_SIZE // 8) * \
-            self.group.units * self._network.global_max_ticks
+        self._OUTPUT_HISTORY_BYTES = ((MLPConstants.ACTIV_SIZE // 8) *
+            self.group.units * self.network.global_max_ticks)
 
         self._sdram_usage = (
-            self._N_NETWORK_CONFIGURATION_BYTES + \
-            self._N_CORE_CONFIGURATION_BYTES + \
-            self._N_EXAMPLE_SET_BYTES + \
-            self._N_EXAMPLES_BYTES + \
-            self._N_WEIGHTS_BYTES + \
-            self._N_KEYS_BYTES + \
-            self._N_STAGE_CONFIGURATION_BYTES + \
+            self._NETWORK_CONFIGURATION_BYTES +
+            self._CORE_CONFIGURATION_BYTES +
+            self._EXAMPLE_SET_BYTES +
+            self._EXAMPLES_BYTES +
+            self._WEIGHTS_BYTES +
+            self._KEYS_BYTES +
+            self._STAGE_CONFIGURATION_BYTES +
             self._OUTPUT_HISTORY_BYTES
         )
 
@@ -189,21 +165,25 @@ def cast_float_to_weight (self,
         # return an MLP fixed-point weight_t
         return (int (wtemp * (1 << MLPConstants.WEIGHT_SHIFT)))
 
+    @property
+    def network (self):
+        return self._network
+
     @property
     def group (self):
         return self._group
 
     @property
-    def from_group (self):
-        return self._from_group
+    def subgroup (self):
+        return self._subgroup
 
     @property
-    def row_blk (self):
-        return self._row_blk
+    def from_group (self):
+        return self._from_group
 
     @property
-    def col_blk (self):
-        return self._col_blk
+    def from_subgroup (self):
+        return self._from_subgroup
 
     @property
     def fwd_link (self):
@@ -226,8 +206,6 @@ def config (self):
             {
               uint           num_rows;
               uint           num_cols;
-              uint           row_blk;
-              uint           col_blk;
               scoreboard_t   sync_expected;
               activation_t   initOutput;
               short_fpreal_t learningRate;
@@ -239,32 +217,30 @@ def config (self):
             explicit padding
         """
         # expect one sync packet from 'group' and one from 'from_group'
-        if self._group == self._from_group:
+        if self.group == self.from_group and self.subgroup == self.from_subgroup:
             sync_expected = 1
         else:
             sync_expected = 2
 
         # init output is an MLP fixed-point activation_t
-        init_output = int (self._from_group.init_output *\
+        init_output = int (self.from_group.init_output *\
                            (1 << MLPConstants.ACTIV_SHIFT))
 
         # learning_rate is an MLP short fixed-point fpreal
-        learning_rate = int (self.learning_rate *\
+        learning_rate = int (self._learning_rate *\
                               (1 << MLPConstants.SHORT_FPREAL_SHIFT))
 
         # weight_decay is an MLP short fixed-point fpreal
-        weight_decay = int (self.weight_decay *\
+        weight_decay = int (self._weight_decay *\
                               (1 << MLPConstants.SHORT_FPREAL_SHIFT))
 
         # momentum is an MLP short fixed-point fpreal
-        momentum = int (self.momentum *\
+        momentum = int (self._momentum *\
                               (1 << MLPConstants.SHORT_FPREAL_SHIFT))
 
-        return struct.pack ("<5Ii3h2x",
+        return struct.pack ("<3Ii3h2x",
                             self._num_rows,
                             self._num_cols,
-                            self._row_blk,
-                            self._col_blk,
                             sync_expected,
                             init_output,
                             learning_rate,
@@ -283,7 +259,7 @@ def resources_required (self):
 
     @overrides (AbstractProvidesNKeysForPartition.get_n_keys_for_partition)
     def get_n_keys_for_partition (self, partition, graph_mapper):
-        return self._n_keys
+        return MLPConstants.KEY_SPACE_SIZE
 
 
     @overrides(MachineDataSpecableVertex.generate_machine_data_specification)
@@ -296,17 +272,17 @@ def generate_machine_data_specification(
 
         # Reserve and write the network configuration region
         spec.reserve_memory_region (MLPRegions.NETWORK.value,
-                                    self._N_NETWORK_CONFIGURATION_BYTES)
+                                    self._NETWORK_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.NETWORK.value)
 
         # write the network configuration into spec
-        for c in self._network.network_config:
+        for c in self.network.network_config:
             spec.write_value (c, data_type = DataType.UINT8)
 
         # Reserve and write the core configuration region
         spec.reserve_memory_region (MLPRegions.CORE.value,
-                                    self._N_CORE_CONFIGURATION_BYTES)
+                                    self._CORE_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.CORE.value)
 
@@ -316,7 +292,7 @@ def generate_machine_data_specification(
 
         # Reserve and write the example set region
         spec.reserve_memory_region (MLPRegions.EXAMPLE_SET.value,
-                                    self._N_EXAMPLE_SET_BYTES)
+                                    self._EXAMPLE_SET_BYTES)
 
         spec.switch_write_focus (MLPRegions.EXAMPLE_SET.value)
 
@@ -326,7 +302,7 @@ def generate_machine_data_specification(
 
         # Reserve and write the examples region
         spec.reserve_memory_region (MLPRegions.EXAMPLES.value,
-                                    self._N_EXAMPLES_BYTES)
+                                    self._EXAMPLES_BYTES)
 
         spec.switch_write_focus (MLPRegions.EXAMPLES.value)
 
@@ -337,31 +313,29 @@ def generate_machine_data_specification(
 
         # Reserve and write the weights region
         spec.reserve_memory_region (MLPRegions.WEIGHTS.value,
-                                    self._N_WEIGHTS_BYTES)
+                                    self._WEIGHTS_BYTES)
 
         spec.switch_write_focus (MLPRegions.WEIGHTS.value)
 
         # weight matrix is kept in column-major order
         # and has to be written out in row-major order
-        _wts = self.group.weights[self.from_group]
-        _nrows = self.from_group.units
-        _nr = self._num_rows
-        _nc = self._num_cols
-        _rb = self._row_blk * MLPConstants.MAX_BLK_UNITS
-        _cb = self._col_blk * MLPConstants.MAX_BLK_UNITS
-        if len (_wts):
-            for _r in range (_nr):
-                for _c in range (_nc):
-                    _wt = self.cast_float_to_weight (
-                        _wts[(_cb + _c) * _nrows + (_rb + _r)])
-                    spec.write_value (_wt, data_type = DataType.INT32)
+        wts = self.group.weights[self.from_group]
+        rows_per_col = self.from_group.units
+        rb = self.from_subgroup * MLPConstants.MAX_SUBGROUP_UNITS
+        cb = self.subgroup * MLPConstants.MAX_SUBGROUP_UNITS
+        if len (wts):
+            for r in range (self._num_rows):
+                for c in range (self._num_cols):
+                    wt = self.cast_float_to_weight (
+                        wts[(cb + c) * rows_per_col + (rb + r)])
+                    spec.write_value (wt, data_type = DataType.INT32)
         else:
-            for _ in range (_nr * _nc):
+            for _ in range (self._num_rows * self._num_cols):
                 spec.write_value (0, data_type = DataType.INT32)
 
         # Reserve and write the routing region
         spec.reserve_memory_region (MLPRegions.ROUTING.value,
-                                    self._N_KEYS_BYTES)
+                                    self._KEYS_BYTES)
 
         spec.switch_write_focus (MLPRegions.ROUTING.value)
 
@@ -385,12 +359,12 @@ def generate_machine_data_specification(
 
         # Reserve and write the stage configuration region
         spec.reserve_memory_region (MLPRegions.STAGE.value,
-                                    self._N_STAGE_CONFIGURATION_BYTES)
+                                    self._STAGE_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.STAGE.value)
 
         # write the stage configuration into spec
-        for c in self._network.stage_config:
+        for c in self.network.stage_config:
             spec.write_value (c, data_type = DataType.UINT8)
 
         spec.end_specification ()
@@ -400,12 +374,12 @@ def generate_machine_data_specification(
     def regenerate_data_specification(self, spec, placement):
         # Reserve and write the stage configuration region
         spec.reserve_memory_region (MLPRegions.STAGE.value,
-                                    self._N_STAGE_CONFIGURATION_BYTES)
+                                    self._STAGE_CONFIGURATION_BYTES)
 
         spec.switch_write_focus (MLPRegions.STAGE.value)
 
         # write the stage configuration into spec
-        for c in self._network.stage_config:
+        for c in self.network.stage_config:
             spec.write_value (c, data_type = DataType.UINT8)
 
         spec.end_specification()