From eab7ca086bfa1274ed68d17d6e2ee6c43f5fd4e0 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Tue, 26 Oct 2021 01:32:07 -0700
Subject: [PATCH 01/11] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 3f81518f..b90bed69 100644
--- a/README.md
+++ b/README.md
@@ -520,3 +520,4 @@ If Gemmini helps you in your academic research, you are encouraged to cite our p
 # Acknowledgements
 
 - The Gemmini [logo](./img/full-logo.svg) was designed by Dima Nikiforov ([@CobbledSteel](https://github.com/CobbledSteel)).
+- This project was, in part, funded by the U.S. Government under the DARPA RTML program (contract FA8650-20-2-7006). The views and conclusions contained in this document are those of the authors and should not be interpreted as representing the official policies, either expressed or implied, of the U.S. Government.

From af73a9517f1c60870dbce977791157f8be326e7b Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Tue, 26 Oct 2021 09:23:09 -0700
Subject: [PATCH 02/11] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b90bed69..428a0dad 100644
--- a/README.md
+++ b/README.md
@@ -519,5 +519,5 @@ If Gemmini helps you in your academic research, you are encouraged to cite our p
 
 # Acknowledgements
 
-- The Gemmini [logo](./img/full-logo.svg) was designed by Dima Nikiforov ([@CobbledSteel](https://github.com/CobbledSteel)).
 - This project was, in part, funded by the U.S. Government under the DARPA RTML program (contract FA8650-20-2-7006). The views and conclusions contained in this document are those of the authors and should not be interpreted as representing the official policies, either expressed or implied, of the U.S. Government.
+- The Gemmini [logo](./img/full-logo.svg) was designed by Dima Nikiforov ([@CobbledSteel](https://github.com/CobbledSteel)).

From d42745c70499acdf255a5f5929986237b812c8ad Mon Sep 17 00:00:00 2001
From: "Ruohan (Richard) Yan" <richardyrh928@gmail.com>
Date: Thu, 28 Oct 2021 21:53:39 -0700
Subject: [PATCH 03/11] Gemmini ISA Bundles (#149)

* wip load/store

* created & parameterized bundles

* fix config ex rs1

* optimize loopconv & loopmatmul with bundles; add bundles for preload and compute

* move assignments to pipeline output

Co-authored-by: Ruohan Yan <yrh@a5.Millennium.Berkeley.EDU>
---
 src/main/scala/gemmini/Controller.scala       |  12 +-
 .../scala/gemmini/ExecuteController.scala     |  22 +-
 src/main/scala/gemmini/GemminiISA.scala       | 170 ++++++++++++++
 src/main/scala/gemmini/LoadController.scala   |  23 +-
 src/main/scala/gemmini/LoopConv.scala         | 211 ++++++++++++++----
 src/main/scala/gemmini/LoopMatmul.scala       |  99 ++++++--
 src/main/scala/gemmini/StoreController.scala  |  54 ++---
 7 files changed, 481 insertions(+), 110 deletions(-)

diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala
index cdb708f1..e63d7451 100644
--- a/src/main/scala/gemmini/Controller.scala
+++ b/src/main/scala/gemmini/Controller.scala
@@ -127,7 +127,12 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   // TODO replace 4,12,2 with parameters based on ROB size
   val (conv_cmd, loop_conv_unroller_busy) = LoopConv(raw_cmd, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization,
     meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries,
-    inputType.getWidth, accType.getWidth, dma_maxbytes)
+    inputType.getWidth, accType.getWidth, dma_maxbytes,
+    new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits), new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t),
+    new ConfigMvoutRs2(acc_scale_t_bits, 32), new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t),
+    new ConfigExRs1(acc_scale_t_bits), new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t),
+    new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t),
+    new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t))
 
   // val (compressed_cmd, compressor_busy) = InstCompressor(unrolled_cmd)
   // compressed_cmd.ready := false.B
@@ -136,7 +141,10 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
 
   val (loop_cmd, loop_matmul_unroller_busy) = LoopMatmul(conv_cmd, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization,
     meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries,
-    inputType.getWidth, accType.getWidth, dma_maxbytes)
+    inputType.getWidth, accType.getWidth, dma_maxbytes, new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t),
+    new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t),
+    new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t),
+    new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t))
 
   val unrolled_cmd = Queue(loop_cmd)
   unrolled_cmd.ready := false.B
diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala
index 2b790b7b..db9a894e 100644
--- a/src/main/scala/gemmini/ExecuteController.scala
+++ b/src/main/scala/gemmini/ExecuteController.scala
@@ -538,27 +538,29 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
       when(cmd.valid(0))
       {
         when(DoConfig && !matmul_in_progress && !pending_completed_rob_ids.map(_.valid).reduce(_ || _)) {
+          val config_ex_rs1 = rs1s(0).asTypeOf(new ConfigExRs1(acc_scale_t_bits))
+          val config_ex_rs2 = rs2s(0).asTypeOf(new ConfigExRs2)
+
           val config_cmd_type = rs1s(0)(1,0) // TODO magic numbers
 
           when (config_cmd_type === CONFIG_EX) {
-            val set_only_strides = rs1s(0)(7) // TODO magic number
+            val set_only_strides = config_ex_rs1.set_only_strides
 
             when (!set_only_strides) {
-              activation := rs1s(0)(4, 3) // TODO magic number
-              in_shift := rs2s(0)(31, 0) // TODO magic number
+              activation := config_ex_rs1.activation
+              in_shift := config_ex_rs2.in_shift
               acc_scale := rs1s(0)(xLen - 1, 32).asTypeOf(acc_scale_args.multiplicand_t) // TODO magic number
-              relu6_shift := rs2s(0)(47, 32) // TODO magic number
-              a_transpose := rs1s(0)(8) // TODO magic number
-              bd_transpose := rs1s(0)(9) // TODO magic number
+              relu6_shift := config_ex_rs2.relu6_shift
+              a_transpose := config_ex_rs1.a_transpose
+              bd_transpose := config_ex_rs1.b_transpose
 
               if (dataflow == Dataflow.BOTH) {
-                current_dataflow := rs1s(0)(2) // TODO magic number
+                current_dataflow := config_ex_rs1.dataflow
               }
             }
 
-            a_addr_stride := rs1s(0)(31, 16) // TODO magic number // TODO this needs to be kept in sync with ROB.scala
-            c_addr_stride := rs2s(0)(63, 48) // TODO magic number // TODO this needs to be kept in sync with ROB.scala
-
+            a_addr_stride := config_ex_rs1.a_stride // TODO this needs to be kept in sync with ROB.scala
+            c_addr_stride := config_ex_rs2.c_stride // TODO this needs to be kept in sync with ROB.scala
             config_initialized := true.B
           }.otherwise { // config_cmd_type === CONFIG_IM2COL
             ocol := cmd.bits(0).cmd.rs2(63, 56)
diff --git a/src/main/scala/gemmini/GemminiISA.scala b/src/main/scala/gemmini/GemminiISA.scala
index f9d7a1ba..554bcdeb 100644
--- a/src/main/scala/gemmini/GemminiISA.scala
+++ b/src/main/scala/gemmini/GemminiISA.scala
@@ -1,3 +1,4 @@
+
 package gemmini
 
 import chisel3._
@@ -56,4 +57,173 @@ object GemminiISA {
   // dataflow configuration
   //==========================================================================
   val GARBAGE_ADDR      = "hffffffff".U(32.W)
+
+  val MVIN_RS2_ADDR_WIDTH = 32
+  val MVIN_RS2_COLS_WIDTH = 16
+  val MVIN_RS2_ROWS_WIDTH = 16
+
+  class MvinRs2(mvin_rows_bits: Int, mvin_cols_bits: Int, local_addr_t: LocalAddr) extends Bundle {
+    val _spacer2 = UInt((MVIN_RS2_ROWS_WIDTH - mvin_rows_bits).W)
+    val num_rows = UInt(mvin_rows_bits.W)
+    val _spacer1 = UInt((MVIN_RS2_COLS_WIDTH - mvin_cols_bits).W)
+    val num_cols = UInt(mvin_cols_bits.W)
+    val _spacer0 = UInt((MVIN_RS2_ADDR_WIDTH - local_addr_t.getWidth).W)
+    val local_addr = local_addr_t.cloneType
+
+    override def cloneType: MvinRs2.this.type =
+      (new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t)).asInstanceOf[this.type]
+  }
+
+  val MVOUT_RS2_ADDR_WIDTH = 32
+  val MVOUT_RS2_COLS_WIDTH = 16
+  val MVOUT_RS2_ROWS_WIDTH = 16
+
+  class MvoutRs2(mvout_rows_bits: Int, mvout_cols_bits: Int, local_addr_t: LocalAddr) extends Bundle {
+    val _spacer2 = UInt((MVOUT_RS2_ROWS_WIDTH - mvout_rows_bits).W)
+    val num_rows = UInt(mvout_rows_bits.W)
+    val _spacer1 = UInt((MVOUT_RS2_COLS_WIDTH - mvout_cols_bits).W)
+    val num_cols = UInt(mvout_cols_bits.W)
+    val _spacer0 = UInt((MVOUT_RS2_ADDR_WIDTH - local_addr_t.getWidth).W)
+    val local_addr = local_addr_t.cloneType
+
+    override def cloneType: MvoutRs2.this.type =
+      (new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t)).asInstanceOf[this.type]
+  }
+
+  val CONFIG_MVIN_RS1_UNUSED_WIDTH = 2
+  val CONFIG_MVIN_RS1_SHRINK_WIDTH = 1
+  val CONFIG_MVIN_RS1_STATE_ID_WIDTH = 2
+  val CONFIG_MVIN_RS1_SPACER_WIDTH = (16 - 2 - 1 - 2)
+  val CONFIG_MVIN_RS1_STRIDE_WIDTH = 16
+  val CONFIG_MVIN_RS1_SCALE_WIDTH = 32
+
+  class ConfigMvinRs1(scale_bits: Int, stride_bits: Int) extends Bundle {
+    val _spacer2 = UInt((CONFIG_MVIN_RS1_SCALE_WIDTH - scale_bits).W)
+    val scale = UInt(scale_bits.W)
+    val _spacer1 = UInt((CONFIG_MVIN_RS1_STRIDE_WIDTH - stride_bits).W)
+    val stride = UInt(stride_bits.W)
+    val _spacer0 = UInt(CONFIG_MVIN_RS1_SPACER_WIDTH.W)
+    val state_id = UInt(CONFIG_MVIN_RS1_STATE_ID_WIDTH.W)
+    val shrink = UInt(CONFIG_MVIN_RS1_SHRINK_WIDTH.W)
+    val _unused = UInt(CONFIG_MVIN_RS1_UNUSED_WIDTH.W)
+
+    override def cloneType: ConfigMvinRs1.this.type =
+      (new ConfigMvinRs1(scale_bits, stride_bits)).asInstanceOf[this.type]
+  }
+
+  val CONFIG_MVOUT_RS1_UNUSED_WIDTH = 2
+  val CONFIG_MVOUT_RS1_ACTIVATION_WIDTH = 2
+  val CONFIG_MVOUT_RS1_MAX_POOLING_STRIDE_WIDTH = 2
+  val CONFIG_MVOUT_RS1_MAX_POOLING_WINDOW_SIZE_WIDTH = 2
+  val CONFIG_MVOUT_RS1_UPPER_ZERO_PADDING_WIDTH = 2
+  val CONFIG_MVOUT_RS1_LEFT_ZERO_PADDING_WIDTH = 2
+  val CONFIG_MVOUT_RS1_SPACER_WIDTH = (24 - 2 * 6)
+  val CONFIG_MVOUT_RS1_POOL_OUT_DIM_WIDTH = 8
+  val CONFIG_MVOUT_RS1_POOL_OUT_ROWS_WIDTH = 8
+  val CONFIG_MVOUT_RS1_POOL_OUT_COLS_WIDTH = 8
+  val CONFIG_MVOUT_RS1_OUT_ROWS_WIDTH = 8
+  val CONFIG_MVOUT_RS1_OUT_COLS_WIDTH = 8
+
+  class ConfigMvoutRs1 extends Bundle {
+    val ocols = UInt(CONFIG_MVOUT_RS1_OUT_COLS_WIDTH.W)
+    val orows = UInt(CONFIG_MVOUT_RS1_OUT_ROWS_WIDTH.W)
+    val pocols = UInt(CONFIG_MVOUT_RS1_POOL_OUT_COLS_WIDTH.W)
+    val porows = UInt(CONFIG_MVOUT_RS1_POOL_OUT_ROWS_WIDTH.W)
+    val pool_out_dim = UInt(CONFIG_MVOUT_RS1_POOL_OUT_DIM_WIDTH.W)
+    val _spacer = UInt(CONFIG_MVOUT_RS1_SPACER_WIDTH.W)
+    val lpad = UInt(CONFIG_MVOUT_RS1_LEFT_ZERO_PADDING_WIDTH.W)
+    val upad = UInt(CONFIG_MVOUT_RS1_UPPER_ZERO_PADDING_WIDTH.W)
+    val pool_size = UInt(CONFIG_MVOUT_RS1_MAX_POOLING_WINDOW_SIZE_WIDTH.W)
+    val pool_stride = UInt(CONFIG_MVOUT_RS1_MAX_POOLING_STRIDE_WIDTH.W)
+    val activation = UInt(CONFIG_MVOUT_RS1_ACTIVATION_WIDTH.W)
+    val _unused = UInt(CONFIG_MVOUT_RS1_UNUSED_WIDTH.W)
+
+    override def cloneType: ConfigMvoutRs1.this.type = (new ConfigMvoutRs1).asInstanceOf[this.type]
+  }
+
+  val CONFIG_MVOUT_RS2_ACC_SCALE_WIDTH = 32
+  val CONFIG_MVOUT_RS2_STRIDE_WIDTH = 32
+
+  class ConfigMvoutRs2(acc_scale_bits: Int, stride_bits: Int) extends Bundle {
+    val _spacer1 = UInt((CONFIG_MVOUT_RS2_ACC_SCALE_WIDTH - acc_scale_bits).W)
+    val acc_scale = UInt(acc_scale_bits.W)
+    val _spacer0 = UInt((CONFIG_MVOUT_RS2_STRIDE_WIDTH - stride_bits).W)
+    val stride = UInt(stride_bits.W)
+
+    override def cloneType: ConfigMvoutRs2.this.type =
+      (new ConfigMvoutRs2(acc_scale_bits, stride_bits)).asInstanceOf[this.type]
+  }
+
+  val CONFIG_EX_RS1_CMD_TYPE_WIDTH = 2
+  val CONFIG_EX_RS1_DATAFLOW_WIDTH = 1
+  val CONFIG_EX_RS1_ACTIVATION_WIDTH = 2
+  val CONFIG_EX_RS1_SPACER0_WIDTH = (7 - 2 - 1 - 2)
+  val CONFIG_EX_RS1_SET_ONLY_STRIDES_WIDTH = 1
+  val CONFIG_EX_RS1_A_TRANSPOSE_WIDTH = 1
+  val CONFIG_EX_RS1_B_TRANSPOSE_WIDTH = 1
+  val CONFIG_EX_RS1_SPACER1_WIDTH = (16 - 10)
+  val CONFIG_EX_RS1_A_STRIDE_WIDTH = 16
+  val CONFIG_EX_RS1_ACC_SCALE_WIDTH = 32
+
+  class ConfigExRs1(acc_scale_bits: Int) extends Bundle {
+    val _spacer2 = UInt((CONFIG_EX_RS1_ACC_SCALE_WIDTH - acc_scale_bits).W)
+    val acc_scale = UInt(acc_scale_bits.W)
+    val a_stride = UInt(CONFIG_EX_RS1_A_STRIDE_WIDTH.W)
+    val _spacer1 = UInt(CONFIG_EX_RS1_SPACER1_WIDTH.W)
+    val b_transpose = UInt(CONFIG_EX_RS1_B_TRANSPOSE_WIDTH.W)
+    val a_transpose = UInt(CONFIG_EX_RS1_A_TRANSPOSE_WIDTH.W)
+    val set_only_strides = UInt(CONFIG_EX_RS1_SET_ONLY_STRIDES_WIDTH.W)
+    val _spacer0 = UInt(CONFIG_EX_RS1_SPACER0_WIDTH.W)
+    val activation = UInt(CONFIG_EX_RS1_ACTIVATION_WIDTH.W)
+    val dataflow = UInt(CONFIG_EX_RS1_DATAFLOW_WIDTH.W)
+    val cmd_type = UInt(CONFIG_EX_RS1_CMD_TYPE_WIDTH.W)
+
+    override def cloneType: ConfigExRs1.this.type =
+      (new ConfigExRs1(acc_scale_bits)).asInstanceOf[this.type]
+  }
+
+  val CONFIG_EX_RS2_IN_SHIFT_WIDTH = 32
+  val CONFIG_EX_RS2_RELU6_SHIFT_WIDTH = 16
+  val CONFIG_EX_RS2_C_STRIDE_WIDTH = 16
+
+  class ConfigExRs2 extends Bundle {
+    val c_stride = UInt(CONFIG_EX_RS2_C_STRIDE_WIDTH.W)
+    val relu6_shift = UInt(CONFIG_EX_RS2_RELU6_SHIFT_WIDTH.W)
+    val in_shift = UInt(CONFIG_EX_RS2_IN_SHIFT_WIDTH.W)
+
+    override def cloneType: ConfigExRs2.this.type = (new ConfigExRs2).asInstanceOf[this.type]
+  }
+
+  val PRELOAD_RS_ADDR_WIDTH = 32
+  val PRELOAD_RS_COLS_WIDTH = 16
+  val PRELOAD_RS_ROWS_WIDTH = 16
+
+  class PreloadRs(preload_rows_bits: Int, preload_cols_bits: Int, local_addr_t: LocalAddr) extends Bundle {
+    val _spacer2 = UInt((PRELOAD_RS_ROWS_WIDTH - preload_rows_bits).W)
+    val num_rows = UInt(preload_rows_bits.W)
+    val _spacer1 = UInt((PRELOAD_RS_COLS_WIDTH - preload_cols_bits).W)
+    val num_cols = UInt(preload_cols_bits.W)
+    val _spacer0 = UInt((PRELOAD_RS_ADDR_WIDTH - local_addr_t.getWidth).W)
+    val local_addr = local_addr_t.cloneType
+
+    override def cloneType: PreloadRs.this.type =
+      (new PreloadRs(preload_rows_bits, preload_cols_bits, local_addr_t)).asInstanceOf[this.type]
+  }
+
+  val COMPUTED_RS_ADDR_WIDTH = 32
+  val COMPUTED_RS_COLS_WIDTH = 16
+  val COMPUTED_RS_ROWS_WIDTH = 16
+
+  class ComputeRs(compute_rows_bits: Int, compute_cols_bits: Int, local_addr_t: LocalAddr) extends Bundle {
+    val _spacer2 = UInt((COMPUTED_RS_ROWS_WIDTH - compute_rows_bits).W)
+    val num_rows = UInt(compute_rows_bits.W)
+    val _spacer1 = UInt((COMPUTED_RS_COLS_WIDTH - compute_cols_bits).W)
+    val num_cols = UInt(compute_cols_bits.W)
+    val _spacer0 = UInt((COMPUTED_RS_ADDR_WIDTH - local_addr_t.getWidth).W)
+    val local_addr = local_addr_t.cloneType
+
+    override def cloneType: ComputeRs.this.type =
+      (new ComputeRs(compute_rows_bits, compute_cols_bits, local_addr_t)).asInstanceOf[this.type]
+  }
 }
+
diff --git a/src/main/scala/gemmini/LoadController.scala b/src/main/scala/gemmini/LoadController.scala
index ccf26fc0..1c8b0ced 100644
--- a/src/main/scala/gemmini/LoadController.scala
+++ b/src/main/scala/gemmini/LoadController.scala
@@ -1,3 +1,4 @@
+
 package gemmini
 
 import chisel3._
@@ -8,7 +9,8 @@ import freechips.rocketchip.config.Parameters
 
 // TODO we need to check for WAW errors here
 // TODO deal with errors when reading scratchpad responses
-class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], coreMaxAddrBits: Int, local_addr_t: LocalAddr)
+class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], coreMaxAddrBits: Int,
+                                                      local_addr_t: LocalAddr)
                                (implicit p: Parameters) extends Module {
   import config._
 
@@ -36,20 +38,25 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
   val row_counter = RegInit(0.U(log2Ceil(block_rows).W))
 
   val cmd = Queue(io.cmd, ld_queue_length)
+
   val vaddr = cmd.bits.cmd.rs1
-  val localaddr = cmd.bits.cmd.rs2.asTypeOf(local_addr_t)
-  val cols = cmd.bits.cmd.rs2(32 + mvin_cols_bits - 1, 32) // TODO magic numbers
-  val rows = cmd.bits.cmd.rs2(48 + mvin_rows_bits - 1, 48) // TODO magic numbers
+  val mvin_rs2 = cmd.bits.cmd.rs2.asTypeOf(new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t))
+  val localaddr = mvin_rs2.local_addr
+  val cols = mvin_rs2.num_cols
+  val rows = mvin_rs2.num_rows
+
   val config_stride = cmd.bits.cmd.rs2
-  val config_scale = cmd.bits.cmd.rs1(32 + mvin_scale_t_bits - 1, 32) // TODO magic numbers
-  val config_shrink = cmd.bits.cmd.rs1(2) // TODO magic numbers
-  val config_block_stride = cmd.bits.cmd.rs1(31, 16) // TODO magic numbers
+  val config_mvin_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits))
+
+  val config_scale = config_mvin_rs1.scale // maybe limit width to `mvin_scale_t_bits`?
+  val config_shrink = config_mvin_rs1.shrink
+  val config_block_stride = config_mvin_rs1.stride
 
   val mstatus = cmd.bits.cmd.status
 
   val load_state_id = MuxCase(0.U, Seq((cmd.bits.cmd.inst.funct === LOAD2_CMD) -> 1.U,
     (cmd.bits.cmd.inst.funct === LOAD3_CMD) -> 2.U))
-  val config_state_id = cmd.bits.cmd.rs1(4,3) // TODO magic numbers
+  val config_state_id = config_mvin_rs1.state_id
   val state_id = Mux(cmd.bits.cmd.inst.funct === CONFIG_CMD, config_state_id, load_state_id)
 
   val stride = strides(state_id)
diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala
index f39faee1..749f00fe 100644
--- a/src/main/scala/gemmini/LoopConv.scala
+++ b/src/main/scala/gemmini/LoopConv.scala
@@ -74,7 +74,8 @@ class LoopConvLdBiasReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth: I
 }
 
 class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_acc_addr: Int, acc_w: Int,
-                     max_block_len_acc: Int, concurrent_loops: Int, latency: Int)(implicit p: Parameters) extends Module {
+                     max_block_len_acc: Int, concurrent_loops: Int, latency: Int,
+                     config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2)(implicit p: Parameters) extends Module {
   val MVIN_SCALE_IDENTITY = 0x3f800000.U // TODO get this from configs somehow
   val io = IO(new Bundle {
     val req = Flipped(Decoupled(new LoopConvLdBiasReq(coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth: Int, max_acc_addr, concurrent_loops)))
@@ -131,14 +132,23 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi
   val config_cmd = Wire(new RoCCCommand)
   config_cmd := DontCare
   config_cmd.inst.funct := CONFIG_CMD
-  config_cmd.rs1 := (MVIN_SCALE_IDENTITY << 32.U) | (req.derived_params.bias_spad_stride << 16.U) | (2.U << 3) | 1.U
+
+  val config_cmd_rs1 = Wire(config_mvin_rs1_t.cloneType)
+  config_cmd_rs1 := DontCare
+  config_cmd_rs1.scale := MVIN_SCALE_IDENTITY
+  config_cmd_rs1.stride := req.derived_params.bias_spad_stride
+  config_cmd_rs1.state_id := 2.U
+  config_cmd_rs1.shrink := 0.U
+  config_cmd_rs1._unused := 1.U
+  config_cmd.rs1 := config_cmd_rs1.asUInt
+
   config_cmd.rs2 := 0.U
 
   val mvin_cmd = Wire(new RoCCCommand)
   mvin_cmd := DontCare
   mvin_cmd.inst.funct := LOAD3_CMD
-  mvin_cmd.rs1 := 0.U //dram_addr
-  mvin_cmd.rs2 := 0.U //(I << 48.U) | (J << 32.U) | spad_addr
+  mvin_cmd.rs1 := 0.U
+  mvin_cmd.rs2 := 0.U
 
   // Inputs and outputs
   io.req.ready := state === idle && !command_p.io.busy
@@ -158,7 +168,12 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi
   when (command_p.io.out.bits.cmd.inst.funct === LOAD3_CMD) {
     val o = command_p.io.out.bits
     io.cmd.bits.rs1 := o.dram_addr
-    io.cmd.bits.rs2 := (o.I << 48.U) | (o.J << 32.U) | o.spad_addr
+    val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType)
+    mvin_cmd_rs2 := DontCare
+    mvin_cmd_rs2.num_rows := o.I.asUInt()
+    mvin_cmd_rs2.num_cols := o.J.asUInt()
+    mvin_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvin_cmd_rs2.local_addr)
+    io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt()
   }
 
   // Sending outputs
@@ -207,7 +222,8 @@ class LoopConvLdInputReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth:
 }
 
 class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_addr: Int, input_w: Int,
-                      max_block_len: Int, concurrent_loops: Int, latency: Int)(implicit p: Parameters) extends Module {
+                      max_block_len: Int, concurrent_loops: Int, latency: Int,
+                      config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2)(implicit p: Parameters) extends Module {
   val MVIN_SCALE_IDENTITY = 0x3f800000.U // TODO get this from configs somehow
 
   val io = IO(new Bundle {
@@ -287,14 +303,22 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
   val config_cmd = Wire(new RoCCCommand)
   config_cmd := DontCare
   config_cmd.inst.funct := CONFIG_CMD
-  config_cmd.rs1 := (MVIN_SCALE_IDENTITY << 32.U) | (input_spad_stride << 16.U) | (0.U << 3) | 1.U
+
+  val config_cmd_rs1 = Wire(config_mvin_rs1_t.cloneType)
+  config_cmd_rs1 := DontCare
+  config_cmd_rs1.scale := MVIN_SCALE_IDENTITY
+  config_cmd_rs1.stride := input_spad_stride
+  config_cmd_rs1.state_id := 0.U
+  config_cmd_rs1.shrink := 0.U
+  config_cmd_rs1._unused := 1.U
+  config_cmd.rs1 := config_cmd_rs1.asUInt()
   config_cmd.rs2 := dram_stride << req.downsample
 
   val mvin_cmd = Wire(new RoCCCommand)
   mvin_cmd := DontCare
   mvin_cmd.inst.funct := LOAD_CMD
-  mvin_cmd.rs1 := 0.U //dram_addr
-  mvin_cmd.rs2 := 0.U // ((I >> req.downsample) << 48.U).asUInt() | (K << 32.U).asUInt() | spad_addr.asUInt()
+  mvin_cmd.rs1 := 0.U // dram_addr
+  mvin_cmd.rs2 := 0.U // mvin_cmd_rs2
 
   // Inputs and outputs
   io.req.ready := state === idle && !command_p.io.busy
@@ -314,7 +338,12 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
   when (command_p.io.out.bits.cmd.inst.funct === LOAD_CMD) {
     val o = command_p.io.out.bits
     io.cmd.bits.rs1 := o.dram_addr
-    io.cmd.bits.rs2 := ((o.I >> req.downsample) << 48).asUInt | (o.K << 32).asUInt | o.spad_addr.asUInt
+    val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType)
+    mvin_cmd_rs2 := DontCare
+    mvin_cmd_rs2.num_rows := (o.I >> req.downsample).asUInt()
+    mvin_cmd_rs2.num_cols := o.K.asUInt()
+    mvin_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvin_cmd_rs2.local_addr)
+    io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt()
   }
 
   // Sending outputs
@@ -366,7 +395,8 @@ class LoopConvLdWeightReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth:
 }
 
 class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_addr: Int, input_w: Int,
-                       max_block_len: Int, concurrent_loops: Int, latency: Int)(implicit p: Parameters) extends Module {
+                       max_block_len: Int, concurrent_loops: Int, latency: Int,
+                       config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2)(implicit p: Parameters) extends Module {
   val MVIN_SCALE_IDENTITY = 0x3f800000.U // TODO get this from configs somehow
 
   val io = IO(new Bundle {
@@ -444,14 +474,21 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit
   val config_cmd = Wire(new RoCCCommand)
   config_cmd := DontCare
   config_cmd.inst.funct := CONFIG_CMD
-  config_cmd.rs1 := (MVIN_SCALE_IDENTITY << 32.U) | (req.derived_params.weight_spad_stride << 16.U) | (1.U << 3) | 1.U
+  val config_cmd_rs1 = Wire(config_mvin_rs1_t.cloneType)
+  config_cmd_rs1 := DontCare
+  config_cmd_rs1.scale := MVIN_SCALE_IDENTITY
+  config_cmd_rs1.stride := req.derived_params.weight_spad_stride
+  config_cmd_rs1.state_id := 1.U
+  config_cmd_rs1.shrink := 0.U
+  config_cmd_rs1._unused := 1.U
+  config_cmd.rs1 := config_cmd_rs1.asUInt
   config_cmd.rs2 := dram_stride
 
   val mvin_cmd = Wire(new RoCCCommand)
   mvin_cmd := DontCare
   mvin_cmd.inst.funct := LOAD2_CMD
-  mvin_cmd.rs1 := 0.U//dram_addr
-  mvin_cmd.rs2 := 0.U//(K << 48.U) | (J << 32.U) | spad_addr
+  mvin_cmd.rs1 := 0.U // dram_addr
+  mvin_cmd.rs2 := 0.U // mvin_cmd_rs2
 
   // Inputs and outputs
   io.req.ready := state === idle && !command_p.io.busy
@@ -471,7 +508,12 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit
   when (command_p.io.out.bits.cmd.inst.funct === LOAD2_CMD) {
     val o = command_p.io.out.bits
     io.cmd.bits.rs1 := o.dram_addr
-    io.cmd.bits.rs2 := (o.K << 48) | (o.J << 32) | o.spad_addr
+    val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType)
+    mvin_cmd_rs2 := DontCare
+    mvin_cmd_rs2.num_rows := o.K
+    mvin_cmd_rs2.num_cols := o.J
+    mvin_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvin_cmd_rs2.local_addr)
+    io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt()
   }
 
   // Sending outputs
@@ -524,7 +566,9 @@ class LoopConvExecuteReq(val large_iterator_bitwidth: Int, val small_iterator_bi
 }
 
 class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_addr: Int,
-                      max_acc_addr: Int, concurrent_loops: Int, latency: Int)(implicit p: Parameters) extends Module {
+                      max_acc_addr: Int, concurrent_loops: Int, latency: Int,
+                      config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs,
+                      compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs)(implicit p: Parameters) extends Module {
   val GARBAGE_ADDR = (~0.U(32.W)).asUInt()
 
   val io = IO(new Bundle {
@@ -623,16 +667,27 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
   val config_cmd = Wire(new RoCCCommand)
   config_cmd := DontCare
   config_cmd.inst.funct := CONFIG_CMD
-  config_cmd.rs1 := ((irows * icols) << 16) | (1.U << 7)
-  config_cmd.rs2 := (orows * ocols) << 48
 
-  val pre_cmd = Wire(new RoCCCommand)
+  val config_cmd_rs1 = Wire(config_ex_rs1_t.cloneType)
+  config_cmd_rs1 := DontCare
+  config_cmd_rs1.a_stride := (irows * icols).asUInt()
+  config_cmd_rs1.set_only_strides := 1.U
+  config_cmd_rs1.cmd_type := 0.U
+
+  val config_cmd_rs2 = Wire(new ConfigExRs2)
+  config_cmd_rs2 := DontCare
+  config_cmd_rs2.c_stride := (orows * ocols).asUInt()
+
+  config_cmd.rs1 := config_cmd_rs1.asUInt()
+  config_cmd.rs2 := config_cmd_rs2.asUInt()
+
+  val pre_cmd = Wire(new RoCCCommand) // preload
   pre_cmd := DontCare
   pre_cmd.inst.funct := PRELOAD_CMD
   pre_cmd.rs1 := 0.U//(K << 48) | (J << 32) | pre_addr
   pre_cmd.rs2 := 0.U//(I << 48) | (J << 32) | c_addr
 
-  val comp_cmd = Wire(new RoCCCommand())
+  val comp_cmd = Wire(new RoCCCommand()) // compute.preloaded
   comp_cmd := DontCare
   comp_cmd.inst.funct := Mux(new_weights, COMPUTE_AND_FLIP_CMD, COMPUTE_AND_STAY_CMD)
   comp_cmd.rs1 := 0.U//(I << 48) | (K << 32) | a_addr
@@ -659,12 +714,36 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
   io.cmd.bits := command_p.io.out.bits.cmd
   when (command_p.io.out.bits.cmd.inst.funct === PRELOAD_CMD) {
     val o = command_p.io.out.bits
-    io.cmd.bits.rs1 := (o.K << 48) | (o.J << 32) | o.pre_addr
-    io.cmd.bits.rs2 := (o.I << 48) | (o.J << 32) | o.c_addr
+    val pre_cmd_rs1 = Wire(preload_rs1_t.cloneType)
+    pre_cmd_rs1 := DontCare
+    pre_cmd_rs1.num_rows := o.K.asUInt()
+    pre_cmd_rs1.num_cols := o.J.asUInt()
+    pre_cmd_rs1.local_addr := o.pre_addr.asTypeOf(pre_cmd_rs1.local_addr)
+
+    val pre_cmd_rs2 = Wire(preload_rs2_t.cloneType)
+    pre_cmd_rs2 := DontCare
+    pre_cmd_rs2.num_rows := o.I.asUInt()
+    pre_cmd_rs2.num_cols := o.J.asUInt()
+    pre_cmd_rs2.local_addr := o.c_addr.asTypeOf(pre_cmd_rs2.local_addr)
+
+    io.cmd.bits.rs1 := pre_cmd_rs1.asUInt()
+    io.cmd.bits.rs2 := pre_cmd_rs2.asUInt()
   }.elsewhen(command_p.io.out.bits.cmd.inst.funct =/= CONFIG_CMD) {
     val o = command_p.io.out.bits
-    io.cmd.bits.rs1 := (o.I << 48) | (o.K << 32) | o.a_addr
-    io.cmd.bits.rs2 := (o.I << 48) | (o.J << 32) | GARBAGE_ADDR
+    val comp_cmd_rs1 = Wire(compute_rs1_t.cloneType)
+    comp_cmd_rs1 := DontCare
+    comp_cmd_rs1.num_rows := o.I.asUInt()
+    comp_cmd_rs1.num_cols := o.K.asUInt()
+    comp_cmd_rs1.local_addr := o.a_addr.asTypeOf(comp_cmd_rs1.local_addr)
+
+    val comp_cmd_rs2 = Wire(compute_rs2_t.cloneType)
+    comp_cmd_rs2 := DontCare
+    comp_cmd_rs2.num_rows := o.I.asUInt()
+    comp_cmd_rs2.num_cols := o.J.asUInt()
+    comp_cmd_rs2.local_addr := GARBAGE_ADDR.asTypeOf(comp_cmd_rs2.local_addr)
+
+    io.cmd.bits.rs1 := comp_cmd_rs1.asUInt()
+    io.cmd.bits.rs2 := comp_cmd_rs2.asUInt()
   }
 
   // Updating "new_weights"
@@ -741,7 +820,7 @@ class LoopConvStReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth: Int,
   val loop_id = UInt(log2Up(concurrent_loops).W)
 }
 
-class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_acc_addr: Int, input_w: Int, concurrent_loops: Int, latency: Int)(implicit p: Parameters) extends Module {
+class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_acc_addr: Int, input_w: Int, concurrent_loops: Int, latency: Int, config_mvout_rs2_t: ConfigMvoutRs2, mvout_rs2_t: MvoutRs2)(implicit p: Parameters) extends Module {
   val ACC_SCALE_NO_CHANGE = ~(0.U(32.W)) // TODO get this from ISA description somehow
 
   val io = IO(new Bundle {
@@ -809,23 +888,48 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
   val mvout_cmd = Wire(new RoCCCommand)
   mvout_cmd := DontCare
   mvout_cmd.inst.funct := STORE_CMD
-  mvout_cmd.rs1 := 0.U//dram_addr
-  mvout_cmd.rs2 := 0.U//(I << 48.U) | (J << 32.U) | spad_addr
+  mvout_cmd.rs1 := 0.U // dram_addr
+  mvout_cmd.rs2 := 0.U // mvout_cmd_rs2
 
   val pre_pool_config_cmd = Wire(new RoCCCommand)
   pre_pool_config_cmd := DontCare
   pre_pool_config_cmd.inst.funct := CONFIG_CMD
-  pre_pool_config_cmd.rs1 := (ocols << 56) | (orows << 48) | (pocols << 40) | (porows << 32) | (pool_out_dim << 24) |
-    (plpad << 10) | (pupad << 8) | (pool_size << 6) | (pool_stride << 4) |
-    (req.activation << 2) | // TODO magic numbers
-    CONFIG_STORE
-  pre_pool_config_cmd.rs2 := (ACC_SCALE_NO_CHANGE << 32) | (out_channels * (input_w / 8).U)
+  val pre_pool_config_cmd_rs1 = Wire(new ConfigMvoutRs1)
+  pre_pool_config_cmd_rs1 := DontCare
+  pre_pool_config_cmd_rs1.ocols := ocols
+  pre_pool_config_cmd_rs1.orows := orows
+  pre_pool_config_cmd_rs1.pocols := pocols
+  pre_pool_config_cmd_rs1.porows := porows
+  pre_pool_config_cmd_rs1.pool_out_dim := pool_out_dim
+  pre_pool_config_cmd_rs1.lpad := plpad
+  pre_pool_config_cmd_rs1.upad := pupad
+  pre_pool_config_cmd_rs1.pool_size := pool_size
+  pre_pool_config_cmd_rs1.pool_stride := pool_stride
+  pre_pool_config_cmd_rs1.activation := req.activation
+  pre_pool_config_cmd_rs1._unused := CONFIG_STORE
+  pre_pool_config_cmd.rs1 := pre_pool_config_cmd_rs1.asUInt()
+
+  val pre_pool_config_cmd_rs2 = Wire(config_mvout_rs2_t.cloneType)
+  pre_pool_config_cmd_rs2 := DontCare
+  pre_pool_config_cmd_rs2.acc_scale := ACC_SCALE_NO_CHANGE
+  pre_pool_config_cmd_rs2.stride := out_channels * (input_w / 8).U
+  pre_pool_config_cmd.rs2 := pre_pool_config_cmd_rs2.asUInt()
 
   val post_pool_config_cmd = Wire(new RoCCCommand)
   post_pool_config_cmd := DontCare
   post_pool_config_cmd.inst.funct := CONFIG_CMD
-  post_pool_config_cmd.rs1 := (req.activation << 2) | CONFIG_STORE // TODO magic numbers
-  post_pool_config_cmd.rs2 := (ACC_SCALE_NO_CHANGE << 32) | (out_channels * (input_w / 8).U)
+
+  val post_pool_config_cmd_rs1 = Wire(new ConfigMvoutRs1)
+  post_pool_config_cmd_rs1 := DontCare
+  post_pool_config_cmd_rs1.activation := req.activation
+  post_pool_config_cmd_rs1._unused := CONFIG_STORE
+  post_pool_config_cmd.rs1 := post_pool_config_cmd_rs1.asUInt()
+
+  val post_pool_config_cmd_rs2 = Wire(config_mvout_rs2_t.cloneType)
+  post_pool_config_cmd_rs2 := DontCare
+  post_pool_config_cmd_rs2.acc_scale := ACC_SCALE_NO_CHANGE
+  post_pool_config_cmd_rs2.stride := out_channels * (input_w / 8).U
+  post_pool_config_cmd.rs2 := post_pool_config_cmd_rs2.asUInt()
 
   val pool_cmd = Wire(new RoCCCommand)
   pool_cmd := DontCare
@@ -859,11 +963,22 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
   when (command_p.io.out.bits.cmd.inst.funct === STORE_CMD) {
     val o = command_p.io.out.bits
     when (o.is_pool) {
+      val pool_mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType)
+      pool_mvout_cmd_rs2 := DontCare
+      pool_mvout_cmd_rs2.num_cols := o.channels
+      pool_mvout_cmd_rs2.local_addr := o.pool_spad_addr.asTypeOf(pool_mvout_cmd_rs2.local_addr)
+
       io.cmd.bits.rs1 := o.pool_dram_addr
-      io.cmd.bits.rs2 := (o.channels << 32.U) | o.pool_spad_addr
+      io.cmd.bits.rs2 := pool_mvout_cmd_rs2.asUInt()
     } .otherwise {
+      val mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType)
+      mvout_cmd_rs2 := DontCare
+      mvout_cmd_rs2.num_rows := o.I.asUInt()
+      mvout_cmd_rs2.num_cols := o.J.asUInt()
+      mvout_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvout_cmd_rs2.local_addr)
+
       io.cmd.bits.rs1 := o.dram_addr
-      io.cmd.bits.rs2 := (o.I << 48) | (o.J << 32) | o.spad_addr
+      io.cmd.bits.rs2 := mvout_cmd_rs2.asUInt()
     }
   }
 
@@ -1016,7 +1131,10 @@ class LoopConvState(val block_size: Int, val large_iterator_bitwidth: Int, val s
 }
 
 class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: Int, max_exs: Int, max_sts: Int,
-  max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int)
+  max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int,
+  config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2, mvout_rs2_t: MvoutRs2,
+  config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs,
+  compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs)
   (implicit p: Parameters) extends Module {
   val large_iterator_bitwidth = 16
   val small_iterator_bitwidth = 16 // 8
@@ -1049,11 +1167,11 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I
 
   // Create inner modules
   val latency = 2
-  val ld_bias = Module(new LoopConvLdBias(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_acc_addr, acc_w, max_block_len_acc, concurrent_loops, latency))
-  val ld_input = Module(new LoopConvLdInput(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops, latency))
-  val ld_weights = Module(new LoopConvLdWeight(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops, latency))
-  val ex = Module(new LoopConvExecute(block_size, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_addr, max_acc_addr, concurrent_loops, latency))
-  val st = Module(new LoopConvSt(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_acc_addr, input_w, concurrent_loops, latency))
+  val ld_bias = Module(new LoopConvLdBias(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_acc_addr, acc_w, max_block_len_acc, concurrent_loops, latency, config_mvin_rs1_t, mvin_rs2_t))
+  val ld_input = Module(new LoopConvLdInput(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops, latency, config_mvin_rs1_t, mvin_rs2_t))
+  val ld_weights = Module(new LoopConvLdWeight(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops, latency, config_mvin_rs1_t, mvin_rs2_t))
+  val ex = Module(new LoopConvExecute(block_size, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_addr, max_acc_addr, concurrent_loops, latency, config_ex_rs1_t, preload_rs1_t, preload_rs2_t, compute_rs1_t, compute_rs2_t))
+  val st = Module(new LoopConvSt(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_acc_addr, input_w, concurrent_loops, latency, config_mvout_rs2_t, mvout_rs2_t))
 
   // Create command queue
   val cmd = Queue(io.in)
@@ -1339,10 +1457,15 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I
 object LoopConv {
   def apply(in: DecoupledIO[RoCCCommand], ld_utilization: UInt, st_utilization: UInt, ex_utilization: UInt,
             block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: Int, max_exs: Int, max_sts: Int,
-            max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int)
+            max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int,
+            config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2,
+            mvout_rs2_t: MvoutRs2, config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs,
+            compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs)
            (implicit p: Parameters): Tuple2[DecoupledIO[RoCCCommand], Bool] = {
     val mod = Module(new LoopConv(block_size, coreMaxAddrBits, rob_size, max_lds, max_exs, max_sts,
-      max_addr, max_acc_addr, input_w, acc_w, dma_max_bytes))
+      max_addr, max_acc_addr, input_w, acc_w, dma_max_bytes,
+      config_mvin_rs1_t, mvin_rs2_t, config_mvout_rs2_t, mvout_rs2_t, config_ex_rs1_t, preload_rs1_t, preload_rs2_t,
+      compute_rs1_t, compute_rs2_t))
     mod.io.in <> in
     mod.io.ld_utilization := ld_utilization
     mod.io.st_utilization := st_utilization
diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala
index 2502d157..ea1c3ed6 100644
--- a/src/main/scala/gemmini/LoopMatmul.scala
+++ b/src/main/scala/gemmini/LoopMatmul.scala
@@ -23,7 +23,7 @@ class LoopMatmulLdAReq(val block_size: Int, val coreMaxAddrBits: Int, val iterat
 }
 
 class LoopMatmulLdA(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_addr: Int, input_w: Int,
-                    max_block_len: Int, concurrent_loops: Int)
+                    max_block_len: Int, concurrent_loops: Int, mvin_rs2_t: MvinRs2)
                    (implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val req = Flipped(Decoupled(new LoopMatmulLdAReq(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, concurrent_loops)))
@@ -70,7 +70,13 @@ class LoopMatmulLdA(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   mvin_cmd := DontCare
   mvin_cmd.inst.funct := LOAD_CMD
   mvin_cmd.rs1 := dram_addr
-  mvin_cmd.rs2 := (rows << 48).asUInt() | (cols << 32).asUInt() | sp_addr
+
+  val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType)
+  mvin_cmd_rs2 := DontCare
+  mvin_cmd_rs2.num_rows := rows.asUInt()
+  mvin_cmd_rs2.num_cols := cols.asUInt()
+  mvin_cmd_rs2.local_addr := sp_addr.asTypeOf(mvin_cmd_rs2.local_addr)
+  mvin_cmd.rs2 := mvin_cmd_rs2.asUInt()
 
   io.req.ready := state === idle
   io.i := i
@@ -121,7 +127,7 @@ class LoopMatmulLdBReq(val block_size: Int, val coreMaxAddrBits: Int, val iterat
 }
 
 class LoopMatmulLdB(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_addr: Int, input_w: Int,
-                    max_block_len: Int, concurrent_loops: Int)
+                    max_block_len: Int, concurrent_loops: Int, mvin_rs2_t: MvinRs2)
                    (implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val req = Flipped(Decoupled(new LoopMatmulLdBReq(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, concurrent_loops)))
@@ -171,7 +177,13 @@ class LoopMatmulLdB(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   mvin_cmd := DontCare
   mvin_cmd.inst.funct := LOAD2_CMD
   mvin_cmd.rs1 := dram_addr
-  mvin_cmd.rs2 := (rows << 48).asUInt() | (cols << 32).asUInt() | sp_addr
+
+  val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType)
+  mvin_cmd_rs2 := DontCare
+  mvin_cmd_rs2.num_rows := rows.asUInt()
+  mvin_cmd_rs2.num_cols := cols.asUInt()
+  mvin_cmd_rs2.local_addr := sp_addr.asTypeOf(mvin_cmd_rs2.local_addr)
+  mvin_cmd.rs2 := mvin_cmd_rs2.asUInt()
 
   io.req.ready := state === idle
   io.k := k
@@ -222,7 +234,7 @@ class LoopMatmulLdDReq(val block_size: Int, val coreMaxAddrBits: Int, val iterat
 }
 
 class LoopMatmulLdD(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_acc_addr: Int, input_w: Int,
-                    acc_w: Int, max_block_len: Int, max_block_len_acc: Int, concurrent_loops: Int)
+                    acc_w: Int, max_block_len: Int, max_block_len_acc: Int, concurrent_loops: Int, mvin_rs2_t: MvinRs2)
                    (implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val req = Flipped(Decoupled(new LoopMatmulLdDReq(block_size, coreMaxAddrBits, iterator_bitwidth, max_acc_addr, concurrent_loops)))
@@ -261,7 +273,13 @@ class LoopMatmulLdD(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   mvin_cmd := DontCare
   mvin_cmd.inst.funct := LOAD3_CMD
   mvin_cmd.rs1 := dram_addr
-  mvin_cmd.rs2 := (rows << 48).asUInt() | (cols << 32).asUInt() | sp_addr
+
+  val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType)
+  mvin_cmd_rs2 := DontCare
+  mvin_cmd_rs2.num_rows := rows.asUInt()
+  mvin_cmd_rs2.num_cols := cols.asUInt()
+  mvin_cmd_rs2.local_addr := sp_addr.asTypeOf(mvin_cmd_rs2.local_addr)
+  mvin_cmd.rs2 := mvin_cmd_rs2.asUInt()
 
   io.req.ready := state === idle
   io.idle := state === idle
@@ -312,7 +330,9 @@ class LoopMatmulExecuteReq(val block_size: Int, val coreMaxAddrBits: Int, val it
   val loop_id = UInt(log2Up(concurrent_loops).W)
 }
 
-class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_addr: Int, max_acc_addr: Int, concurrent_loops: Int)
+class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_addr: Int, max_acc_addr: Int, concurrent_loops: Int,
+                        preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs,
+                        compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs)
                        (implicit p: Parameters) extends Module {
   val GARBAGE_ADDR = (~0.U(32.W)).asUInt()
 
@@ -380,14 +400,40 @@ class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth
   val pre_cmd = Wire(new RoCCCommand)
   pre_cmd := DontCare
   pre_cmd.inst.funct := PRELOAD_CMD
-  pre_cmd.rs1 := pre_addr | (b_cols << 32).asUInt() | (b_rows << 48).asUInt()
-  pre_cmd.rs2 := out_addr | (c_cols << 32).asUInt() | (c_rows << 48).asUInt()
+
+  val pre_cmd_rs1 = Wire(preload_rs1_t.cloneType)
+  pre_cmd_rs1 := DontCare
+  pre_cmd_rs1.num_rows := b_rows.asUInt()
+  pre_cmd_rs1.num_cols := b_cols.asUInt()
+  pre_cmd_rs1.local_addr := pre_addr.asTypeOf(pre_cmd_rs1.local_addr)
+
+  val pre_cmd_rs2 = Wire(preload_rs2_t.cloneType)
+  pre_cmd_rs2 := DontCare
+  pre_cmd_rs2.num_rows := c_rows.asUInt()
+  pre_cmd_rs2.num_cols := c_cols.asUInt()
+  pre_cmd_rs2.local_addr := out_addr.asTypeOf(pre_cmd_rs2.local_addr)
+
+  pre_cmd.rs1 := pre_cmd_rs1.asUInt()
+  pre_cmd.rs2 := pre_cmd_rs2.asUInt()
 
   val comp_cmd = Wire(new RoCCCommand())
   comp_cmd := DontCare
   comp_cmd.inst.funct := Mux(i === 0.U, COMPUTE_AND_FLIP_CMD, COMPUTE_AND_STAY_CMD)
-  comp_cmd.rs1 := a_addr | (a_cols << 32).asUInt() | (a_rows << 48).asUInt()
-  comp_cmd.rs2 := GARBAGE_ADDR | (block_size.U << 32).asUInt() | (block_size.U << 48).asUInt()
+
+  val comp_cmd_rs1 = Wire(compute_rs1_t.cloneType)
+  comp_cmd_rs1 := DontCare
+  comp_cmd_rs1.num_rows := a_rows.asUInt()
+  comp_cmd_rs1.num_cols := a_cols.asUInt()
+  comp_cmd_rs1.local_addr := a_addr.asTypeOf(comp_cmd_rs1.local_addr)
+
+  val comp_cmd_rs2 = Wire(compute_rs2_t.cloneType)
+  comp_cmd_rs2 := DontCare
+  comp_cmd_rs2.num_rows := block_size.U
+  comp_cmd_rs2.num_cols := block_size.U
+  comp_cmd_rs2.local_addr := GARBAGE_ADDR.asTypeOf(comp_cmd_rs2.local_addr)
+
+  comp_cmd.rs1 := comp_cmd_rs1.asUInt()
+  comp_cmd.rs2 := comp_cmd_rs2.asUInt()
 
   io.req.ready := state === idle
   io.k := k
@@ -448,7 +494,7 @@ class LoopMatmulStCReq(val block_size: Int, val coreMaxAddrBits: Int, val iterat
   val loop_id = UInt(log2Up(concurrent_loops).W)
 }
 
-class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, max_block_len: Int, concurrent_loops: Int)
+class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, max_block_len: Int, concurrent_loops: Int, mvout_rs2_t: MvoutRs2)
                    (implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val req = Flipped(Decoupled(new LoopMatmulStCReq(block_size, coreMaxAddrBits, iterator_bitwidth, max_acc_addr, concurrent_loops)))
@@ -494,7 +540,13 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   mvout_cmd := DontCare
   mvout_cmd.inst.funct := STORE_CMD
   mvout_cmd.rs1 := dram_addr
-  mvout_cmd.rs2 := (rows << 48).asUInt() | (cols << 32).asUInt() | sp_addr
+
+  val mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType)
+  mvout_cmd_rs2 := DontCare
+  mvout_cmd_rs2.num_rows := rows.asUInt()
+  mvout_cmd_rs2.num_cols := cols.asUInt()
+  mvout_cmd_rs2.local_addr := sp_addr.asTypeOf(mvout_cmd_rs2.local_addr)
+  mvout_cmd.rs2 := mvout_cmd_rs2.asUInt()
 
   io.req.ready := state === idle
   io.j := j
@@ -606,7 +658,9 @@ class LoopMatmulState(val iterator_bitwidth: Int, val coreMaxAddrBits: Int, val
 }
 
 class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: Int, max_exs: Int, max_sts: Int,
-                 max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int)
+                 max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int,
+                 mvin_rs2_t: MvinRs2, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs,
+                 compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, mvout_rs2_t: MvoutRs2)
                 (implicit p: Parameters) extends Module {
   val iterator_bitwidth = 16
   val max_block_len = (dma_max_bytes / (block_size * input_w / 8)) max 1
@@ -635,11 +689,11 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds:
   val loop_being_configured = loops(loop_being_configured_id)
 
   // Create inner modules
-  val ldA = Module(new LoopMatmulLdA(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops))
-  val ldB = Module(new LoopMatmulLdB(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops))
-  val ldD = Module(new LoopMatmulLdD(block_size, coreMaxAddrBits, iterator_bitwidth, max_acc_addr, input_w, acc_w, max_block_len, max_block_len_acc, concurrent_loops))
-  val ex = Module(new LoopMatmulExecute(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, max_acc_addr, concurrent_loops))
-  val stC = Module(new LoopMatmulStC(block_size, coreMaxAddrBits, iterator_bitwidth, max_acc_addr, input_w, acc_w, max_block_len, concurrent_loops))
+  val ldA = Module(new LoopMatmulLdA(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops, mvin_rs2_t))
+  val ldB = Module(new LoopMatmulLdB(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops, mvin_rs2_t))
+  val ldD = Module(new LoopMatmulLdD(block_size, coreMaxAddrBits, iterator_bitwidth, max_acc_addr, input_w, acc_w, max_block_len, max_block_len_acc, concurrent_loops, mvin_rs2_t))
+  val ex = Module(new LoopMatmulExecute(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, max_acc_addr, concurrent_loops, preload_rs1_t, preload_rs2_t, compute_rs1_t, compute_rs2_t))
+  val stC = Module(new LoopMatmulStC(block_size, coreMaxAddrBits, iterator_bitwidth, max_acc_addr, input_w, acc_w, max_block_len, concurrent_loops, mvout_rs2_t))
 
   // Create command queue
   val cmd = Queue(io.in)
@@ -912,10 +966,13 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds:
 object LoopMatmul {
   def apply(in: DecoupledIO[RoCCCommand], ld_utilization: UInt, st_utilization: UInt, ex_utilization: UInt,
             block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: Int, max_exs: Int, max_sts: Int,
-            max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int)
+            max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int,
+            mvin_rs2_t: MvinRs2, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs,
+            compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, mvout_rs2_t: MvoutRs2)
            (implicit p: Parameters): Tuple2[DecoupledIO[RoCCCommand], Bool] = {
     val mod = Module(new LoopMatmul(block_size, coreMaxAddrBits, rob_size, max_lds, max_exs, max_sts,
-      max_addr, max_acc_addr, input_w, acc_w, dma_max_bytes))
+      max_addr, max_acc_addr, input_w, acc_w, dma_max_bytes,
+      mvin_rs2_t, preload_rs1_t, preload_rs2_t, compute_rs1_t, compute_rs2_t, mvout_rs2_t))
     mod.io.in <> in
     mod.io.ld_utilization := ld_utilization
     mod.io.st_utilization := st_utilization
diff --git a/src/main/scala/gemmini/StoreController.scala b/src/main/scala/gemmini/StoreController.scala
index b0e38b42..50efcfe5 100644
--- a/src/main/scala/gemmini/StoreController.scala
+++ b/src/main/scala/gemmini/StoreController.scala
@@ -41,7 +41,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
   val block_cols = meshColumns * tileColumns
   val max_blocks = (dma_maxbytes / (block_cols * inputType.getWidth / 8)) max 1
 
-  val activation = Reg(UInt(2.W)) // TODO magic number
+  val activation = Reg(UInt(GemminiISA.CONFIG_MVOUT_RS1_ACTIVATION_WIDTH.W))
   val acc_scale = Reg(acc_scale_args.multiplicand_t)
 
   //val row_counter = RegInit(0.U(log2Ceil(block_rows).W))
@@ -49,15 +49,15 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
   val block_counter = RegInit(0.U(8.W)) // TODO magic number
 
   // Pooling variables
-  val pool_stride = Reg(UInt(2.W)) // When this is 0, pooling is disabled // TODO magic number
-  val pool_size = Reg(UInt(2.W)) // TODO magic number
-  val pool_out_dim = Reg(UInt(8.W)) // TODO magic number
-  val pool_porows = Reg(UInt(8.W)) // TODO magic number
-  val pool_pocols = Reg(UInt(8.W)) // TODO magic number
-  val pool_orows = Reg(UInt(8.W)) // TODO magic number
-  val pool_ocols = Reg(UInt(8.W)) // TODO magic number
-  val pool_upad = Reg(UInt(2.W)) // TODO magic number
-  val pool_lpad = Reg(UInt(2.W)) // TODO magic number
+  val pool_stride = Reg(UInt(CONFIG_MVOUT_RS1_MAX_POOLING_STRIDE_WIDTH.W)) // When this is 0, pooling is disabled
+  val pool_size = Reg(UInt(CONFIG_MVOUT_RS1_MAX_POOLING_WINDOW_SIZE_WIDTH.W))
+  val pool_out_dim = Reg(UInt(CONFIG_MVOUT_RS1_POOL_OUT_DIM_WIDTH.W))
+  val pool_porows = Reg(UInt(CONFIG_MVOUT_RS1_POOL_OUT_ROWS_WIDTH.W))
+  val pool_pocols = Reg(UInt(CONFIG_MVOUT_RS1_POOL_OUT_COLS_WIDTH.W))
+  val pool_orows = Reg(UInt(CONFIG_MVOUT_RS1_OUT_ROWS_WIDTH.W))
+  val pool_ocols = Reg(UInt(CONFIG_MVOUT_RS1_OUT_COLS_WIDTH.W))
+  val pool_upad = Reg(UInt(CONFIG_MVOUT_RS1_UPPER_ZERO_PADDING_WIDTH.W))
+  val pool_lpad = Reg(UInt(CONFIG_MVOUT_RS1_LEFT_ZERO_PADDING_WIDTH.W))
 
   val porow_counter = RegInit(0.U(pool_porows.getWidth.W))
   val pocol_counter = RegInit(0.U(pool_pocols.getWidth.W))
@@ -78,22 +78,26 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
   // Commands
   val cmd = Queue(io.cmd, st_queue_length)
   val vaddr = cmd.bits.cmd.rs1
-  val localaddr = cmd.bits.cmd.rs2.asTypeOf(local_addr_t)
-  val cols = cmd.bits.cmd.rs2(32 + mvout_cols_bits - 1, 32) // TODO magic numbers
-  val rows = cmd.bits.cmd.rs2(48 + mvout_rows_bits - 1, 48) // TODO magic numbers
+  val mvout_rs2 = cmd.bits.cmd.rs2.asTypeOf(new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t))
+  val localaddr = mvout_rs2.local_addr
+  val cols = mvout_rs2.num_cols
+  val rows = mvout_rs2.num_rows
   val blocks = (cols / block_cols.U) + (cols % block_cols.U =/= 0.U)
-  val config_stride = cmd.bits.cmd.rs2(31, 0) // TODO magic numbers
-  val config_activation = cmd.bits.cmd.rs1(3, 2) // TODO magic numbers
-  val config_acc_scale = cmd.bits.cmd.rs2(63, 32) // TODO magic numbers
-  val config_pool_stride = cmd.bits.cmd.rs1(5, 4) // TODO magic numbers
-  val config_pool_size = cmd.bits.cmd.rs1(7, 6) // TODO magic numbers
-  val config_pool_out_dim = cmd.bits.cmd.rs1(31, 24) // TODO magic numbers
-  val config_porows = cmd.bits.cmd.rs1(39, 32) // TODO magic numbers
-  val config_pocols = cmd.bits.cmd.rs1(47, 40) // TODO magic numbers
-  val config_orows = cmd.bits.cmd.rs1(55, 48) // TODO magic numbers
-  val config_ocols = cmd.bits.cmd.rs1(63, 56) // TODO magic numbers
-  val config_upad = cmd.bits.cmd.rs1(9, 8) // TODO magic numbers
-  val config_lpad = cmd.bits.cmd.rs1(11, 10) // TODO magic numbers
+
+  val config_mvout_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigMvoutRs1)
+  val config_mvout_rs2 = cmd.bits.cmd.rs2.asTypeOf(new ConfigMvoutRs2(acc_scale_t_bits, 32))
+  val config_stride = config_mvout_rs2.stride
+  val config_activation = config_mvout_rs1.activation
+  val config_acc_scale = config_mvout_rs2.acc_scale
+  val config_pool_stride = config_mvout_rs1.pool_stride
+  val config_pool_size = config_mvout_rs1.pool_size
+  val config_pool_out_dim = config_mvout_rs1.pool_out_dim
+  val config_porows = config_mvout_rs1.porows
+  val config_pocols = config_mvout_rs1.pocols
+  val config_orows = config_mvout_rs1.orows
+  val config_ocols = config_mvout_rs1.ocols
+  val config_upad = config_mvout_rs1.upad
+  val config_lpad = config_mvout_rs1.lpad
 
   val mstatus = cmd.bits.cmd.status
 

From 03fa3d1928b8e8f7bdba97f15beb91c1383c8f30 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Thu, 11 Nov 2021 20:18:59 -0800
Subject: [PATCH 04/11] Merge IISWC Tutorial Changes (#158)

Added new convenience scripts and config files
---
 .gitignore                                    |   4 +
 scripts/build-midas.sh                        |  19 ++
 scripts/build-onnx-inference.sh               |   7 +
 scripts/build-onnx-training.sh                |   7 +
 scripts/build-spike.sh                        |  12 ++
 scripts/build-vcs.sh                          |   5 +
 scripts/build-verilator.sh                    |   5 +
 scripts/run-midas.sh                          |  43 ++++
 scripts/run-spike.sh                          |  22 ++
 scripts/run-vcs-debug.sh                      |  22 ++
 scripts/run-vcs.sh                            |  23 +++
 scripts/run-verilator-debug.sh                |  23 +++
 scripts/run-verilator.sh                      |  23 +++
 scripts/setup-paths.sh                        |  36 ++++
 software/gemmini-rocc-tests                   |   2 +-
 src/main/scala/gemmini/AccumulatorMem.scala   |  25 ++-
 src/main/scala/gemmini/AccumulatorScale.scala |  41 ++--
 src/main/scala/gemmini/Arithmetic.scala       |  57 +-----
 src/main/scala/gemmini/Configs.scala          |  85 ++++----
 src/main/scala/gemmini/ConfigsFP.scala        |  30 +--
 src/main/scala/gemmini/Controller.scala       | 123 +++++-------
 src/main/scala/gemmini/CounterFile.scala      |  28 ++-
 src/main/scala/gemmini/CustomCPUConfigs.scala |  20 ++
 src/main/scala/gemmini/CustomConfigs.scala    |  60 ++++++
 src/main/scala/gemmini/CustomSoCConfigs.scala |  24 +++
 src/main/scala/gemmini/DMA.scala              |  81 ++++++--
 src/main/scala/gemmini/DSEConfigs.scala       |  27 +--
 .../scala/gemmini/ExecuteController.scala     |  29 ++-
 src/main/scala/gemmini/FrontendTLB.scala      |  20 +-
 src/main/scala/gemmini/GemminiConfigs.scala   | 189 +++++++++++-------
 src/main/scala/gemmini/Im2Col.scala           |   4 +-
 src/main/scala/gemmini/LoadController.scala   |  11 +-
 src/main/scala/gemmini/LoopConv.scala         |  38 ++--
 .../{ROB.scala => ReservationStation.scala}   |  56 ++++--
 src/main/scala/gemmini/Scratchpad.scala       |  39 ++--
 src/main/scala/gemmini/StoreController.scala  |  14 +-
 .../gemmini/TransposePreloadUnroller.scala    |   1 +
 src/main/scala/gemmini/XactTracker.scala      |  25 ++-
 38 files changed, 891 insertions(+), 389 deletions(-)
 create mode 100755 scripts/build-midas.sh
 create mode 100755 scripts/build-onnx-inference.sh
 create mode 100755 scripts/build-onnx-training.sh
 create mode 100755 scripts/build-spike.sh
 create mode 100755 scripts/build-vcs.sh
 create mode 100755 scripts/build-verilator.sh
 create mode 100755 scripts/run-midas.sh
 create mode 100755 scripts/run-spike.sh
 create mode 100755 scripts/run-vcs-debug.sh
 create mode 100755 scripts/run-vcs.sh
 create mode 100755 scripts/run-verilator-debug.sh
 create mode 100755 scripts/run-verilator.sh
 create mode 100755 scripts/setup-paths.sh
 create mode 100644 src/main/scala/gemmini/CustomCPUConfigs.scala
 create mode 100644 src/main/scala/gemmini/CustomConfigs.scala
 create mode 100644 src/main/scala/gemmini/CustomSoCConfigs.scala
 rename src/main/scala/gemmini/{ROB.scala => ReservationStation.scala} (87%)

diff --git a/.gitignore b/.gitignore
index 376625eb..53469249 100644
--- a/.gitignore
+++ b/.gitignore
@@ -338,3 +338,7 @@ project/plugins/project/
 # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
 hs_err_pid*
 
+# Gemmini specific
+configs/
+generated-src/
+
diff --git a/scripts/build-midas.sh b/scripts/build-midas.sh
new file mode 100755
index 00000000..cfc2347f
--- /dev/null
+++ b/scripts/build-midas.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Runs "make verilator" in the FireSim sim/ directory, with a TARGET_CONFIG
+# built from the DRAM controller model given as the first argument.
+# The relative paths below are resolved from the current working directory.
+
+if [ "$1" == "--help" ]; then
+    echo usage: "$0" DRAM_CONTROLLER_MODEL
+    echo " " DRAM_CONTROLLER_MODEL: Either DDR3FCFS or DDR3FRFCFS or DDR3FRFCFSLLC4MB
+    echo "   " FCFS is "first come, first serve"
+    echo "   " FRFCFS is "first ready, first come, first serve"
+    exit
+elif [ "$1" == "" ]; then
+    echo DRAM model must be provided
+    exit 1
+fi
+
+# Stop if a directory is missing instead of running later steps in the wrong place.
+cd ../../sims/firesim/ || exit 1
+source sourceme-f1-manager.sh &> build.log
+
+cd sim/ || exit 1
+make verilator TARGET_CONFIG="${1}_WithDefaultFireSimBridges_WithFireSimConfigTweaks_chipyard.CustomGemminiSoCConfig"
+
diff --git a/scripts/build-onnx-inference.sh b/scripts/build-onnx-inference.sh
new file mode 100755
index 00000000..23742f5c
--- /dev/null
+++ b/scripts/build-onnx-inference.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Rebuilds onnxruntime-riscv from a clean build directory with the int8 systolic
+# option ON and the fp32 option OFF, then builds the imagenet_runner.
+
+# Stop if the cd fails so the rm -rf below can never run in the wrong directory.
+cd /root/chipyard/generators/gemmini/software/onnxruntime-riscv/ || exit 1
+rm -rf ./build/
+./build.sh --parallel --enable_training --config=Debug --cmake_extra_defines onnxruntime_USE_SYSTOLIC=ON onnxruntime_SYSTOLIC_INT8=ON onnxruntime_SYSTOLIC_FP32=OFF
+cd ./systolic_runner/imagenet_runner/ || exit 1
+./build.sh --parallel --enable_training --config=Debug
diff --git a/scripts/build-onnx-training.sh b/scripts/build-onnx-training.sh
new file mode 100755
index 00000000..55c9bc7b
--- /dev/null
+++ b/scripts/build-onnx-training.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Rebuilds onnxruntime-riscv from a clean build directory with the fp32 systolic
+# option ON and the int8 option OFF, then builds the imagenet_trainer.
+
+# Stop if the cd fails so the rm -rf below can never run in the wrong directory.
+cd /root/chipyard/generators/gemmini/software/onnxruntime-riscv/ || exit 1
+rm -rf ./build/
+./build.sh --parallel --enable_training --config=Debug --cmake_extra_defines onnxruntime_USE_SYSTOLIC=ON onnxruntime_SYSTOLIC_INT8=OFF onnxruntime_SYSTOLIC_FP32=ON
+cd ./systolic_runner/imagenet_trainer/ || exit 1
+./build.sh --enable_training
diff --git a/scripts/build-spike.sh b/scripts/build-spike.sh
new file mode 100755
index 00000000..0b678a0b
--- /dev/null
+++ b/scripts/build-spike.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Regenerates gemmini_params.h through the Verilator "verilog" make target, copies it
+# into the riscv-isa-sim source tree, then runs make and make install there.
+
+export GEMMINI_ONLY_GENERATE_GEMMINI_H=1
+
+# Every cd is guarded so a missing directory stops the script before later steps run elsewhere.
+cd ../../sims/verilator/ || exit 1
+echo Generating new gemmini_params.h file...
+make verilog CONFIG=CustomGemminiSoCConfig &> build.log
+
+cd - || exit 1
+cp software/gemmini-rocc-tests/include/gemmini_params.h ../../toolchains/esp-tools/riscv-isa-sim/gemmini/gemmini_params.h
+cd ../../toolchains/esp-tools/riscv-isa-sim/build || exit 1
+make && make install
diff --git a/scripts/build-vcs.sh b/scripts/build-vcs.sh
new file mode 100755
index 00000000..d18c7e5e
--- /dev/null
+++ b/scripts/build-vcs.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# Runs make in the VCS simulation directory for CustomGemminiSoCConfig.
+# All script arguments are forwarded to make; "$@" keeps each argument intact.
+
+cd ../../sims/vcs/ || exit 1
+make "$@" CONFIG=CustomGemminiSoCConfig
+
diff --git a/scripts/build-verilator.sh b/scripts/build-verilator.sh
new file mode 100755
index 00000000..65053fc2
--- /dev/null
+++ b/scripts/build-verilator.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# Runs make in the Verilator simulation directory for CustomGemminiSoCConfig.
+# All script arguments are forwarded to make; "$@" keeps each argument intact.
+
+cd ../../sims/verilator/ || exit 1
+make "$@" CONFIG=CustomGemminiSoCConfig
+
diff --git a/scripts/run-midas.sh b/scripts/run-midas.sh
new file mode 100755
index 00000000..115125e5
--- /dev/null
+++ b/scripts/run-midas.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+# Runs a Gemmini binary on the FireSim (MIDAS) simulator built for
+# CustomGemminiSoCConfig.
+#
+# usage: run-midas.sh DRAM_CONTROLLER_MODEL binary
+#
+# "binary" is either the name of a test under
+# software/gemmini-rocc-tests/build/{bareMetalC,imagenet} (without the
+# "-baremetal" suffix) or a path to an existing file.
+
+if [ "$1" == "--help" ]; then
+    echo usage: $0 DRAM_CONTROLLER_MODEL binary
+    echo " " DRAM_CONTROLLER_MODEL: Either DDR3FCFS or DDR3FRFCFS or DDR3FRFCFSLLC4MB
+    echo "   " FCFS is "first come, first serve"
+    echo "   " FRFCFS is "first ready, first come, first serve"
+    exit
+elif [ "$1" == "" ]; then
+    echo DRAM model must be provided
+    exit 1
+fi
+
+path=""
+suffix=""
+
+binary="$2"
+
+if [ "$binary" == "" ]; then
+    echo You must provide a binary to run
+    # Without this the simulator would be launched with no program at all.
+    exit 1
+elif [ -f "software/gemmini-rocc-tests/build/bareMetalC/${binary}-baremetal" ]; then
+    path="$PWD/software/gemmini-rocc-tests/build/bareMetalC/"
+    suffix="-baremetal"
+elif [ -f "software/gemmini-rocc-tests/build/imagenet/${binary}-baremetal" ]; then
+    path="$PWD/software/gemmini-rocc-tests/build/imagenet/"
+    suffix="-baremetal"
+elif [ ! -f "$binary" ]; then
+    echo Binary not found
+    exit 1
+elif [[ "$binary" != /* ]]; then
+    # A relative path was checked against the current directory; anchor it here
+    # because the simulator is started from a different directory below.
+    path="$PWD/"
+fi
+
+gemminidir="$PWD"
+
+cd ../../sims/firesim/ || exit 1
+source sourceme-f1-manager.sh &> build.log
+
+cd sim/ || exit 1
+
+cd "generated-src/f1/FireSim-${1}_WithDefaultFireSimBridges_WithFireSimConfigTweaks_chipyard.CustomGemminiSoCConfig-BaseF1Config" || exit 1
+
+./VFireSim "${path}${binary}${suffix}" \
+    +vcs+initreg+0 +vcs+initmem+0 +fesvr-step-size=128 +mm_relaxFunctionalModel_0=0 +mm_openPagePolicy_0=1 +mm_backendLatency_0=2 +mm_schedulerWindowSize_0=8 +mm_transactionQueueDepth_0=8 +mm_dramTimings_tAL_0=0 +mm_dramTimings_tCAS_0=14 +mm_dramTimings_tCMD_0=1 +mm_dramTimings_tCWD_0=10 +mm_dramTimings_tCCD_0=4 +mm_dramTimings_tFAW_0=25 +mm_dramTimings_tRAS_0=33 +mm_dramTimings_tREFI_0=7800 +mm_dramTimings_tRC_0=47 +mm_dramTimings_tRCD_0=14 +mm_dramTimings_tRFC_0=160 +mm_dramTimings_tRRD_0=8 +mm_dramTimings_tRP_0=14 +mm_dramTimings_tRTP_0=8 +mm_dramTimings_tRTRS_0=2 +mm_dramTimings_tWR_0=15 +mm_dramTimings_tWTR_0=8 +mm_rowAddr_offset_0=18 +mm_rowAddr_mask_0=65535 +mm_rankAddr_offset_0=16 +mm_rankAddr_mask_0=3 +mm_bankAddr_offset_0=13 +mm_bankAddr_mask_0=7 +mm_llc_wayBits_0=3 +mm_llc_setBits_0=12 +mm_llc_blockBits_0=7 +mm_llc_activeMSHRs_0=8 +shmemportname0=0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +macaddr0=00:00:00:00:00:02 +niclog0=niclog0 +linklatency0=6405 +netbw0=100 +netburst0=8 +nic-loopback0 +tracefile=TRACEFILE +blkdev-in-mem0=128 +blkdev-log0=blkdev-log0 +autocounter-readrate=1000 +autocounter-filename=AUTOCOUNTERFILE +dramsim +max-cycles=100000000 \
+    2>/dev/null
diff --git a/scripts/run-spike.sh b/scripts/run-spike.sh
new file mode 100755
index 00000000..b5343e5d
--- /dev/null
+++ b/scripts/run-spike.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Runs a Gemmini binary on Spike with the gemmini extension loaded.
+#
+# usage: run-spike.sh binary
+#
+# "binary" is either the name of a test under
+# software/gemmini-rocc-tests/build/{bareMetalC,imagenet} (without the
+# "-baremetal" suffix) or a path to an existing file.
+
+path=""
+suffix=""
+
+binary="$1"
+
+if [ "$binary" == "" ]; then
+    echo You must provide a binary to run
+    # Without this, spike would be invoked with no program at all.
+    exit 1
+elif [ -f "software/gemmini-rocc-tests/build/bareMetalC/${binary}-baremetal" ]; then
+    path="software/gemmini-rocc-tests/build/bareMetalC/"
+    suffix="-baremetal"
+elif [ -f "software/gemmini-rocc-tests/build/imagenet/${binary}-baremetal" ]; then
+    path="software/gemmini-rocc-tests/build/imagenet/"
+    suffix="-baremetal"
+elif [ ! -f "$binary" ]; then
+    echo Binary not found
+    exit 1
+fi
+
+# Quoted so a path containing whitespace stays a single argument.
+spike --extension=gemmini "${path}${binary}${suffix}"
+
diff --git a/scripts/run-vcs-debug.sh b/scripts/run-vcs-debug.sh
new file mode 100755
index 00000000..a0b9b9e1
--- /dev/null
+++ b/scripts/run-vcs-debug.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Runs a Gemmini binary on the debug VCS simulator built for
+# CustomGemminiSoCConfig.
+#
+# usage: run-vcs-debug.sh binary
+#
+# "binary" is either the name of a test built under
+# gemmini-rocc-tests/build/{bareMetalC,imagenet} (without the "-baremetal"
+# suffix) or a path to an existing file.
+
+path=""
+suffix=""
+
+binary="$1"
+
+if [ "$binary" == "" ]; then
+    echo You must provide a binary to run
+    # Without this the simulator would be launched with no program at all.
+    exit 1
+elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/${binary}-baremetal" ]; then
+    path="../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/"
+    suffix="-baremetal"
+elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/${binary}-baremetal" ]; then
+    path="../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/"
+    suffix="-baremetal"
+elif [ ! -f "$binary" ]; then
+    echo Binary not found
+    exit 1
+elif [[ "$binary" != /* ]]; then
+    # A relative path was checked against the current directory; anchor it here
+    # because the simulator is started from a different directory below.
+    path="$PWD/"
+fi
+
+cd ../../sims/vcs/ || exit 1
+./simv-chipyard-CustomGemminiSoCConfig-debug "${path}${binary}${suffix}"
diff --git a/scripts/run-vcs.sh b/scripts/run-vcs.sh
new file mode 100755
index 00000000..ede89561
--- /dev/null
+++ b/scripts/run-vcs.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Runs a Gemmini binary on the VCS simulator built for
+# CustomGemminiSoCConfig.
+#
+# usage: run-vcs.sh binary
+#
+# "binary" is either the name of a test built under
+# gemmini-rocc-tests/build/{bareMetalC,imagenet} (without the "-baremetal"
+# suffix) or a path to an existing file.
+
+path=""
+suffix=""
+
+binary="$1"
+
+if [ "$binary" == "" ]; then
+    echo You must provide a binary to run
+    # Without this the simulator would be launched with no program at all.
+    exit 1
+elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/${binary}-baremetal" ]; then
+    path="../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/"
+    suffix="-baremetal"
+elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/${binary}-baremetal" ]; then
+    path="../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/"
+    suffix="-baremetal"
+elif [ ! -f "$binary" ]; then
+    echo Binary not found
+    exit 1
+elif [[ "$binary" != /* ]]; then
+    # A relative path was checked against the current directory; anchor it here
+    # because the simulator is started from a different directory below.
+    path="$PWD/"
+fi
+
+cd ../../sims/vcs/ || exit 1
+./simv-chipyard-CustomGemminiSoCConfig "${path}${binary}${suffix}"
+
diff --git a/scripts/run-verilator-debug.sh b/scripts/run-verilator-debug.sh
new file mode 100755
index 00000000..f856429b
--- /dev/null
+++ b/scripts/run-verilator-debug.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Runs a Gemmini binary on the debug Verilator simulator built for
+# CustomGemminiSoCConfig, passing "-v waveform.vcd" to the simulator.
+#
+# usage: run-verilator-debug.sh binary
+#
+# "binary" is either the name of a test built under
+# gemmini-rocc-tests/build/{bareMetalC,imagenet} (without the "-baremetal"
+# suffix) or a path to an existing file.
+
+path=""
+suffix=""
+
+binary="$1"
+
+if [ "$binary" == "" ]; then
+    echo You must provide a binary to run
+    # Without this the simulator would be launched with no program at all.
+    exit 1
+elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/${binary}-baremetal" ]; then
+    path="../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/"
+    suffix="-baremetal"
+elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/${binary}-baremetal" ]; then
+    path="../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/"
+    suffix="-baremetal"
+elif [ ! -f "$binary" ]; then
+    echo Binary not found
+    exit 1
+elif [[ "$binary" != /* ]]; then
+    # A relative path was checked against the current directory; anchor it here
+    # because the simulator is started from a different directory below.
+    path="$PWD/"
+fi
+
+cd ../../sims/verilator/ || exit 1
+./simulator-chipyard-CustomGemminiSoCConfig-debug -v waveform.vcd "${path}${binary}${suffix}"
+
diff --git a/scripts/run-verilator.sh b/scripts/run-verilator.sh
new file mode 100755
index 00000000..6e4ede18
--- /dev/null
+++ b/scripts/run-verilator.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Runs a Gemmini binary on the Verilator simulator built for
+# CustomGemminiSoCConfig.
+#
+# usage: run-verilator.sh binary
+#
+# "binary" is either the name of a test built under
+# gemmini-rocc-tests/build/{bareMetalC,imagenet} (without the "-baremetal"
+# suffix) or a path to an existing file.
+
+path=""
+suffix=""
+
+binary="$1"
+
+if [ "$binary" == "" ]; then
+    echo You must provide a binary to run
+    # Without this the simulator would be launched with no program at all.
+    exit 1
+elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/${binary}-baremetal" ]; then
+    path="../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/"
+    suffix="-baremetal"
+elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/${binary}-baremetal" ]; then
+    path="../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/"
+    suffix="-baremetal"
+elif [ ! -f "$binary" ]; then
+    echo Binary not found
+    exit 1
+elif [[ "$binary" != /* ]]; then
+    # A relative path was checked against the current directory; anchor it here
+    # because the simulator is started from a different directory below.
+    path="$PWD/"
+fi
+
+cd ../../sims/verilator/ || exit 1
+./simulator-chipyard-CustomGemminiSoCConfig "${path}${binary}${suffix}"
+
diff --git a/scripts/setup-paths.sh b/scripts/setup-paths.sh
new file mode 100755
index 00000000..a4e5dfd0
--- /dev/null
+++ b/scripts/setup-paths.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Creates configs/ and generated-src/ in the current directory and fills them
+# with symlinks to the Gemmini config sources and the simulators' generated-src
+# directories. Entries that already exist are skipped.
+
+mkdir -p configs/ generated-src/
+
+if [ ! -f configs/GemminiDefaultConfigs.scala ]; then
+    ln -s "$PWD/src/main/scala/gemmini/Configs.scala" configs/GemminiDefaultConfigs.scala
+fi
+
+if [ ! -f configs/GemminiCustomConfigs.scala ]; then
+    ln -s "$PWD/src/main/scala/gemmini/CustomConfigs.scala" configs/GemminiCustomConfigs.scala
+fi
+
+# The CPU and SoC configs are copied into ../chipyard with their first and last
+# lines stripped, and the link points at that copy.
+if [ ! -f configs/CPUConfigs.scala ]; then
+    sed '1,1d; $d' "$PWD/src/main/scala/gemmini/CustomCPUConfigs.scala" > ../chipyard/src/main/scala/config/GemminiCPUConfigs.scala
+    ln -s "$PWD/../chipyard/src/main/scala/config/GemminiCPUConfigs.scala" configs/CPUConfigs.scala
+fi
+
+if [ ! -f configs/SoCConfigs.scala ]; then
+    sed '1,1d; $d' "$PWD/src/main/scala/gemmini/CustomSoCConfigs.scala" > ../chipyard/src/main/scala/config/GemminiSoCConfigs.scala
+    ln -s "$PWD/../chipyard/src/main/scala/config/GemminiSoCConfigs.scala" configs/SoCConfigs.scala
+fi
+
+if [ ! -f generated-src/verilator ] && [ ! -d generated-src/verilator ]; then
+    ln -s "$PWD/../../sims/verilator/generated-src/" generated-src/verilator 2>/dev/null
+fi
+
+if [ ! -f generated-src/vcs ] && [ ! -d generated-src/vcs ]; then
+    ln -s "$PWD/../../sims/vcs/generated-src/" generated-src/vcs 2>/dev/null
+fi
+
diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index fff5ae7a..3aaa2307 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit fff5ae7a1e3770f7e18b9675784185e9dd9d8d55
+Subproject commit 3aaa230733a9eba6edf4d14243d84595e017522f
diff --git a/src/main/scala/gemmini/AccumulatorMem.scala b/src/main/scala/gemmini/AccumulatorMem.scala
index 4d7d1e46..89a39182 100644
--- a/src/main/scala/gemmini/AccumulatorMem.scala
+++ b/src/main/scala/gemmini/AccumulatorMem.scala
@@ -53,8 +53,8 @@ class AccumulatorMemIO [T <: Data: Arithmetic, U <: Data](n: Int, t: Vec[Vec[T]]
 }
 
 class AccumulatorMem[T <: Data, U <: Data](
-  n: Int, t: Vec[Vec[T]], scale_args: ScaleArguments[T, U],
-  acc_singleported: Boolean, num_acc_sub_banks: Int
+                                            n: Int, t: Vec[Vec[T]], scale_func: (T, U) => T, scale_t: U,
+                                            acc_singleported: Boolean, acc_sub_banks: Int
 )
   (implicit ev: Arithmetic[T]) extends Module {
   // TODO Do writes in this module work with matrices of size 2? If we try to read from an address right after writing
@@ -69,7 +69,7 @@ class AccumulatorMem[T <: Data, U <: Data](
   import ev._
 
   // TODO unify this with TwoPortSyncMemIO
-  val io = IO(new AccumulatorMemIO(n, t, scale_args.multiplicand_t))
+  val io = IO(new AccumulatorMemIO(n, t, scale_t))
 
 
   // For any write operation, we spend 2 cycles reading the existing address out, buffering it in a register, and then
@@ -109,10 +109,10 @@ class AccumulatorMem[T <: Data, U <: Data](
     reads(1).bits  := io.read.req.bits.addr
     reads(1).ready := true.B
     block_read_req := !reads(1).ready
-    for (i <- 0 until num_acc_sub_banks) {
-      def isThisBank(addr: UInt) = addr(log2Ceil(num_acc_sub_banks)-1,0) === i.U
-      def getBankIdx(addr: UInt) = addr >> log2Ceil(num_acc_sub_banks)
-      val mem = SyncReadMem(n / num_acc_sub_banks, Vec(mask_len, mask_elem))
+    for (i <- 0 until acc_sub_banks) {
+      def isThisBank(addr: UInt) = addr(log2Ceil(acc_sub_banks)-1,0) === i.U
+      def getBankIdx(addr: UInt): UInt = (addr >> log2Ceil(acc_sub_banks)).asUInt()
+      val mem = SyncReadMem(n / acc_sub_banks, Vec(mask_len, mask_elem))
 
       val ren = WireInit(false.B)
       val raddr = WireInit(getBankIdx(reads(0).bits))
@@ -123,7 +123,7 @@ class AccumulatorMem[T <: Data, U <: Data](
         val valid = Bool()
         val data = Vec(mask_len, mask_elem)
         val mask = Vec(mask_len, Bool())
-        val addr = UInt(log2Ceil(n/num_acc_sub_banks).W)
+        val addr = UInt(log2Ceil(n/acc_sub_banks).W)
         override def cloneType: this.type = new W_Q_Entry(mask_len, mask_elem).asInstanceOf[this.type]
       }
       val w_q = Reg(Vec(nEntries, new W_Q_Entry(mask_len, mask_elem)))
@@ -134,6 +134,7 @@ class AccumulatorMem[T <: Data, U <: Data](
             isThisBank(io.write.bits.addr) && getBankIdx(io.write.bits.addr) === e.addr &&
             ((io.write.bits.mask.asUInt & e.mask.asUInt) =/= 0.U)
           ))
+
           when (io.read.req.valid && isThisBank(io.read.req.bits.addr) && getBankIdx(io.read.req.bits.addr) === e.addr) {
             reads(1).ready := false.B
           }
@@ -149,7 +150,7 @@ class AccumulatorMem[T <: Data, U <: Data](
       val wmask = Mux1H(w_q_head.asBools, w_q.map(_.mask))
       val waddr = Mux1H(w_q_head.asBools, w_q.map(_.addr))
       when (wen) {
-        w_q_head := w_q_head << 1 | w_q_head(nEntries-1)
+        w_q_head := (w_q_head << 1).asUInt() | w_q_head(nEntries-1)
         for (i <- 0 until nEntries) {
           when (w_q_head(i)) {
             w_q(i).valid := false.B
@@ -159,7 +160,7 @@ class AccumulatorMem[T <: Data, U <: Data](
 
       when (w_buf_valid && isThisBank(waddr_buf)) {
         assert(!((w_q_tail.asBools zip w_q.map(_.valid)).map({ case (h,v) => h && v }).reduce(_||_)))
-        w_q_tail := w_q_tail << 1 | w_q_tail(nEntries-1)
+        w_q_tail := (w_q_tail << 1).asUInt() | w_q_tail(nEntries-1)
         for (i <- 0 until nEntries) {
           when (w_q_tail(i)) {
             w_q(i).valid := true.B
@@ -198,7 +199,7 @@ class AccumulatorMem[T <: Data, U <: Data](
     }
   }
 
-  val q = Module(new Queue(new AccumulatorReadResp(t, scale_args.multiplicand_t, log2Ceil(t.head.head.getWidth)),  1, true, true))
+  val q = Module(new Queue(new AccumulatorReadResp(t, scale_t, log2Ceil(t.head.head.getWidth)),  1, true, true))
   q.io.enq.bits.data := read_rdata
   q.io.enq.bits.scale := RegNext(io.read.req.bits.scale)
   q.io.enq.bits.relu6_shift := RegNext(io.read.req.bits.relu6_shift)
@@ -228,8 +229,6 @@ class AccumulatorMem[T <: Data, U <: Data](
       !block_read_req
   )
 
-  // io.write.current_waddr.valid := mem.io.wen
-  // io.write.current_waddr.bits := mem.io.waddr
   io.write.ready := !io.write.bits.acc || (!(io.write.bits.addr === waddr_buf && w_buf_valid) &&
     !(io.write.bits.addr === RegNext(io.write.bits.addr) && RegNext(io.write.fire())))
 
diff --git a/src/main/scala/gemmini/AccumulatorScale.scala b/src/main/scala/gemmini/AccumulatorScale.scala
index 2069bc66..5e4997f8 100644
--- a/src/main/scala/gemmini/AccumulatorScale.scala
+++ b/src/main/scala/gemmini/AccumulatorScale.scala
@@ -30,7 +30,7 @@ class AccumulatorScaleIO[T <: Data: Arithmetic, U <: Data](
     shift_width, rDataType).asInstanceOf[this.type]
 }
 
-class AccScaleDataWithIndex[T <: Data: Arithmetic, U <: Data](t: T, u: U, scale_args: ScaleArguments[T, U]) extends Bundle {
+class AccScaleDataWithIndex[T <: Data: Arithmetic, U <: Data](t: T, u: U) extends Bundle {
   val shift_width = log2Ceil(t.getWidth)
 
   val scale = u.cloneType
@@ -40,24 +40,23 @@ class AccScaleDataWithIndex[T <: Data: Arithmetic, U <: Data](t: T, u: U, scale_
   val full_data = t.cloneType
   val id = UInt(2.W) // TODO hardcoded
   val index = UInt()
-  override def cloneType: this.type = new AccScaleDataWithIndex(t, u, scale_args: ScaleArguments[T, U]).asInstanceOf[this.type]
+  override def cloneType: this.type = new AccScaleDataWithIndex(t, u).asInstanceOf[this.type]
 }
 
-class AccScalePipe[T <: Data : Arithmetic, U <: Data](t: T, rDataType: Vec[Vec[T]], scale_args: ScaleArguments[T, U])(implicit ev: Arithmetic[T]) extends Module {
-  val u = scale_args.multiplicand_t
+class AccScalePipe[T <: Data : Arithmetic, U <: Data](t: T, rDataType: Vec[Vec[T]], scale_func: (T, U) => T, scale_t: U, latency: Int, has_nonlinear_activations: Boolean)(implicit ev: Arithmetic[T]) extends Module {
+  val u = scale_t
   val io = IO(new Bundle {
-    val in = Input(Valid(new AccScaleDataWithIndex(t, u, scale_args)(ev)))
-    val out = Output(Valid(new AccScaleDataWithIndex(t, u, scale_args)(ev)))
+    val in = Input(Valid(new AccScaleDataWithIndex(t, u)(ev)))
+    val out = Output(Valid(new AccScaleDataWithIndex(t, u)(ev)))
   })
   import ev._
-  val latency = scale_args.latency
   val out = WireInit(io.in)
 
-  val e_scaled = scale_args.scale_func(io.in.bits.data, io.in.bits.scale)
+  val e_scaled = scale_func(io.in.bits.data, io.in.bits.scale)
   val e_clipped = e_scaled.clippedToWidthOf(rDataType.head.head)
   val e_act = MuxCase(e_clipped, Seq(
-    (io.in.bits.act === Activation.RELU) -> e_clipped.relu,
-    (io.in.bits.act === Activation.RELU6) -> e_clipped.relu6(io.in.bits.relu6_shift)))
+    (has_nonlinear_activations.B && io.in.bits.act === Activation.RELU) -> e_clipped.relu,
+    (has_nonlinear_activations.B && io.in.bits.act === Activation.RELU6) -> e_clipped.relu6(io.in.bits.relu6_shift)))
 
   out.bits.data := e_act
   io.out := Pipe(out, latency)
@@ -68,9 +67,13 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data](
   fullDataType: Vec[Vec[T]], rDataType: Vec[Vec[T]],
   scale_t: U, shift_width: Int,
   read_small_data: Boolean, read_full_data: Boolean,
-  scale_args: ScaleArguments[T, U])(implicit ev: Arithmetic[T]) extends Module {
+  scale_func: (T, U) => T,
+  num_scale_units: Int,
+  latency: Int,
+  has_nonlinear_activations: Boolean)(implicit ev: Arithmetic[T]) extends Module {
 
   import ev._
+
   val io = IO(new AccumulatorScaleIO[T,U](
     fullDataType, scale_t, shift_width, rDataType
   )(ev))
@@ -78,9 +81,6 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data](
   val out = Wire(Decoupled(new AccumulatorScaleResp[T](
     fullDataType, rDataType)(ev)))
 
-  val num_scale_units = scale_args.num_scale_units
-  val acc_scale_latency = scale_args.latency
-
   if (num_scale_units == -1) {
     val in = Wire(Decoupled(new AccumulatorReadRespWithFullData(fullDataType, scale_t, shift_width)(ev)))
     in.valid := io.in.valid
@@ -88,11 +88,10 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data](
     in.bits.resp := io.in.bits
     in.bits.full_data := io.in.bits.data
 
-    val pipe_out = Pipeline(in, acc_scale_latency, Seq.fill(acc_scale_latency)((x: AccumulatorReadRespWithFullData[T,U]) => x) :+ {
+    val pipe_out = Pipeline(in, latency, Seq.fill(latency)((x: AccumulatorReadRespWithFullData[T,U]) => x) :+ {
       x: AccumulatorReadRespWithFullData[T,U] =>
       val activated_rdata = VecInit(x.resp.data.map(v => VecInit(v.map { e =>
-        // val e_scaled = e >> x.shiftls
-        val e_scaled = scale_args.scale_func(e, x.resp.scale)
+        val e_scaled = scale_func(e, x.resp.scale)
         val e_clipped = e_scaled.clippedToWidthOf(rDataType.head.head)
         val e_act = MuxCase(e_clipped, Seq(
           (x.resp.act === Activation.RELU) -> e_clipped.relu,
@@ -148,7 +147,7 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data](
       tail_oh := (tail_oh << 1) | tail_oh(nEntries-1)
     }
 
-    val inputs = Seq.fill(width*nEntries) { Wire(Decoupled(new AccScaleDataWithIndex(t, scale_t, scale_args)(ev))) }
+    val inputs = Seq.fill(width*nEntries) { Wire(Decoupled(new AccScaleDataWithIndex(t, scale_t)(ev))) }
 
     for (i <- 0 until nEntries) {
       for (w <- 0 until width) {
@@ -168,16 +167,16 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data](
     }
     for (i <- 0 until num_scale_units) {
       val arbIn = inputs.zipWithIndex.filter({ case (_, w) => w % num_scale_units == i }).map(_._1)
-      val arb = Module(new RRArbiter(new AccScaleDataWithIndex(t, scale_t, scale_args)(ev), arbIn.length))
+      val arb = Module(new RRArbiter(new AccScaleDataWithIndex(t, scale_t)(ev), arbIn.length))
       arb.io.in <> arbIn
       arb.io.out.ready := true.B
-      val arbOut = Reg(Valid(new AccScaleDataWithIndex(t, scale_t, scale_args)(ev)))
+      val arbOut = Reg(Valid(new AccScaleDataWithIndex(t, scale_t)(ev)))
       arbOut.valid := arb.io.out.valid
       arbOut.bits  := arb.io.out.bits
       when (reset.asBool) {
         arbOut.valid := false.B
       }
-      val pipe = Module(new AccScalePipe(t, rDataType, scale_args)(ev, ev))
+      val pipe = Module(new AccScalePipe(t, rDataType, scale_func, scale_t, latency, has_nonlinear_activations)(ev, ev))
       pipe.io.in := arbOut
       val pipe_out = pipe.io.out
 
diff --git a/src/main/scala/gemmini/Arithmetic.scala b/src/main/scala/gemmini/Arithmetic.scala
index 9170b834..a6684ec7 100644
--- a/src/main/scala/gemmini/Arithmetic.scala
+++ b/src/main/scala/gemmini/Arithmetic.scala
@@ -7,6 +7,14 @@ import chisel3._
 import chisel3.util._
 import hardfloat._
 
+// Bundles that represent the raw bits of custom datatypes
+case class Float(expWidth: Int, sigWidth: Int) extends Bundle {
+  val bits = UInt((expWidth + sigWidth).W)
+
+  val bias: Int = (1 << (expWidth-1)) - 1
+}
+
+// The Arithmetic typeclass which implements various arithmetic operations on custom datatypes
 abstract class Arithmetic[T <: Data] {
   implicit def cast(t: T): ArithmeticOps[T]
 }
@@ -248,30 +256,6 @@ object Arithmetic {
         val result = Wire(Float(self.expWidth, self.sigWidth))
         result.bits := fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out)
         result
-
-        /*
-        val raw = rawFloatFromFN(self.expWidth, self.sigWidth, self.bits)
-
-        val shifted_raw = WireInit(raw)
-
-        when (!raw.isZero) {
-          shifted_raw.sExp := raw.sExp - u.asSInt()
-        }
-
-        val raw_to_rec_fn_converter = Module(new RoundRawFNToRecFN(self.expWidth, self.sigWidth, options = 0)) // TODO add correct options here so that efficiency may be improved
-
-        raw_to_rec_fn_converter.io.invalidExc := false.B
-        raw_to_rec_fn_converter.io.infiniteExc := false.B
-
-        raw_to_rec_fn_converter.io.in := shifted_raw
-
-        raw_to_rec_fn_converter.io.roundingMode := consts.round_near_maxMag
-        raw_to_rec_fn_converter.io.detectTininess := consts.tininess_afterRounding
-
-        val result = Wire(Float(self.expWidth, self.sigWidth))
-        result.bits := fNFromRecFN(self.expWidth, self.sigWidth, raw_to_rec_fn_converter.io.out)
-        result
-        */
       }
 
       override def >(t: Float): Bool = {
@@ -357,25 +341,6 @@ object Arithmetic {
 
         val shifted_rec = muladder.io.out
 
-        /*
-        val six_raw = rawFloatFromIN(signedIn = false.B, in = 6.U(3.W))
-
-        val shifted_raw = WireInit(six_raw)
-
-        when (!six_raw.isZero) {
-          shifted_raw.sExp := six_raw.sExp + shift.asSInt()
-        }
-
-        val raw_to_rec_fn_converter = Module(new RoundRawFNToRecFN(self.expWidth, self.sigWidth, options = 0)) // TODO add correct options here so that efficiency may be improved
-        raw_to_rec_fn_converter.io.in := shifted_raw
-        raw_to_rec_fn_converter.io.roundingMode := consts.round_near_maxMag
-        raw_to_rec_fn_converter.io.detectTininess := consts.tininess_afterRounding
-        raw_to_rec_fn_converter.io.invalidExc := false.B
-        raw_to_rec_fn_converter.io.infiniteExc := false.B
-
-        val shifted_rec = raw_to_rec_fn_converter.io.out
-        */
-
         // Now, compare self and 6*(2^shift) to calculate the activation function
         val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits)
         val self_raw = rawFloatFromFN(self.expWidth, self.sigWidth, self.bits)
@@ -400,9 +365,3 @@ object Arithmetic {
     }
   }
 }
-
-case class Float(expWidth: Int, sigWidth: Int) extends Bundle {
-  val bits = UInt((expWidth + sigWidth).W)
-
-  val bias: Int = (1 << (expWidth-1)) - 1
-}
diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala
index 60f5f7b4..2e172adf 100644
--- a/src/main/scala/gemmini/Configs.scala
+++ b/src/main/scala/gemmini/Configs.scala
@@ -15,44 +15,55 @@ import hardfloat._
 
 object GemminiConfigs {
   val defaultConfig = GemminiArrayConfig[SInt, Float, Float](
-    opcodes = OpcodeSet.custom3,
+    // Datatypes
+    inputType = SInt(8.W),
+    accType = SInt(32.W),
+
+    spatialArrayOutputType = SInt(20.W),
 
+    // Spatial array size options
     tileRows = 1,
     tileColumns = 1,
     meshRows = 16,
     meshColumns = 16,
 
+    // Spatial array PE options
+    dataflow = Dataflow.BOTH,
+
+    // Scratchpad and accumulator
+    sp_capacity = CapacityInKilobytes(256),
+    acc_capacity = CapacityInKilobytes(64),
+
+    sp_banks = 4,
+    acc_banks = 2,
+
+    sp_singleported = true,
+    acc_singleported = false,
+
+    // DNN options
+    has_training_convs = true,
+    has_max_pool = true,
+    has_nonlinear_activations = true,
+
+    // Reservation station entries
+    reservation_station_full_entries = 16,
+    reservation_station_partial_entries = 8,
+
+    // Ld/Ex/St instruction queue lengths
     ld_queue_length = 8,
     st_queue_length = 2,
     ex_queue_length = 8,
 
-    rob_full_entries = 16,
-    rob_partial_entries = 8,
+    // DMA options
+    max_in_flight_mem_reqs = 16,
 
-    hasIm2col = false, //declare im2col block
+    dma_maxbytes = 64,
+    dma_buswidth = 128,
 
-    sp_banks = 4,
-    sp_singleported = true,
-    acc_banks = 2,
-    acc_singleported = false,
-    num_acc_sub_banks = -1,
-    sp_capacity = CapacityInKilobytes(256),
-    shifter_banks = 1, // TODO add separate parameters for left and up shifter banks
-    dataflow = Dataflow.BOTH,
-    acc_capacity = CapacityInKilobytes(64),
-    mem_pipeline = 4,
-    dma_maxbytes = 64, // TODO get this from cacheblockbytes
-    dma_buswidth = 128, // TODO get this from SystemBusKey
-    aligned_to = 1,
+    // TLB options
     tlb_size = 4,
-    use_tlb_register_filter = true,
-    max_in_flight_reqs = 16,
-    use_dedicated_tl_port = false,
-
-    inputType = SInt(8.W),
-    outputType = SInt(20.W),
-    accType = SInt(32.W),
 
+    // Mvin and Accumulator scalar multiply options
     mvin_scale_args = Some(ScaleArguments(
       (t: SInt, f: Float) => {
         val f_rec = recFNFromFN(f.expWidth, f.sigWidth, f.bits)
@@ -91,10 +102,11 @@ object GemminiConfigs {
       identity = "1.0",
       c_str = "({float y = ROUND_NEAR_EVEN((x) * (scale)); y > INT8_MAX ? INT8_MAX : (y < INT8_MIN ? INT8_MIN : (elem_t)y);})"
     )),
+
     mvin_scale_acc_args = None,
     mvin_scale_shared = false,
 
-    acc_scale_args = ScaleArguments(
+    acc_scale_args = Some(ScaleArguments(
       (t: SInt, f: Float) => {
         val f_rec = recFNFromFN(f.expWidth, f.sigWidth, f.bits)
 
@@ -128,40 +140,37 @@ object GemminiConfigs {
 
         Mux(overflow, sat, rec_fn_to_in.io.out.asTypeOf(t))
       },
-      1, Float(8, 24), -1, // TODO pipelining should be 5
+      1, Float(8, 24), -1,
       identity = "1.0",
       c_str = "({float y = ROUND_NEAR_EVEN((x) * (scale)); y > INT8_MAX ? INT8_MAX : (y < INT8_MIN ? INT8_MIN : (acc_t)y);})"
-    ),
+    )),
+
+    // SoC counters options
+    num_counter = 8,
 
+    // Scratchpad and Accumulator input/output options
     acc_read_full_width = true,
     acc_read_small_width = true,
 
-    pe_latency = 0,
-
     ex_read_from_spad = true,
     ex_read_from_acc = true,
     ex_write_to_spad = true,
     ex_write_to_acc = true,
-
-    hardcode_d_to_garbage_addr = false,
-
-    mesh_output_delay = 1,
-
-    num_counter = 8,
   )
 
   val chipConfig = defaultConfig.copy(sp_capacity=CapacityInKilobytes(64), acc_capacity=CapacityInKilobytes(32), dataflow=Dataflow.WS,
-    acc_scale_args=defaultConfig.acc_scale_args.copy(latency=4),
+    acc_scale_args=Some(defaultConfig.acc_scale_args.get.copy(latency=4)),
     acc_singleported=true,
-    num_acc_sub_banks=2,
+    acc_sub_banks=2,
     ex_read_from_acc=false,
     ex_write_to_spad=false
   )
+
   val largeChipConfig = chipConfig.copy(sp_capacity=CapacityInKilobytes(128), acc_capacity=CapacityInKilobytes(64),
     meshRows=32, meshColumns=32
   )
 
-  val leanConfig = defaultConfig.copy(dataflow=Dataflow.WS, max_in_flight_reqs = 64, acc_read_full_width = false, ex_read_from_acc = false, ex_write_to_spad = false, hardcode_d_to_garbage_addr = true)
+  val leanConfig = defaultConfig.copy(dataflow=Dataflow.WS, max_in_flight_mem_reqs = 64, acc_read_full_width = false, ex_read_from_acc = false, ex_write_to_spad = false, hardcode_d_to_garbage_addr = true)
 }
 
 /**
diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala
index 111041fd..a54c2853 100644
--- a/src/main/scala/gemmini/ConfigsFP.scala
+++ b/src/main/scala/gemmini/ConfigsFP.scala
@@ -24,41 +24,39 @@ object GemminiFPConfigs {
     st_queue_length = 2,
     ex_queue_length = 8,
 
-    rob_full_entries = 16,
-    rob_partial_entries = 8,
-
-    hasIm2col = false,
+    reservation_station_full_entries = 16,
+    reservation_station_partial_entries = 8,
 
     sp_banks = 4,
     sp_singleported = true,
     acc_banks = 1,
     acc_singleported = false,
-    num_acc_sub_banks = -1,
+    acc_sub_banks = -1,
     sp_capacity = CapacityInKilobytes(256),
     shifter_banks = 1, // TODO add separate parameters for left and up shifter banks
     dataflow = Dataflow.BOTH,
     acc_capacity = CapacityInKilobytes(64),
-    mem_pipeline = 1,
+    spad_read_delay = 1,
 
     dma_maxbytes = 64, // TODO get this from cacheblockbytes
     dma_buswidth = 128, // TODO get this from SystemBusKey
     aligned_to = 1,
     tlb_size = 4,
     use_tlb_register_filter = true,
-    max_in_flight_reqs = 16,
+    max_in_flight_mem_reqs = 16,
     use_dedicated_tl_port = false,
 
     inputType = Float(8, 24),
-    outputType = Float(8, 24),
+    spatialArrayOutputType = Float(8, 24),
     accType = Float(8, 24),
 
     mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
     mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
     mvin_scale_shared = false,
 
-    acc_scale_args = ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0",
+    acc_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0",
       c_str = "((x) * (scale))"
-    ),
+    )),
     acc_read_full_width = true,
     acc_read_small_width = true,
 
@@ -73,32 +71,36 @@ object GemminiFPConfigs {
 
     mesh_output_delay = 0,
 
+    has_training_convs = false,
+    has_max_pool = true,
+    has_nonlinear_activations = true,
+
     num_counter = 8,
   )
   
   //FP32 Single Precision Configuration
-  val FP32DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 24), outputType = Float(8, 24), accType = Float(8, 24),
+  val FP32DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 24), spatialArrayOutputType = Float(8, 24), accType = Float(8, 24),
                                                pe_latency = 2,
                                                mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                                mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                               )
   
   //FP16 Half Precision Configuration
-  val FP16DefaultConfig = defaultFPConfig.copy(inputType = Float(5, 11), outputType = Float(5, 11), accType = Float(8, 24),
+  val FP16DefaultConfig = defaultFPConfig.copy(inputType = Float(5, 11), spatialArrayOutputType = Float(5, 11), accType = Float(8, 24),
                                                pe_latency = 2,
                                                mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(5, 11), -1, identity = "1.0", c_str="((x) * (scale))")),
                                                mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(5, 11), -1, identity = "1.0", c_str="((x) * (scale))")),
                                               )
   
   //Bfloat16 Brain-half Precision Configuration
-  val BF16DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 8), outputType = Float(8, 8), accType = Float(8, 24),
+  val BF16DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 8), spatialArrayOutputType = Float(8, 8), accType = Float(8, 24),
                                                pe_latency = 2,
                                                mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                                mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                               )
 
   //Bfloat16 Brain-half Precision Configuration 8x8 array
-  val BF16Default8Config = defaultFPConfig.copy(inputType = Float(8, 8), outputType = Float(8, 8), accType = Float(8, 24),
+  val BF16Default8Config = defaultFPConfig.copy(inputType = Float(8, 8), spatialArrayOutputType = Float(8, 8), accType = Float(8, 24),
                                                meshRows = 8, meshColumns = 8,
                                                pe_latency = 2,
                                                mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala
index e63d7451..08481a5c 100644
--- a/src/main/scala/gemmini/Controller.scala
+++ b/src/main/scala/gemmini/Controller.scala
@@ -27,6 +27,9 @@ class Gemmini[T <: Data : Arithmetic, U <: Data, V <: Data](val config: GemminiA
     nPTWPorts = 1) {
 
   Files.write(Paths.get(config.headerFilePath), config.generateHeader().getBytes(StandardCharsets.UTF_8))
+  if (System.getenv("GEMMINI_ONLY_GENERATE_GEMMINI_H") == "1") {
+    System.exit(1)
+  }
 
   val xLen = p(XLen)
   val spad = LazyModule(new Scratchpad(config))
@@ -59,19 +62,13 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
 
   // TLB
   implicit val edge = outer.node.edges.out.head
-  val tlb = Module(new FrontendTLB(2, tlb_size, dma_maxbytes))
+  val tlb = Module(new FrontendTLB(2, tlb_size, dma_maxbytes, use_tlb_register_filter, use_firesim_simulation_counters))
   (tlb.io.clients zip outer.spad.module.io.tlb).foreach(t => t._1 <> t._2)
   tlb.io.exp.flush_skip := false.B
   tlb.io.exp.flush_retry := false.B
   counters.io.event_io.collect(tlb.io.counter)
 
   io.ptw.head <> tlb.io.ptw
-  /*io.ptw.head.req <> tlb.io.ptw.req
-  tlb.io.ptw.resp <> io.ptw.head.resp
-  tlb.io.ptw.ptbr := io.ptw.head.ptbr
-  tlb.io.ptw.status := outer.spad.module.io.mstatus
-  tlb.io.ptw.pmp := io.ptw.head.pmp
-  tlb.io.ptw.customCSRs := io.ptw.head.customCSRs*/
 
   spad.module.io.flush := tlb.io.exp.flush()
 
@@ -114,32 +111,28 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   val unrolled_cmd = LoopUnroller(raw_risc_cmd, outer.config.meshRows * outer.config.tileRows)
   */
 
-  // Incoming commands and ROB
-  val rob = Module(new ROB(outer.config, new RoCCCommand))
-  counters.io.event_io.collect(rob.io.counter)
+  // Incoming commands and reservation station
+  val reservation_station = Module(new ReservationStation(outer.config, new RoCCCommand))
+  counters.io.event_io.collect(reservation_station.io.counter)
 
   val raw_cmd = Queue(io.cmd)
 
-  val max_lds = rob_partial_entries
-  val max_exs = rob_full_entries
-  val max_sts = rob_partial_entries / 2
+  val max_lds = reservation_station_partial_entries
+  val max_exs = reservation_station_full_entries
+  val max_sts = reservation_station_partial_entries / 2
 
   // TODO replace 4,12,2 with parameters based on ROB size
-  val (conv_cmd, loop_conv_unroller_busy) = LoopConv(raw_cmd, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization,
+  val (conv_cmd, loop_conv_unroller_busy) = LoopConv(raw_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization,
     meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries,
     inputType.getWidth, accType.getWidth, dma_maxbytes,
     new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits), new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t),
     new ConfigMvoutRs2(acc_scale_t_bits, 32), new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t),
     new ConfigExRs1(acc_scale_t_bits), new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t),
     new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t),
-    new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t))
-
-  // val (compressed_cmd, compressor_busy) = InstCompressor(unrolled_cmd)
-  // compressed_cmd.ready := false.B
-
-  // val (unrolled_cmd, loop_matmul_unroller_busy) = LoopMatmul(unrolled_cmd_after_conv, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization,
+    new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t),
+    has_training_convs, has_max_pool)
 
-  val (loop_cmd, loop_matmul_unroller_busy) = LoopMatmul(conv_cmd, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization,
+  val (loop_cmd, loop_matmul_unroller_busy) = LoopMatmul(conv_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization,
     meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries,
     inputType.getWidth, accType.getWidth, dma_maxbytes, new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t),
     new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t),
@@ -150,19 +143,9 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   unrolled_cmd.ready := false.B
   counters.io.event_io.connectEventSignal(CounterEvent.LOOP_MATMUL_ACTIVE_CYCLES, loop_matmul_unroller_busy)
 
-  // val cmd_decompressor = Module(new InstDecompressor(rob_entries))
-
-  // cmd_decompressor.io.in.valid := rob.io.issue.ex.valid
-  // cmd_decompressor.io.in.bits.cmd := rob.io.issue.ex.cmd
-  // cmd_decompressor.io.in.bits.rob_id := rob.io.issue.ex.rob_id
-  // rob.io.issue.ex.ready := cmd_decompressor.io.in.ready
-
-  // val decompressed_cmd = cmd_decompressor.io.out
-
   // Wire up controllers to ROB
-  rob.io.alloc.valid := false.B
-  // rob.io.alloc.bits := compressed_cmd.bits
-  rob.io.alloc.bits := unrolled_cmd.bits
+  reservation_station.io.alloc.valid := false.B
+  reservation_station.io.alloc.bits := unrolled_cmd.bits
 
   /*
   //-------------------------------------------------------------------------
@@ -196,9 +179,9 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   tiler.io.issue.exec.ready := false.B
   */
 
-  rob.io.issue.ld.ready := false.B
-  rob.io.issue.st.ready := false.B
-  rob.io.issue.ex.ready := false.B
+  reservation_station.io.issue.ld.ready := false.B
+  reservation_station.io.issue.st.ready := false.B
+  reservation_station.io.issue.ex.ready := false.B
 
   /*
   when (is_cisc_mode) {
@@ -227,23 +210,23 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   }
   */
 
-  load_controller.io.cmd.valid := rob.io.issue.ld.valid
-  rob.io.issue.ld.ready := load_controller.io.cmd.ready
-  load_controller.io.cmd.bits.cmd := rob.io.issue.ld.cmd
-  load_controller.io.cmd.bits.cmd.inst.funct := rob.io.issue.ld.cmd.inst.funct
-  load_controller.io.cmd.bits.rob_id.push(rob.io.issue.ld.rob_id)
+  load_controller.io.cmd.valid := reservation_station.io.issue.ld.valid
+  reservation_station.io.issue.ld.ready := load_controller.io.cmd.ready
+  load_controller.io.cmd.bits.cmd := reservation_station.io.issue.ld.cmd
+  load_controller.io.cmd.bits.cmd.inst.funct := reservation_station.io.issue.ld.cmd.inst.funct
+  load_controller.io.cmd.bits.rob_id.push(reservation_station.io.issue.ld.rob_id)
 
-  store_controller.io.cmd.valid := rob.io.issue.st.valid
-  rob.io.issue.st.ready := store_controller.io.cmd.ready
-  store_controller.io.cmd.bits.cmd := rob.io.issue.st.cmd
-  store_controller.io.cmd.bits.cmd.inst.funct := rob.io.issue.st.cmd.inst.funct
-  store_controller.io.cmd.bits.rob_id.push(rob.io.issue.st.rob_id)
+  store_controller.io.cmd.valid := reservation_station.io.issue.st.valid
+  reservation_station.io.issue.st.ready := store_controller.io.cmd.ready
+  store_controller.io.cmd.bits.cmd := reservation_station.io.issue.st.cmd
+  store_controller.io.cmd.bits.cmd.inst.funct := reservation_station.io.issue.st.cmd.inst.funct
+  store_controller.io.cmd.bits.rob_id.push(reservation_station.io.issue.st.rob_id)
 
-  ex_controller.io.cmd.valid := rob.io.issue.ex.valid
-  rob.io.issue.ex.ready := ex_controller.io.cmd.ready
-  ex_controller.io.cmd.bits.cmd := rob.io.issue.ex.cmd
-  ex_controller.io.cmd.bits.cmd.inst.funct := rob.io.issue.ex.cmd.inst.funct
-  ex_controller.io.cmd.bits.rob_id.push(rob.io.issue.ex.rob_id)
+  ex_controller.io.cmd.valid := reservation_station.io.issue.ex.valid
+  reservation_station.io.issue.ex.ready := ex_controller.io.cmd.ready
+  ex_controller.io.cmd.bits.cmd := reservation_station.io.issue.ex.cmd
+  ex_controller.io.cmd.bits.cmd.inst.funct := reservation_station.io.issue.ex.cmd.inst.funct
+  ex_controller.io.cmd.bits.rob_id.push(reservation_station.io.issue.ex.rob_id)
 
   // Wire up scratchpad to controllers
   spad.module.io.dma.read <> load_controller.io.dma
@@ -284,9 +267,9 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   }
 
   // Wire up controllers to ROB
-  rob.io.alloc.valid := false.B
+  reservation_station.io.alloc.valid := false.B
   // rob.io.alloc.bits := compressed_cmd.bits
-  rob.io.alloc.bits := unrolled_cmd.bits
+  reservation_station.io.alloc.bits := unrolled_cmd.bits
 
   /*
   //=========================================================================
@@ -309,28 +292,28 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
 
   //-------------------------------------------------------------------------
   // risc
-  val rob_completed_arb = Module(new Arbiter(UInt(log2Up(rob_entries).W), 3))
+  val reservation_station_completed_arb = Module(new Arbiter(UInt(log2Up(rob_entries).W), 3))
 
-  rob_completed_arb.io.in(0).valid := ex_controller.io.completed.valid
-  rob_completed_arb.io.in(0).bits := ex_controller.io.completed.bits
+  reservation_station_completed_arb.io.in(0).valid := ex_controller.io.completed.valid
+  reservation_station_completed_arb.io.in(0).bits := ex_controller.io.completed.bits
 
-  rob_completed_arb.io.in(1) <> load_controller.io.completed
-  rob_completed_arb.io.in(2) <> store_controller.io.completed
+  reservation_station_completed_arb.io.in(1) <> load_controller.io.completed
+  reservation_station_completed_arb.io.in(2) <> store_controller.io.completed
 
   // mux with cisc frontend arbiter
-  rob_completed_arb.io.in(0).valid := ex_controller.io.completed.valid // && !is_cisc_mode
-  rob_completed_arb.io.in(1).valid := load_controller.io.completed.valid // && !is_cisc_mode
-  rob_completed_arb.io.in(2).valid := store_controller.io.completed.valid // && !is_cisc_mode
+  reservation_station_completed_arb.io.in(0).valid := ex_controller.io.completed.valid // && !is_cisc_mode
+  reservation_station_completed_arb.io.in(1).valid := load_controller.io.completed.valid // && !is_cisc_mode
+  reservation_station_completed_arb.io.in(2).valid := store_controller.io.completed.valid // && !is_cisc_mode
 
-  rob.io.completed.valid := rob_completed_arb.io.out.valid
-  rob.io.completed.bits := rob_completed_arb.io.out.bits
-  rob_completed_arb.io.out.ready := true.B
+  reservation_station.io.completed.valid := reservation_station_completed_arb.io.out.valid
+  reservation_station.io.completed.bits := reservation_station_completed_arb.io.out.bits
+  reservation_station_completed_arb.io.out.ready := true.B
 
   // Wire up global RoCC signals
-  io.busy := raw_cmd.valid || loop_conv_unroller_busy || loop_matmul_unroller_busy || rob.io.busy || spad.module.io.busy || unrolled_cmd.valid || loop_cmd.valid || conv_cmd.valid
+  io.busy := raw_cmd.valid || loop_conv_unroller_busy || loop_matmul_unroller_busy || reservation_station.io.busy || spad.module.io.busy || unrolled_cmd.valid || loop_cmd.valid || conv_cmd.valid
   io.interrupt := tlb.io.exp.interrupt
 
-  rob.io.solitary_preload := ex_controller.io.solitary_preload
+  reservation_station.io.solitary_preload := ex_controller.io.solitary_preload
 
   // assert(!io.interrupt, "Interrupt handlers have not been written yet")
 
@@ -344,7 +327,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   val incr_st_ex_cycles = !load_controller.io.busy && store_controller.io.busy && ex_controller.io.busy
 
   val incr_ld_st_ex_cycles = load_controller.io.busy && store_controller.io.busy && ex_controller.io.busy
-  
+
   counters.io.event_io.connectEventSignal(CounterEvent.MAIN_LD_CYCLES, incr_ld_cycles)
   counters.io.event_io.connectEventSignal(CounterEvent.MAIN_ST_CYCLES, incr_st_cycles)
   counters.io.event_io.connectEventSignal(CounterEvent.MAIN_EX_CYCLES, incr_ex_cycles)
@@ -372,12 +355,10 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
     */
 
     when (is_flush) {
-      // val skip = compressed_cmd.bits.rs1(0)
       val skip = unrolled_cmd.bits.rs1(0)
       tlb.io.exp.flush_skip := skip
       tlb.io.exp.flush_retry := !skip
 
-      // compressed_cmd.ready := true.B // TODO should we wait for an acknowledgement from the TLB?
       unrolled_cmd.ready := true.B // TODO should we wait for an acknowledgement from the TLB?
     }
 
@@ -387,9 +368,9 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
     }
 
     .otherwise {
-      rob.io.alloc.valid := true.B
+      reservation_station.io.alloc.valid := true.B
 
-      when(rob.io.alloc.fire()) {
+      when(reservation_station.io.alloc.fire()) {
         // compressed_cmd.ready := true.B
         unrolled_cmd.ready := true.B
       }
diff --git a/src/main/scala/gemmini/CounterFile.scala b/src/main/scala/gemmini/CounterFile.scala
index 9b75acf2..9f0482f7 100644
--- a/src/main/scala/gemmini/CounterFile.scala
+++ b/src/main/scala/gemmini/CounterFile.scala
@@ -64,8 +64,8 @@ object CounterEvent {
   val IM2COL_ACTIVE_CYCLES = 39
   val IM2COL_TRANSPOSER_WAIT_CYCLE = 40
 
-  val ROB_FULL_CYCLES = 41
-  val ROB_ACTIVE_CYCLES = 42
+  val RESERVATION_STATION_FULL_CYCLES = 41
+  val RESERVATION_STATION_ACTIVE_CYCLES = 42
 
   val LOOP_MATMUL_ACTIVE_CYCLES = 43
   val TRANSPOSE_PRELOAD_UNROLLER_ACTIVE_CYCLES = 44
@@ -76,14 +76,17 @@ object CounterEvent {
 object CounterExternal {
   val DISABLE = 0
 
-  val ROB_LD_COUNT = 1
-  val ROB_ST_COUNT = 2
-  val ROB_EX_COUNT = 3
+  val RESERVATION_STATION_LD_COUNT = 1
+  val RESERVATION_STATION_ST_COUNT = 2
+  val RESERVATION_STATION_EX_COUNT = 3
 
   val RDMA_BYTES_REC = 4
   val WDMA_BYTES_SENT = 5
 
-  val n = 6
+  val RDMA_TOTAL_LATENCY = 6
+  val WDMA_TOTAL_LATENCY = 7
+
+  val n = 8
 
   val EXTERNAL_WIDTH = 32
 }
@@ -145,6 +148,7 @@ class CounterIO(nPerfCounter: Int, counterWidth: Int) extends Bundle {
   val addr = Input(UInt(log2Ceil(nPerfCounter).W))
   val data = Output(UInt(counterWidth.W))
   val config_address = Flipped(Valid(UInt(log2Ceil(CounterEvent.n).W)))
+  val external = Input(Bool())
 
   val event_io = Flipped(new CounterEventIO)
 }
@@ -156,8 +160,9 @@ class CounterFile(nPerfCounter: Int, counterWidth: Int) extends Module
 {
   val io = IO(new CounterIO(nPerfCounter, counterWidth))
 
-  val config_width = log2Ceil(scala.math.max(CounterEvent.n, CounterExternal.n)) + 1;
+  val config_width = log2Ceil(scala.math.max(CounterEvent.n, CounterExternal.n)) + 1
   val counter_config = RegInit(VecInit.tabulate(nPerfCounter)(_ => 0.U(config_width.W)))
+  val counter_is_external = Reg(Vec(nPerfCounter, Bool()))
 
   io.event_io.external_reset := io.counter_reset
   withReset(reset.asBool || io.counter_reset) {
@@ -170,9 +175,10 @@ class CounterFile(nPerfCounter: Int, counterWidth: Int) extends Module
     // local counter
     val take_value = (config: UInt, counter: UInt) => {
       // Set the width
-      val external = Wire(UInt(counterWidth.W))
-      external := io.event_io.external_values(io.addr)
-      Mux(config(config_width - 1), external, counter)
+      val external = io.event_io.external_values(config)
+      val is_external = counter_is_external(io.addr)
+
+      Mux(is_external, external, counter)
     }
     // Snapshot: In case a sequence of access instructions get interrupted (i.e. preempted by OS), it is possible
     // to take a snapshot when reading counter value by setting a bit in the instruction. All subsequent readings
@@ -194,6 +200,7 @@ class CounterFile(nPerfCounter: Int, counterWidth: Int) extends Module
     // Write configuration reg
     when (io.config_address.valid) {
       counter_config(io.addr) := io.config_address.bits
+      counter_is_external(io.addr) := io.external
       counters(io.addr) := 0.U
     }
 
@@ -241,6 +248,7 @@ class CounterController(nPerfCounter: Int, counterWidth: Int)(implicit p: Parame
     module.io.snapshot := io.in.bits.rs1(2) & io.in.fire()
     module.io.config_address.valid := io.in.bits.rs1(3) & io.in.fire()
     module.io.config_address.bits := io.in.bits.rs1(17, 12)
+    module.io.external := io.in.bits.rs1(31)
 
     when (io.out.fire()) {
       out_valid_reg := false.B
diff --git a/src/main/scala/gemmini/CustomCPUConfigs.scala b/src/main/scala/gemmini/CustomCPUConfigs.scala
new file mode 100644
index 00000000..01c32a44
--- /dev/null
+++ b/src/main/scala/gemmini/CustomCPUConfigs.scala
@@ -0,0 +1,20 @@
+/*
+package chipyard
+
+import boom.common._
+import freechips.rocketchip.subsystem._
+
+object CustomGemmminiCPUConfigs {
+  // Default CPU configs
+  type RocketBigCores = WithNBigCores
+  type RocketMedCores = WithNMedCores
+  type RocketSmallCores = WithNSmallCores
+
+  type BoomLargeCores = WithNLargeBooms
+  type BoomMedCores = WithNMediumBooms
+  type BoomSmallCores = WithNSmallBooms
+
+  // Specify which CPU configs you want to build here
+  type CustomCPU = RocketBigCores
+}
+*/
\ No newline at end of file
diff --git a/src/main/scala/gemmini/CustomConfigs.scala b/src/main/scala/gemmini/CustomConfigs.scala
new file mode 100644
index 00000000..e1ed7199
--- /dev/null
+++ b/src/main/scala/gemmini/CustomConfigs.scala
@@ -0,0 +1,60 @@
+package gemmini
+
+import chipsalliance.rocketchip.config.{Config, Parameters}
+import chisel3._
+import freechips.rocketchip.diplomacy.LazyModule
+import freechips.rocketchip.subsystem.SystemBusKey
+import freechips.rocketchip.tile.BuildRoCC
+
+
+object GemminiCustomConfigs {
+  // Default configurations
+  val defaultConfig = GemminiConfigs.defaultConfig
+  val defaultFpConfig = GemminiFPConfigs.defaultFPConfig
+
+  // Create your own configs here
+  val baselineInferenceConfig = defaultConfig.copy(
+    has_training_convs = false,
+  )
+
+  val highPerfInferenceConfig = defaultConfig.copy(
+    meshRows = 32,
+    meshColumns = 32,
+
+    has_training_convs = false,
+
+    sp_capacity = CapacityInKilobytes(512),
+    acc_capacity = CapacityInKilobytes(128),
+  )
+
+  val trainingConfig = defaultFpConfig.copy(
+    inputType = Float(expWidth = 8, sigWidth = 24),
+    accType = Float(expWidth = 8, sigWidth = 24),
+
+    meshRows = 8,
+    meshColumns = 8,
+
+    has_training_convs = true,
+    has_max_pool =  false,
+
+    sp_capacity = CapacityInKilobytes(512),
+    acc_capacity = CapacityInKilobytes(128),
+  )
+
+  // Specify which of your custom configs you want to build here
+  val customConfig = baselineInferenceConfig
+}
+
+
+class GemminiCustomConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
+  gemminiConfig: GemminiArrayConfig[T,U,V] = GemminiCustomConfigs.customConfig
+) extends Config((site, here, up) => {
+  case BuildRoCC => up(BuildRoCC) ++ Seq(
+    (p: Parameters) => {
+      implicit val q = p
+      val gemmini = LazyModule(new Gemmini(gemminiConfig))
+      gemmini
+    }
+  )
+  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
+})
diff --git a/src/main/scala/gemmini/CustomSoCConfigs.scala b/src/main/scala/gemmini/CustomSoCConfigs.scala
new file mode 100644
index 00000000..aebfb520
--- /dev/null
+++ b/src/main/scala/gemmini/CustomSoCConfigs.scala
@@ -0,0 +1,24 @@
+/*
+package chipyard
+
+import freechips.rocketchip.config.{Config}
+
+class CustomGemminiSoCConfig extends Config(
+  new gemmini.GemminiCustomConfig ++
+
+  // Set your custom L2 configs
+  new chipyard.config.WithL2TLBs(512) ++
+
+  new freechips.rocketchip.subsystem.WithInclusiveCache(
+    nBanks = 1,
+    nWays = 8,
+    capacityKB = 512,
+    outerLatencyCycles = 40
+  ) ++
+
+  // Set the number of CPUs you want to create
+  new chipyard.CustomGemmminiCPUConfigs.CustomCPU(1) ++
+
+  new chipyard.config.AbstractConfig
+)
+*/
diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala
index c8e61c1e..c1cb51ef 100644
--- a/src/main/scala/gemmini/DMA.scala
+++ b/src/main/scala/gemmini/DMA.scala
@@ -14,6 +14,9 @@ import freechips.rocketchip.rocket.constants.MemoryOpConstants
 
 import Util._
 
+import midas.targetutils.PerfCounter
+import midas.targetutils.SynthesizePrintf
+
 class StreamReadRequest[U <: Data](spad_rows: Int, acc_rows: Int, mvin_scale_t_bits: Int)(implicit p: Parameters) extends CoreBundle {
   val vaddr = UInt(coreMaxAddrBits.W)
   val spaddr = UInt(log2Up(spad_rows max acc_rows).W) // TODO use LocalAddr in DMA
@@ -48,9 +51,9 @@ class StreamReadResponse[U <: Data](spadWidth: Int, accWidth: Int, spad_rows: In
 }
 
 class StreamReader[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], nXacts: Int, beatBits: Int, maxBytes: Int, spadWidth: Int, accWidth: Int, aligned_to: Int,
-                   spad_rows: Int, acc_rows: Int, meshRows: Int, use_tlb_register_filter: Boolean)
+                   spad_rows: Int, acc_rows: Int, meshRows: Int, use_tlb_register_filter: Boolean, use_firesim_simulation_counters: Boolean)
                   (implicit p: Parameters) extends LazyModule {
-  val core = LazyModule(new StreamReaderCore(config, nXacts, beatBits, maxBytes, spadWidth, accWidth, aligned_to, spad_rows, acc_rows, meshRows, use_tlb_register_filter))
+  val core = LazyModule(new StreamReaderCore(config, nXacts, beatBits, maxBytes, spadWidth, accWidth, aligned_to, spad_rows, acc_rows, meshRows, use_tlb_register_filter, use_firesim_simulation_counters))
   val node = core.node
 
   lazy val module = new LazyModuleImp(this) {
@@ -67,7 +70,7 @@ class StreamReader[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T
 
     val nCmds = (nXacts / meshRows) + 1
 
-    val xactTracker = Module(new XactTracker(nXacts, maxBytes, spadWidth, accWidth, spad_rows, acc_rows, maxBytes, config.mvin_scale_t_bits, nCmds))
+    val xactTracker = Module(new XactTracker(nXacts, maxBytes, spadWidth, accWidth, spad_rows, acc_rows, maxBytes, config.mvin_scale_t_bits, nCmds, use_firesim_simulation_counters))
 
     val beatPacker = Module(new BeatMerger(beatBits, maxBytes, spadWidth, accWidth, spad_rows, acc_rows, maxBytes, aligned_to, meshRows, config.mvin_scale_t_bits, nCmds))
 
@@ -102,6 +105,7 @@ class StreamReader[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T
     io.resp.bits.last := beatPacker.io.out.bits.last
 
     io.counter.collect(core.module.io.counter)
+    io.counter.collect(xactTracker.io.counter)
   }
 }
 
@@ -115,7 +119,8 @@ class StreamReadBeat (val nXacts: Int, val beatBits: Int, val maxReqBytes: Int)
 // TODO StreamReaderCore and StreamWriter are actually very alike. Is there some parent class they could both inherit from?
 class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], nXacts: Int, beatBits: Int, maxBytes: Int,
                                   spadWidth: Int, accWidth: Int, aligned_to: Int,
-                                  spad_rows: Int, acc_rows: Int, meshRows: Int, use_tlb_register_filter: Boolean)
+                                  spad_rows: Int, acc_rows: Int, meshRows: Int, use_tlb_register_filter: Boolean,
+                                  use_firesim_simulation_counters: Boolean)
                                  (implicit p: Parameters) extends LazyModule {
   val node = TLHelper.makeClientNode(
     name = "stream-reader", sourceId = IdRange(0, nXacts))
@@ -290,19 +295,32 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf
       state := s_req_new_block
     }
 
-
     // Performance counter
     CounterEventIO.init(io.counter)
     io.counter.connectEventSignal(CounterEvent.RDMA_ACTIVE_CYCLE, state =/= s_idle)
-    val bytes_read = RegInit(0.U(CounterExternal.EXTERNAL_WIDTH.W))
-    io.counter.connectExternalCounter(CounterExternal.RDMA_BYTES_REC, bytes_read)
-    when (io.counter.external_reset) {
-      bytes_read := 0.U
-    } .elsewhen (tl.d.fire()) {
-      bytes_read := bytes_read + 1.U << tl.d.bits.size
-    }
     io.counter.connectEventSignal(CounterEvent.RDMA_TLB_WAIT_CYCLES, io.tlb.resp.miss)
     io.counter.connectEventSignal(CounterEvent.RDMA_TL_WAIT_CYCLES, tl.a.valid && !tl.a.ready)
+
+    // External counters
+    val total_bytes_read = RegInit(0.U(CounterExternal.EXTERNAL_WIDTH.W))
+    when (io.counter.external_reset) {
+      total_bytes_read := 0.U
+    }.elsewhen (tl.d.fire()) {
+      total_bytes_read := total_bytes_read + (1.U << tl.d.bits.size)
+    }
+
+    io.counter.connectExternalCounter(CounterExternal.RDMA_BYTES_REC, total_bytes_read)
+
+    if (use_firesim_simulation_counters) {
+      PerfCounter(state =/= s_idle, "rdma_active_cycles", "cycles during which the read dma is active")
+      PerfCounter(tl.a.ready && translate_q.io.deq.valid && io.tlb.resp.miss, "rdma_tlb_wait_cycles", "cycles during which the read dma is stalling as it waits for a TLB response")
+      PerfCounter(tl.a.valid && !tl.a.ready, "rdma_tl_wait_cycles", "cycles during which the read dma is stalling as it waits for the TileLink port to be available")
+
+      val cntr = Counter(500000)
+      when (cntr.inc()) {
+        printf(SynthesizePrintf("RDMA bytes rec: %d\n", total_bytes_read))
+      }
+    }
   }
 }
 
@@ -319,7 +337,8 @@ class StreamWriteRequest(val dataWidth: Int, val maxBytes: Int)(implicit p: Para
 }
 
 class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: Int, dataWidth: Int, aligned_to: Int,
-                                          inputType: T, block_cols: Int, use_tlb_register_filter: Boolean)
+                                          inputType: T, block_cols: Int, use_tlb_register_filter: Boolean,
+                                          use_firesim_simulation_counters: Boolean)
                   (implicit p: Parameters) extends LazyModule {
   val node = TLHelper.makeClientNode(
     name = "stream-writer", sourceId = IdRange(0, nXacts))
@@ -584,14 +603,36 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
     // Performance counter
     CounterEventIO.init(io.counter)
     io.counter.connectEventSignal(CounterEvent.WDMA_ACTIVE_CYCLE, state =/= s_idle)
-    val bytes_sent = RegInit(0.U(CounterExternal.EXTERNAL_WIDTH.W))
-    io.counter.connectExternalCounter(CounterExternal.WDMA_BYTES_SENT, bytes_sent)
-    when (io.counter.external_reset) {
-      bytes_sent := 0.U
-    } .elsewhen (tl.d.fire()) {
-      bytes_sent := bytes_sent + 1.U << tl.d.bits.size
-    }
     io.counter.connectEventSignal(CounterEvent.WDMA_TLB_WAIT_CYCLES, io.tlb.resp.miss)
     io.counter.connectEventSignal(CounterEvent.WDMA_TL_WAIT_CYCLES, tl.a.valid && !tl.a.ready)
+
+    // External counters
+    val total_bytes_sent = RegInit(0.U(CounterExternal.EXTERNAL_WIDTH.W))
+    when (tl.d.fire()) {
+      total_bytes_sent := total_bytes_sent + (1.U << tl.d.bits.size)
+    }
+
+    val total_latency = RegInit(0.U(CounterExternal.EXTERNAL_WIDTH.W))
+    total_latency := total_latency + PopCount(xactBusy)
+
+    when (io.counter.external_reset) {
+      total_bytes_sent := 0.U
+      total_latency := 0.U
+    }
+
+    io.counter.connectExternalCounter(CounterExternal.WDMA_BYTES_SENT, total_bytes_sent)
+    io.counter.connectExternalCounter(CounterExternal.WDMA_TOTAL_LATENCY, total_latency)
+
+    if (use_firesim_simulation_counters) { // FireSim instrumentation is only emitted when this flag is set
+      PerfCounter(state =/= s_idle, "wdma_active_cycles", "cycles during which the write dma is active")
+      PerfCounter(tl.a.ready && translate_q.io.deq.valid && io.tlb.resp.miss, "wdma_tlb_wait_cycles", "cycles during which the write dma is stalling as it waits for a TLB response")
+      PerfCounter(tl.a.valid && !tl.a.ready, "wdma_tl_wait_cycles", "cycles during which the write dma is stalling as it waits for the TileLink port to be available")
+
+      val cntr = Counter(500000) // incremented every cycle; each wrap triggers one printout of the running totals
+      when(cntr.inc()) {
+        printf(SynthesizePrintf("WDMA bytes sent: %d\n", total_bytes_sent))
+        printf(SynthesizePrintf("WDMA total latency: %d\n", total_latency))
+      }
+    }
   }
 }
diff --git a/src/main/scala/gemmini/DSEConfigs.scala b/src/main/scala/gemmini/DSEConfigs.scala
index 1c9f3b2f..37fc70f4 100644
--- a/src/main/scala/gemmini/DSEConfigs.scala
+++ b/src/main/scala/gemmini/DSEConfigs.scala
@@ -21,30 +21,29 @@ object DSEBaseConfig {
     ld_queue_length = 4,
     st_queue_length = 2,
     ex_queue_length = 8,
-    rob_full_entries = 8,
-    rob_partial_entries = 1,
+    reservation_station_full_entries = 8,
+    reservation_station_partial_entries = 1,
 
     sp_banks = 4, // TODO support one-bank designs
     acc_banks = 1,
     acc_singleported = false,
-    num_acc_sub_banks = -1,
+    acc_sub_banks = -1,
     sp_capacity = CapacityInKilobytes(64),
     sp_singleported = false,
     shifter_banks = 1, // TODO add separate parameters for left and up shifter banks
     dataflow = Dataflow.OS,
     acc_capacity = CapacityInKilobytes(16),
-    mem_pipeline = 1,
+    spad_read_delay = 1,
     dma_maxbytes = 128, // TODO get this from cacheblockbytes
     dma_buswidth = 128, // TODO get this from SystemBusKey
     aligned_to = 16,
-    hasIm2col = false,
     inputType = SInt(8.W),
-    outputType = SInt(19.W),
+    spatialArrayOutputType = SInt(19.W),
     accType = SInt(32.W),
     mvin_scale_args = None,
     mvin_scale_acc_args = None,
     mvin_scale_shared = false,
-    acc_scale_args = ScaleArguments(
+    acc_scale_args = Some(ScaleArguments(
       (t: SInt, u: UInt) => {
         // The equation we use can be found here: https://riscv.github.io/documents/riscv-v-spec/#_vector_fixed_point_rounding_mode_register_vxrm
 
@@ -56,7 +55,7 @@ object DSEBaseConfig {
         val r = (point_five & (zeros | ones_digit)).asBool()
 
         (t >> u).asSInt() + Mux(r, 1.S, 0.S)
-      }, 0, UInt(8.W), -1),
+      }, 0, UInt(8.W), -1)),
     acc_read_full_width = true,
     acc_read_small_width = true,
     use_dedicated_tl_port = false,
@@ -71,10 +70,14 @@ object DSEBaseConfig {
 
     tlb_size = 4,
     use_tlb_register_filter = true,
-    max_in_flight_reqs = 16,
+    max_in_flight_mem_reqs = 16,
 
     mesh_output_delay = 1,
 
+    has_training_convs = false,
+    has_max_pool = true,
+    has_nonlinear_activations = true,
+
     num_counter = 8,
   )
 }
@@ -84,9 +87,9 @@ object DSEConfigs{
   val baseConfig = base.copy(headerFileName = "gemmini_params_dse1.h")
   val wsOnlyConfig = baseConfig.copy(dataflow = Dataflow.WS, headerFileName = "gemmini_params_dse2.h")
   val bothDataflowsConfig = baseConfig.copy(dataflow = Dataflow.BOTH, headerFileName = "gemmini_params_dse3.h")
-  val highBitwidthConfig = baseConfig.copy(inputType = SInt(32.W), outputType = SInt(32.W),
+  val highBitwidthConfig = baseConfig.copy(inputType = SInt(32.W), spatialArrayOutputType = SInt(32.W),
     headerFileName = "gemmini_params_dse4.h")
-  val largerDimConfig = baseConfig.copy(meshRows = 32, meshColumns = 32, outputType = SInt(20.W),
+  val largerDimConfig = baseConfig.copy(meshRows = 32, meshColumns = 32, spatialArrayOutputType = SInt(20.W),
     headerFileName = "gemmini_params_dse5.h")
   val fullyCombinationalConfig = baseConfig.copy(tileRows = 16, tileColumns = 16, meshRows = 1, meshColumns = 1,
     headerFileName = "gemmini_params_dse6.h")
@@ -97,7 +100,7 @@ object DSEConfigs{
   val pnr16Config = baseConfig.copy(sp_capacity = CapacityInKilobytes(256), acc_capacity = CapacityInKilobytes(64),
     dataflow = Dataflow.BOTH, headerFileName = "gemmini_params_pnr16.h")
   val pnr32Config = baseConfig.copy(sp_capacity = CapacityInKilobytes(512), acc_capacity = CapacityInKilobytes(128),
-    meshRows = 32, meshColumns = 32, outputType = SInt(20.W), dataflow = Dataflow.BOTH,
+    meshRows = 32, meshColumns = 32, spatialArrayOutputType = SInt(20.W), dataflow = Dataflow.BOTH,
     headerFileName = "gemmini_params_pnr32.h")
 }
 
diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala
index db9a894e..9d1cf094 100644
--- a/src/main/scala/gemmini/ExecuteController.scala
+++ b/src/main/scala/gemmini/ExecuteController.scala
@@ -6,6 +6,7 @@ import chisel3.util._
 import GemminiISA._
 import Util._
 import freechips.rocketchip.config.Parameters
+import midas.targetutils.PerfCounter
 
 // TODO do we still need to flush when the dataflow is weight stationary? Won't the result just keep travelling through on its own?
 class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: Int, config: GemminiArrayConfig[T, U, V])
@@ -28,7 +29,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
 
     val acc = new Bundle {
       val read_req = Vec(acc_banks, Decoupled(new AccumulatorReadReq(
-          acc_bank_entries, log2Up(accType.getWidth), acc_scale_args.multiplicand_t
+          acc_bank_entries, log2Up(accType.getWidth), acc_scale_t
       )))
 
       val read_resp = Flipped(Vec(acc_banks, Decoupled(new AccumulatorScaleResp(
@@ -116,9 +117,9 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
   val im2col_turn = WireInit(0.U(9.W))
 
   val in_shift = Reg(UInt(log2Up(accType.getWidth).W))
-  val acc_scale = Reg(acc_scale_args.multiplicand_t)
+  val acc_scale = Reg(acc_scale_t)
   val relu6_shift = Reg(UInt(log2Up(accType.getWidth).W))
-  val activation = Reg(UInt(2.W)) // TODO magic number
+  val activation = if (has_nonlinear_activations) Reg(UInt(2.W)) else Activation.NONE // TODO magic number
   val a_transpose = Reg(Bool())
   val bd_transpose = Reg(Bool())
   val config_initialized = RegInit(false.B)
@@ -136,7 +137,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
     "Too many inputs are being fed into the single transposer we have")
 
   //fix by input
-  val im2col_en = hasIm2col.B && weight_stride =/= 0.U
+  val im2col_en = config.hasIm2Col.B && weight_stride =/= 0.U
 
   // SRAM addresses of matmul operands
   val a_address_rs1 = rs1s(a_address_place).asTypeOf(local_addr_t)
@@ -178,7 +179,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
   val pending_completed_rob_ids = Reg(Vec(2, UDValid(UInt(log2Up(rob_entries).W))))
 
   // Instantiate a queue which queues up signals which must be fed into the mesh
-  val mesh_cntl_signals_q = Module(new Queue(new ComputeCntlSignals, mem_pipeline+1,
+  val mesh_cntl_signals_q = Module(new Queue(new ComputeCntlSignals, spad_read_delay+1,
     pipe=true))
 
   val cntl_ready = mesh_cntl_signals_q.io.enq.ready
@@ -186,7 +187,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
   val cntl = mesh_cntl_signals_q.io.deq.bits
 
   // Instantiate the actual mesh
-  val mesh = Module(new MeshWithDelays(inputType, outputType, accType, mesh_tag, dataflow, pe_latency, mesh_output_delay,
+  val mesh = Module(new MeshWithDelays(inputType, spatialArrayOutputType, accType, mesh_tag, dataflow, pe_latency, mesh_output_delay,
     tileRows, tileColumns, meshRows, meshColumns, shifter_banks, shifter_banks))
 
   mesh.io.a.valid := false.B
@@ -547,9 +548,11 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
             val set_only_strides = config_ex_rs1.set_only_strides
 
             when (!set_only_strides) {
-              activation := config_ex_rs1.activation
+              if (has_nonlinear_activations) {
+                activation := config_ex_rs1.activation
+              }
               in_shift := config_ex_rs2.in_shift
-              acc_scale := rs1s(0)(xLen - 1, 32).asTypeOf(acc_scale_args.multiplicand_t) // TODO magic number
+              acc_scale := rs1s(0)(xLen - 1, 32).asTypeOf(acc_scale_t) // TODO magic number
               relu6_shift := config_ex_rs2.relu6_shift
               a_transpose := config_ex_rs1.a_transpose
               bd_transpose := config_ex_rs1.b_transpose
@@ -1025,4 +1028,14 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
     !(!cntl.b_fire || mesh.io.b.fire() || !mesh.io.b.ready) && !cntl.b_read_from_acc)
   io.counter.connectEventSignal(CounterEvent.SCRATCHPAD_D_WAIT_CYCLE, 
     !(!cntl.d_fire || mesh.io.d.fire() || !mesh.io.d.ready) && !cntl.d_read_from_acc)
+
+  if (use_firesim_simulation_counters) {
+    val ex_flush_cycle = control_state === flushing || control_state === flush
+    val ex_preload_haz_cycle = cmd.valid(0) && DoPreloads(0) && cmd.valid(1) && raw_hazard_pre
+    val ex_mulpre_haz_cycle = cmd.valid(0) && DoPreloads(1) && cmd.valid(1) && DoComputes(0) && cmd.valid(2) && raw_hazard_mulpre
+
+    PerfCounter(ex_flush_cycle, "ex_flush_cycle", "cycles during which the ex controller is flushing the spatial array")
+    PerfCounter(ex_preload_haz_cycle, "ex_preload_haz_cycle", "cycles during which the execute controller is stalling preloads due to hazards")
+    PerfCounter(ex_mulpre_haz_cycle, "ex_mulpre_haz_cycle", "cycles during which the execute controller is stalling matmuls due to hazards")
+  }
 }
diff --git a/src/main/scala/gemmini/FrontendTLB.scala b/src/main/scala/gemmini/FrontendTLB.scala
index f5643c92..50c393b5 100644
--- a/src/main/scala/gemmini/FrontendTLB.scala
+++ b/src/main/scala/gemmini/FrontendTLB.scala
@@ -7,10 +7,11 @@ import freechips.rocketchip.config.Parameters
 import freechips.rocketchip.rocket._
 import freechips.rocketchip.tile.{CoreBundle, CoreModule}
 import freechips.rocketchip.tilelink.TLEdgeOut
-import freechips.rocketchip.util.InOrderArbiter
 
 import Util._
 
+import midas.targetutils.PerfCounter
+
 class DecoupledTLBReq(val lgMaxSize: Int)(implicit p: Parameters) extends CoreBundle {
   val tlb_req = new TLBReq(lgMaxSize)
   val status = new MStatus
@@ -25,7 +26,7 @@ class TLBExceptionIO extends Bundle {
 }
 
 // TODO can we make TLB hits only take one cycle?
-class DecoupledTLB(entries: Int, maxSize: Int)(implicit edge: TLEdgeOut, p: Parameters)
+class DecoupledTLB(entries: Int, maxSize: Int, use_firesim_simulation_counters: Boolean)(implicit edge: TLEdgeOut, p: Parameters)
   extends CoreModule {
 
   val lgMaxSize = log2Ceil(maxSize)
@@ -68,6 +69,12 @@ class DecoupledTLB(entries: Int, maxSize: Int)(implicit edge: TLEdgeOut, p: Para
   io.counter.connectEventSignal(CounterEvent.DMA_TLB_HIT_REQ, RegNext(io.req.fire()) && !tlb.io.resp.miss)
   io.counter.connectEventSignal(CounterEvent.DMA_TLB_TOTAL_REQ, io.req.fire())
   io.counter.connectEventSignal(CounterEvent.DMA_TLB_MISS_CYCLE, tlb.io.resp.miss)
+
+  if (use_firesim_simulation_counters) {
+    PerfCounter(RegNext(io.req.fire()) && !tlb.io.resp.miss, "tlb_hits", "total number of tlb hits")
+    PerfCounter(io.req.fire(), "tlb_reqs", "total number of tlb reqs")
+    PerfCounter(tlb.io.resp.miss, "tlb_miss_cycles", "total number of cycles where the tlb is resolving a miss")
+  }
 }
 
 class FrontendTLBIO(implicit p: Parameters) extends CoreBundle {
@@ -77,7 +84,7 @@ class FrontendTLBIO(implicit p: Parameters) extends CoreBundle {
   val resp = Flipped(new TLBResp)
 }
 
-class FrontendTLB(nClients: Int, entries: Int, maxSize: Int)
+class FrontendTLB(nClients: Int, entries: Int, maxSize: Int, use_tlb_register_filter: Boolean, use_firesim_simulation_counters: Boolean)
                  (implicit edge: TLEdgeOut, p: Parameters) extends CoreModule {
   val io = IO(new Bundle {
     val clients = Flipped(Vec(nClients, new FrontendTLBIO))
@@ -88,7 +95,7 @@ class FrontendTLB(nClients: Int, entries: Int, maxSize: Int)
 
   val lgMaxSize = log2Ceil(coreDataBytes)
   val tlbArb = Module(new RRArbiter(new DecoupledTLBReq(lgMaxSize), nClients))
-  val tlb = Module(new DecoupledTLB(entries, maxSize))
+  val tlb = Module(new DecoupledTLB(entries, maxSize, use_firesim_simulation_counters))
   tlb.io.req.valid := tlbArb.io.out.valid
   tlb.io.req.bits := tlbArb.io.out.bits
   tlbArb.io.out.ready := true.B
@@ -123,6 +130,11 @@ class FrontendTLB(nClients: Int, entries: Int, maxSize: Int)
     } .otherwise {
       client.resp := tlb.io.resp
     }
+
+    // If we're not using the TLB filter register, then we set this value to always be false
+    if (!use_tlb_register_filter) {
+      last_translated_valid := false.B
+    }
   }
 
   io.counter.collect(tlb.io.counter)
diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala
index 3067f00d..45c481ce 100644
--- a/src/main/scala/gemmini/GemminiConfigs.scala
+++ b/src/main/scala/gemmini/GemminiConfigs.scala
@@ -15,58 +15,74 @@ case class ScaleArguments[T <: Data, U <: Data](scale_func: (T, U) => T, latency
                                                 identity: String="0", c_str: String="ROUNDING_RIGHT_SHIFT(x, scale)")
 
 case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
-                                                                             opcodes: OpcodeSet,
-                                                                             tileRows: Int,
-                                                                             tileColumns: Int,
-                                                                             meshRows: Int,
-                                                                             meshColumns: Int,
-                                                                             ld_queue_length: Int,
-                                                                             st_queue_length: Int,
-                                                                             ex_queue_length: Int,
-                                                                             rob_full_entries: Int,
-                                                                             rob_partial_entries: Int,
-                                                                             sp_banks: Int, // TODO support one-bank designs
-                                                                             sp_singleported: Boolean,
-                                                                             sp_capacity: GemminiMemCapacity,
-                                                                             acc_banks: Int,
-                                                                             acc_singleported: Boolean,
-                                                                             num_acc_sub_banks: Int,
-                                                                             acc_capacity: GemminiMemCapacity,
-                                                                             shifter_banks: Int,
-                                                                             dataflow: Dataflow.Value,
-                                                                             mem_pipeline: Int,
-                                                                             dma_maxbytes: Int,
-                                                                             dma_buswidth: Int,
-                                                                             aligned_to: Int, // TODO we should align to inputType and accType instead
                                                                              inputType: T,
-                                                                             outputType: T,
+                                                                             spatialArrayOutputType: T,
                                                                              accType: T,
-                                                                             mvin_scale_args: Option[ScaleArguments[T, U]],
-                                                                             mvin_scale_acc_args: Option[ScaleArguments[T, U]],
-                                                                             mvin_scale_shared: Boolean,
-                                                                             acc_scale_args: ScaleArguments[T, V],
-                                                                             hasIm2col: Boolean,
-                                                                             pe_latency: Int,
-                                                                             acc_read_full_width: Boolean,
-                                                                             acc_read_small_width: Boolean,
-                                                                             use_dedicated_tl_port: Boolean,
-                                                                             // enable_a_transpose: Boolean,
-                                                                             // enable_b_transpose: Boolean,
 
-                                                                             tlb_size: Int,
-                                                                             use_tlb_register_filter: Boolean,
-                                                                             max_in_flight_reqs: Int,
+                                                                             opcodes: OpcodeSet = OpcodeSet.custom3,
 
-                                                                             ex_read_from_spad: Boolean,
-                                                                             ex_read_from_acc: Boolean,
-                                                                             ex_write_to_spad: Boolean,
-                                                                             ex_write_to_acc: Boolean,
+                                                                             dataflow: Dataflow.Value = Dataflow.BOTH,
 
-                                                                             hardcode_d_to_garbage_addr: Boolean,
+                                                                             tileRows: Int = 1,
+                                                                             tileColumns: Int = 1,
+                                                                             meshRows: Int = 16,
+                                                                             meshColumns: Int = 16,
 
-                                                                             mesh_output_delay: Int,
+                                                                             ld_queue_length: Int = 8,
+                                                                             st_queue_length: Int = 2,
+                                                                             ex_queue_length: Int = 8,
 
-                                                                             num_counter: Int,
+                                                                             reservation_station_full_entries: Int = 16,
+                                                                             reservation_station_partial_entries: Int = 8,
+
+                                                                             sp_banks: Int = 4, // TODO support one-bank designs
+                                                                             sp_singleported: Boolean = false,
+                                                                             sp_capacity: GemminiMemCapacity = CapacityInKilobytes(256),
+                                                                             spad_read_delay: Int = 4,
+
+                                                                             acc_banks: Int = 2,
+                                                                             acc_singleported: Boolean = false,
+                                                                             acc_sub_banks: Int = -1,
+                                                                             acc_capacity: GemminiMemCapacity = CapacityInKilobytes(64),
+
+                                                                             dma_maxbytes: Int = 64, // TODO get this from cacheblockbytes
+                                                                             dma_buswidth: Int = 128, // TODO get this from SystemBusKey
+
+                                                                             shifter_banks: Int = 1, // TODO add separate parameters for left and up shifter banks
+
+                                                                             aligned_to: Int = 1, // TODO we should align to inputType and accType instead
+
+                                                                             mvin_scale_args: Option[ScaleArguments[T, U]] = None,
+                                                                             mvin_scale_acc_args: Option[ScaleArguments[T, U]] = None,
+                                                                             mvin_scale_shared: Boolean = false,
+                                                                             acc_scale_args: Option[ScaleArguments[T, V]] = None,
+
+                                                                             pe_latency: Int = 0,
+
+                                                                             acc_read_full_width: Boolean = true,
+                                                                             acc_read_small_width: Boolean = true,
+                                                                             use_dedicated_tl_port: Boolean = true,
+
+                                                                             tlb_size: Int = 4,
+                                                                             use_tlb_register_filter: Boolean = true,
+                                                                             max_in_flight_mem_reqs: Int = 16,
+
+                                                                             ex_read_from_spad: Boolean = true,
+                                                                             ex_read_from_acc: Boolean = true,
+                                                                             ex_write_to_spad: Boolean = true,
+                                                                             ex_write_to_acc: Boolean = true,
+
+                                                                             hardcode_d_to_garbage_addr: Boolean = false,
+
+                                                                             mesh_output_delay: Int = 1,
+
+                                                                             num_counter: Int = 8,
+
+                                                                             has_training_convs: Boolean = true,
+                                                                             has_max_pool: Boolean = true,
+                                                                             has_nonlinear_activations: Boolean = true,
+
+                                                                             use_firesim_simulation_counters: Boolean = false,
 
                                                                              headerFileName: String = "gemmini_params.h"
                                                        ) {
@@ -79,7 +95,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
     case CapacityInKilobytes(kb) => kb * 1024 * 8 / (acc_banks * meshColumns * tileColumns * accType.getWidth)
     case CapacityInMatrices(ms) => ms * meshRows * tileRows / acc_banks
   }
-  require (!acc_singleported || (num_acc_sub_banks <= 4 && isPow2(num_acc_sub_banks)))
+  require (!acc_singleported || (acc_sub_banks <= 4 && isPow2(acc_sub_banks)))
 
   val local_addr_t = new LocalAddr(sp_banks, sp_bank_entries, acc_banks, acc_bank_entries)
 
@@ -93,13 +109,44 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
     case None => Bool() // TODO replace this with UInt(0.W)
   }
 
-  val acc_scale_t = acc_scale_args.multiplicand_t
-
   val mvin_scale_t_bits = mvin_scale_t.getWidth max mvin_scale_acc_t.getWidth
   val mvin_scale_same = (mvin_scale_args.isEmpty && mvin_scale_acc_args.isEmpty) || mvin_scale_shared
 
+  // If the user doesn't specify an "acc_scale_args", then for now, we will still say in the header file that
+  // acc_scale_t is Float32. TODO: don't put an acc_scale_t in the header file at all if the user doesn't specify one
+  val acc_scale_t = acc_scale_args match {
+    case Some(args) => args.multiplicand_t
+    case None => Float(8, 24)
+  }
+
   val acc_scale_t_bits = acc_scale_t.getWidth
 
+  val acc_scale_identity = acc_scale_args match {
+    case Some(args) => args.identity
+    case None => "0"
+  }
+
+  val acc_scale_c_str = acc_scale_args match {
+    case Some(args) => args.c_str
+    case None => "(x)"
+  }
+
+  val acc_scale_func = acc_scale_args match {
+    case Some(args) => args.scale_func
+    case None => (t: T, _: V) => t
+  }
+
+  val acc_scale_num_units = acc_scale_args match {
+    case Some(args) => args.num_scale_units
+    case None => -1
+  }
+
+  val acc_scale_latency = acc_scale_args match {
+    case Some(args) => args.latency
+    case None => 1
+  }
+  assert(acc_scale_latency > 0)
+
   val mvin_cols_bits = log2Up(((dma_maxbytes / (inputType.getWidth / 8)) max (meshColumns * tileColumns)) + 1)
   val mvin_rows_bits = log2Up(meshRows * tileRows + 1)
   val mvout_cols_bits = log2Up(((dma_maxbytes / (inputType.getWidth / 8)) max (meshColumns * tileColumns)) + 1)
@@ -108,6 +155,8 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
   val load_states = 3
   val block_stride_bits = 16
 
+  val hasIm2Col = false
+
   //==========================================================================
   // sanity check mesh size
   //==========================================================================
@@ -123,7 +172,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
   //==========================================================================
   // cisc-gemmini miscellaneous constants (some redundant with above)
   //==========================================================================
-  val rob_entries      = rob_full_entries + rob_partial_entries
+  val rob_entries      = reservation_station_full_entries + reservation_station_partial_entries
   val ROB_ENTRIES      = rob_entries
   val LOG2_ROB_ENTRIES = log2Up(rob_entries)
 
@@ -206,7 +255,6 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
       assert(dataType.getWidth <= 32) // Above 32 bits, we need to append UL to the number, which isn't done yet
 
       dataType match {
-        case dt: UInt => ("0", BigInt(2).pow(dt.getWidth).-(1).toString)
         case dt: SInt => ("-" + BigInt(2).pow(dt.getWidth - 1).toString, BigInt(2).pow(dt.getWidth - 1).-(1).toString)
         case dt: Float =>
           (dt.expWidth, dt.sigWidth) match {
@@ -214,13 +262,13 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
             case (11, 53) => (scala.Double.MinValue.toString, scala.Double.MaxValue.toString)
             case _ => (((Range(-1,-(dt.sigWidth),-1).map(-Math.pow(2, _)).foldLeft(-1.0)(_ + _)) * Math.pow(2, Math.pow(2, dt.expWidth - 1) - 1)).toString, ((Range(-1,-(dt.sigWidth),-1).map(Math.pow(2, _)).foldLeft(1.0)(_ + _)) * Math.pow(2, Math.pow(2, dt.expWidth - 1) - 1)).toString)
           }
-        case _ => throw new IllegalArgumentException(s"Data type $dataType is unknown")
+        case dt => ("0", BigInt(2).pow(dt.getWidth).-(1).toString)
+        // case _ => throw new IllegalArgumentException(s"Data type $dataType is unknown")
       }
     }
 
     def c_type(dataType: Data): String = {
       dataType match {
-        case dt: UInt => s"uint${dt.getWidth}_t"
         case dt: SInt => s"int${dt.getWidth}_t"
         case dt: Float =>
           (dt.expWidth, dt.sigWidth) match {
@@ -228,16 +276,17 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
             case (11, 53) => "double"
             case _ => s"uint" + (Math.pow(2, Math.ceil(Math.log(dt.expWidth + dt.sigWidth)/Math.log(2.0)))).toInt.toString + s"_t"
           }
-        case _ => throw new IllegalArgumentException(s"Data type $dataType is unknown")
+        case dt => s"uint${dt.getWidth}_t"
       }
     }
 
     def full_c_type(dataType: Data): String = {
       dataType match {
-        case dt: UInt => "uint64_t"
-        case dt: SInt => "int64_t"
-        case dt: Float => "double"
-        case _ => throw new IllegalArgumentException(s"Data type $dataType is unknown")
+        case _: UInt => "uint64_t"
+        case _: SInt => "int64_t"
+        case _: Float => "double"
+        case _ => "uint64_t"
+        // case _ => throw new IllegalArgumentException(s"Data type $dataType is unknown")
       }
     }
 
@@ -246,7 +295,6 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
     // assert(Set(8, 16, 32, 64).contains(outputType.getWidth))
     assert(Set(8, 16, 32, 64).contains(accType.getWidth))
 
-
     val header = new StringBuilder()
     header ++= s"#ifndef $guard\n"
     header ++= s"#define $guard\n\n"
@@ -311,7 +359,6 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
       header ++= "#define HAS_MVIN_SCALE\n"
       header ++= s"typedef ${c_type(mvin_scale_args.get.multiplicand_t)} scale_t;\n"
       header ++= s"typedef ${c_type(UInt(mvin_scale_args.get.multiplicand_t.getWidth.W))} scale_t_bits;\n\n"
-
     } else {
       header ++= s"typedef int32_t scale_t;\n"
       header ++= s"typedef uint32_t scale_t_bits;\n\n"
@@ -321,14 +368,13 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
       header ++= "#define HAS_MVIN_ACC_SCALE\n"
       header ++= s"typedef ${c_type(mvin_scale_acc_args.get.multiplicand_t)} scale_acc_t;\n"
       header ++= s"typedef ${c_type(UInt(mvin_scale_acc_args.get.multiplicand_t.getWidth.W))} scale_acc_t_bits;\n\n"
-
     } else {
       header ++= s"typedef int32_t scale_acc_t;\n"
       header ++= s"typedef uint32_t scale_acc_t_bits;\n\n"
     }
 
-    header ++= s"typedef ${c_type(acc_scale_args.multiplicand_t)} acc_scale_t;\n"
-    header ++= s"typedef ${c_type(UInt(acc_scale_args.multiplicand_t.getWidth.W))} acc_scale_t_bits;\n\n"
+    header ++= s"typedef ${c_type(acc_scale_t)} acc_scale_t;\n"
+    header ++= s"typedef ${c_type(UInt(acc_scale_t_bits.W))} acc_scale_t_bits;\n\n"
 
     header ++= s"#define row_align(blocks) __attribute__((aligned(blocks*DIM*sizeof(elem_t))))\n"
     header ++= s"#define row_align_acc(blocks) __attribute__((aligned(blocks*DIM*sizeof(acc_t))))\n\n"
@@ -338,7 +384,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
       case None => "0"
     }
     header ++= s"#define MVIN_SCALE_IDENTITY $mvin_scale_identity\n\n"
-    header ++= s"#define ACC_SCALE_IDENTITY ${acc_scale_args.identity}\n\n"
+    header ++= s"#define ACC_SCALE_IDENTITY ${acc_scale_identity}\n\n"
 
     if (inputType.isInstanceOf[Float]) {
       header ++= """#define ROUNDING_RIGHT_SHIFT(x, shift) \
@@ -380,7 +426,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
 
     header ++= """#define ACC_SCALE(x, scale) \
 """
-    header ++= s"    ${acc_scale_args.c_str}"
+    header ++= s"    ${acc_scale_c_str}"
     header ++= "\n\n"
 
     if (mvin_scale_args.isDefined) {
@@ -388,6 +434,10 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
         s"""#define MVIN_SCALE(x, scale) \\
     ${mvin_scale_args.get.c_str}"""
       header ++= "\n\n"
+    } else {
+      header ++=
+        s"""#define MVIN_SCALE(x, scale) (x)"""
+      header ++= "\n\n"
     }
 
     if (mvin_scale_acc_args.isDefined) {
@@ -395,12 +445,16 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
         s"""#define MVIN_SCALE_ACC(x, scale) \\
     ${mvin_scale_acc_args.get.c_str}"""
       header ++= "\n\n"
+    } else {
+      header ++=
+        s"""#define MVIN_SCALE_ACC(x, scale) (x)"""
+      header ++= "\n\n"
     }
 
-    if (acc_scale_args.multiplicand_t.isInstanceOf[Float]) {
+    if (acc_scale_t.isInstanceOf[Float]) {
       header ++= "#define ACC_SCALE_T_IS_FLOAT\n"
-      header ++= s"#define ACC_SCALE_EXP_BITS ${acc_scale_args.multiplicand_t.asInstanceOf[Float].expWidth}\n"
-      header ++= s"#define ACC_SCALE_SIG_BITS ${acc_scale_args.multiplicand_t.asInstanceOf[Float].sigWidth}\n\n"
+      header ++= s"#define ACC_SCALE_EXP_BITS ${acc_scale_t.asInstanceOf[Float].expWidth}\n"
+      header ++= s"#define ACC_SCALE_SIG_BITS ${acc_scale_t.asInstanceOf[Float].sigWidth}\n\n"
     }
 
     if (acc_read_small_width)
@@ -436,5 +490,4 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
       s"$default_directory/$headerFileName"
     }
   }
-
 }
diff --git a/src/main/scala/gemmini/Im2Col.scala b/src/main/scala/gemmini/Im2Col.scala
index 5088712c..2c7f8cbf 100644
--- a/src/main/scala/gemmini/Im2Col.scala
+++ b/src/main/scala/gemmini/Im2Col.scala
@@ -135,7 +135,7 @@ class Im2Col[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V
 
   val im2col_en_d = RegNext(im2col_en)
 
-  val sram_read_signals_q = Module(new Queue(new im2colRowSignals, mem_pipeline+1,
+  val sram_read_signals_q = Module(new Queue(new im2colRowSignals, spad_read_delay+1,
     pipe=true))
 
   io.sram_reads.foreach { sr =>
@@ -444,7 +444,7 @@ class Im2Col[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V
   sram_read_signals_q.io.enq.bits.sram_bank := im2col_spad_bank
 
   sram_read_signals_q.io.deq.ready := true.B//sram_resp_valid
-  if(!hasIm2col){ //to default values
+  if(!config.hasIm2Col){ //to default values
     io.resp.valid := false.B
     io.req.ready := true.B
     io.sram_reads.foreach(_.req.valid := false.B)
diff --git a/src/main/scala/gemmini/LoadController.scala b/src/main/scala/gemmini/LoadController.scala
index 1c8b0ced..89f7be7c 100644
--- a/src/main/scala/gemmini/LoadController.scala
+++ b/src/main/scala/gemmini/LoadController.scala
@@ -6,6 +6,7 @@ import chisel3.util._
 import GemminiISA._
 import Util._
 import freechips.rocketchip.config.Parameters
+import midas.targetutils.PerfCounter
 
 // TODO we need to check for WAW errors here
 // TODO deal with errors when reading scratchpad responses
@@ -76,7 +77,7 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
   cmd.ready := false.B
 
   // Command tracker instantiation
-  val nCmds = (max_in_flight_reqs / block_rows) + 1
+  val nCmds = (max_in_flight_mem_reqs / block_rows) + 1
 
   val deps_t = new Bundle {
     val rob_id = UInt(log2Up(rob_entries).W)
@@ -134,7 +135,6 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
   switch (control_state) {
     is (waiting_for_command) {
       when (cmd.valid) {
-        // when(DoConfig && !cmd_tracker.io.cmd_completed.valid) {
         when(DoConfig) {
           stride := config_stride
           scale := config_scale
@@ -170,4 +170,11 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
   io.counter.connectEventSignal(CounterEvent.LOAD_ACTIVE_CYCLE, control_state === sending_rows)
   io.counter.connectEventSignal(CounterEvent.LOAD_DMA_WAIT_CYCLE, control_state === waiting_for_dma_req_ready)
   io.counter.connectEventSignal(CounterEvent.LOAD_SCRATCHPAD_WAIT_CYCLE, io.dma.req.valid && !io.dma.req.ready)
+
+  if (use_firesim_simulation_counters) {
+    PerfCounter(io.dma.req.valid && !io.dma.req.ready, "load_dma_wait_cycle", "cycles during which load controller is waiting for DMA to be available")
+  }
+
+  // Assertions
+  assert(!(cmd_tracker.io.alloc.fire() && cmd_tracker.io.alloc.bits.bytes_to_read === 0.U), "A single mvin instruction must load more than 0 bytes")
 }
diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala
index 749f00fe..47cd5a39 100644
--- a/src/main/scala/gemmini/LoopConv.scala
+++ b/src/main/scala/gemmini/LoopConv.scala
@@ -274,8 +274,9 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
     req.trans_input_3120 -> (req.dram_addr +& (((ich * in_dim * in_dim +& irow*in_dim +& icol) * batches +& b) * (input_w/8).U).asUInt())
   ))
   val spad_addr = Mux(req.trans_input_3120,
-    req.addr_start.zext() +& (b / block_size.S) * input_spad_stride +& ich * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample),
-    req.addr_start.zext() +& (ich / block_size.S) * input_spad_stride +& b * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample))
+    // To prevent Verilator errors, we replace some "/ block_size.U" calls here with ">> log2Up(block_size)"
+    req.addr_start.zext() +& (b >> log2Up(block_size)) * input_spad_stride +& ich * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample),
+    req.addr_start.zext() +& (ich >> log2Up(block_size)) * input_spad_stride +& b * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample))
 
   // Sizes
   val block_size_downsampled = (block_size.U << req.downsample).asUInt().zext()
@@ -1134,7 +1135,8 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I
   max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int,
   config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2, mvout_rs2_t: MvoutRs2,
   config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs,
-  compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs)
+  compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs,
+  has_training_convs: Boolean, has_max_pool: Boolean)
   (implicit p: Parameters) extends Module {
   val large_iterator_bitwidth = 16
   val small_iterator_bitwidth = 16 // 8
@@ -1239,9 +1241,9 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I
 
       is (LOOP_CONV_WS_CONFIG_2) {
         loop_being_configured.outer_bounds.kernel_dim := cmd.bits.rs1(63, 48)
-        loop_being_configured.outer_bounds.pool_size := cmd.bits.rs1(47, 32)
-        loop_being_configured.outer_bounds.pool_stride := cmd.bits.rs1(31, 16)
-        loop_being_configured.outer_bounds.pool_padding := cmd.bits.rs1(15, 0)
+        loop_being_configured.outer_bounds.pool_size := (if (!has_max_pool) 1.U else cmd.bits.rs1(47, 32))
+        loop_being_configured.outer_bounds.pool_stride := (if (!has_max_pool) 1.U else cmd.bits.rs1(31, 16))
+        loop_being_configured.outer_bounds.pool_padding := (if (!has_max_pool) 0.U else cmd.bits.rs1(15, 0))
 
         loop_being_configured.inner_bounds.batches := cmd.bits.rs2(63, 48)
         loop_being_configured.inner_bounds.porows := cmd.bits.rs2(47, 32)
@@ -1285,17 +1287,19 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I
 
       is (LOOP_CONV_WS) {
         loop_being_configured.no_bias := cmd.bits.rs1(0)
-        loop_being_configured.wrot180 := cmd.bits.rs1(1)
-        loop_being_configured.trans_output_1203 := cmd.bits.rs1(2)
-        loop_being_configured.trans_weight_1203 := cmd.bits.rs1(3)
-        loop_being_configured.trans_weight_0132 := cmd.bits.rs1(4)
-        loop_being_configured.trans_input_3120 := cmd.bits.rs1(5)
 
-        loop_being_configured.no_pool := cmd.bits.rs2(0)
-        loop_being_configured.downsample := cmd.bits.rs2(1)
-        loop_being_configured.input_dilated := cmd.bits.rs2(2)
+        loop_being_configured.wrot180 := has_training_convs.B && cmd.bits.rs1(1)
+        loop_being_configured.input_dilated := has_training_convs.B && cmd.bits.rs2(2)
+        loop_being_configured.trans_output_1203 := has_training_convs.B && cmd.bits.rs1(2)
+        loop_being_configured.trans_weight_1203 := has_training_convs.B && cmd.bits.rs1(3)
+        loop_being_configured.trans_weight_0132 := has_training_convs.B && cmd.bits.rs1(4)
+        loop_being_configured.trans_input_3120 := has_training_convs.B && cmd.bits.rs1(5)
+
+        loop_being_configured.no_pool := !has_max_pool.B || cmd.bits.rs2(0)
         loop_being_configured.activation := cmd.bits.rs2(4,3)
 
+        loop_being_configured.downsample := cmd.bits.rs2(1)
+
         loop_being_configured.configured := true.B
 
         // assert(!loop_being_configured.input_dilated || loop_being_configured.outer_bounds.stride === 1.U)
@@ -1460,12 +1464,14 @@ object LoopConv {
             max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int,
             config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2,
             mvout_rs2_t: MvoutRs2, config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs,
-            compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs)
+            compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, has_training_convs: Boolean, has_max_pool: Boolean)
            (implicit p: Parameters): Tuple2[DecoupledIO[RoCCCommand], Bool] = {
+
     val mod = Module(new LoopConv(block_size, coreMaxAddrBits, rob_size, max_lds, max_exs, max_sts,
       max_addr, max_acc_addr, input_w, acc_w, dma_max_bytes,
       config_mvin_rs1_t, mvin_rs2_t, config_mvout_rs2_t, mvout_rs2_t, config_ex_rs1_t, preload_rs1_t, preload_rs2_t,
-      compute_rs1_t, compute_rs2_t))
+      compute_rs1_t, compute_rs2_t, has_training_convs, has_max_pool))
+
     mod.io.in <> in
     mod.io.ld_utilization := ld_utilization
     mod.io.st_utilization := st_utilization
diff --git a/src/main/scala/gemmini/ROB.scala b/src/main/scala/gemmini/ReservationStation.scala
similarity index 87%
rename from src/main/scala/gemmini/ROB.scala
rename to src/main/scala/gemmini/ReservationStation.scala
index 4ee23f6a..929685f6 100644
--- a/src/main/scala/gemmini/ROB.scala
+++ b/src/main/scala/gemmini/ReservationStation.scala
@@ -8,8 +8,12 @@ import freechips.rocketchip.util.PlusArg
 import GemminiISA._
 import Util._
 
+import midas.targetutils.PerfCounter
+import midas.targetutils.SynthesizePrintf
+
+
 // TODO unify this class with GemminiCmdWithDeps
-class ROBIssue[T <: Data](cmd_t: T, rob_entries: Int) extends Bundle {
+class ReservationStationIssue[T <: Data](cmd_t: T, rob_entries: Int) extends Bundle {
   val valid = Output(Bool())
   val ready = Input(Bool())
   val cmd = Output(cmd_t.cloneType)
@@ -17,11 +21,11 @@ class ROBIssue[T <: Data](cmd_t: T, rob_entries: Int) extends Bundle {
 
   def fire(dummy: Int=0) = valid && ready
 
-  override def cloneType: this.type = new ROBIssue(cmd_t, rob_entries).asInstanceOf[this.type]
+  override def cloneType: this.type = new ReservationStationIssue(cmd_t, rob_entries).asInstanceOf[this.type]
 }
 
 // TODO we don't need to store the full command in here. We should be able to release the command directly into the relevant controller and only store the associated metadata in the ROB. This would reduce the size considerably
-class ROB[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], cmd_t: RoCCCommand) extends Module {
+class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], cmd_t: RoCCCommand) extends Module {
   import config._
 
   val block_rows = tileRows * meshRows
@@ -33,9 +37,9 @@ class ROB[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConf
     val completed = Flipped(Valid(UInt(log2Up(rob_entries).W)))
 
     val issue = new Bundle {
-      val ld = new ROBIssue(cmd_t, rob_entries)
-      val st = new ROBIssue(cmd_t, rob_entries)
-      val ex = new ROBIssue(cmd_t, rob_entries)
+      val ld = new ReservationStationIssue(cmd_t, rob_entries)
+      val st = new ReservationStationIssue(cmd_t, rob_entries)
+      val ex = new ReservationStationIssue(cmd_t, rob_entries)
     }
 
     val ld_utilization = Output(UInt(log2Up(rob_entries+1).W))
@@ -97,8 +101,8 @@ class ROB[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConf
     // Debugging signals
     val allocated_at = UInt(instructions_allocated.getWidth.W)
   }
-  val full_entries = Reg(Vec(rob_full_entries, UDValid(new Entry)))
-  val partial_entries = Reg(Vec(rob_partial_entries, UDValid(new Entry)))
+  val full_entries = Reg(Vec(reservation_station_full_entries, UDValid(new Entry)))
+  val partial_entries = Reg(Vec(reservation_station_partial_entries, UDValid(new Entry)))
 
   val entries = full_entries ++ partial_entries
 
@@ -122,9 +126,9 @@ class ROB[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConf
 
   val new_entry = Wire(new Entry)
   new_entry := DontCare
-  val new_full_allocs = Wire(Vec(rob_full_entries, Bool()))
+  val new_full_allocs = Wire(Vec(reservation_station_full_entries, Bool()))
   new_full_allocs.foreach(_ := false.B)
-  val new_partial_allocs = Wire(Vec(rob_partial_entries, Bool()))
+  val new_partial_allocs = Wire(Vec(reservation_station_partial_entries, Bool()))
   new_partial_allocs.foreach(_ := false.B)
   val new_entry_oh = new_full_allocs ++ new_partial_allocs
   val alloc_fire = io.alloc.fire()
@@ -333,8 +337,8 @@ class ROB[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConf
     new_entry.complete_on_issue := new_entry.is_config && new_entry.q =/= exq
 
     val is_full = PopCount(Seq(dst.valid, op1.valid, op2.valid)) > 1.U
-    val full_alloc_id = MuxCase((rob_full_entries-1).U, full_entries.zipWithIndex.map { case (e, i) => !e.valid -> i.U })
-    val partial_alloc_id = MuxCase((rob_partial_entries-1).U, partial_entries.zipWithIndex.map { case (e, i) => !e.valid -> i.U })
+    val full_alloc_id = MuxCase((reservation_station_full_entries-1).U, full_entries.zipWithIndex.map { case (e, i) => !e.valid -> i.U })
+    val partial_alloc_id = MuxCase((reservation_station_partial_entries-1).U, partial_entries.zipWithIndex.map { case (e, i) => !e.valid -> i.U })
 
     when (!is_full && !partial_entries(partial_alloc_id).valid) {
       io.alloc.ready := true.B
@@ -453,7 +457,7 @@ class ROB[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConf
     dontTouch(e.bits.allocated_at)
   }
 
-  val cntr = Counter(10000000)
+  val cntr = Counter(2000000)
   when (cntr.inc()) {
     printf(p"Utilization: $utilization\n")
     printf(p"Utilization ld q (incomplete): $utilization_ld_q_unissued\n")
@@ -462,17 +466,33 @@ class ROB[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConf
     printf(p"Utilization ld q: $utilization_ld_q\n")
     printf(p"Utilization st q: $utilization_st_q\n")
     printf(p"Utilization ex q: $utilization_ex_q\n")
+
+    if (use_firesim_simulation_counters) {
+      printf(SynthesizePrintf("Utilization: %d\n", utilization))
+      printf(SynthesizePrintf("Utilization ld q (incomplete): %d\n", utilization_ld_q_unissued))
+      printf(SynthesizePrintf("Utilization st q (incomplete): %d\n", utilization_st_q_unissued))
+      printf(SynthesizePrintf("Utilization ex q (incomplete): %d\n", utilization_ex_q_unissued))
+      printf(SynthesizePrintf("Utilization ld q: %d\n", utilization_ld_q))
+      printf(SynthesizePrintf("Utilization st q: %d\n", utilization_st_q))
+      printf(SynthesizePrintf("Utilization ex q: %d\n", utilization_ex_q))
+    }
+
     printf(p"Packed deps: $packed_deps\n")
   }
 
+  if (use_firesim_simulation_counters) {
+    PerfCounter(io.busy, "reservation_station_busy", "cycles where reservation station has entries")
+    PerfCounter(!io.alloc.ready, "reservation_station_full", "cycles where reservation station is full")
+  }
+
   when (reset.asBool()) {
     entries.foreach(_.valid := false.B)
   }
 
   CounterEventIO.init(io.counter)
-  io.counter.connectExternalCounter(CounterExternal.ROB_LD_COUNT, utilization_ld_q)
-  io.counter.connectExternalCounter(CounterExternal.ROB_ST_COUNT, utilization_st_q)
-  io.counter.connectExternalCounter(CounterExternal.ROB_EX_COUNT, utilization_ex_q)
-  io.counter.connectEventSignal(CounterEvent.ROB_ACTIVE_CYCLES, io.busy)
-  io.counter.connectEventSignal(CounterEvent.ROB_FULL_CYCLES, !io.alloc.ready)
+  io.counter.connectExternalCounter(CounterExternal.RESERVATION_STATION_LD_COUNT, utilization_ld_q)
+  io.counter.connectExternalCounter(CounterExternal.RESERVATION_STATION_ST_COUNT, utilization_st_q)
+  io.counter.connectExternalCounter(CounterExternal.RESERVATION_STATION_EX_COUNT, utilization_ex_q)
+  io.counter.connectEventSignal(CounterEvent.RESERVATION_STATION_ACTIVE_CYCLES, io.busy)
+  io.counter.connectEventSignal(CounterEvent.RESERVATION_STATION_FULL_CYCLES, !io.alloc.ready)
 }
diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala
index f9b6293f..e3289b7f 100644
--- a/src/main/scala/gemmini/Scratchpad.scala
+++ b/src/main/scala/gemmini/Scratchpad.scala
@@ -160,10 +160,12 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
   val id_node = TLIdentityNode()
   val xbar_node = TLXbar()
 
-  val reader = LazyModule(new StreamReader(config, max_in_flight_reqs, dataBits, maxBytes, spad_w, acc_w, aligned_to,
-    sp_banks * sp_bank_entries, acc_banks * acc_bank_entries, block_rows, use_tlb_register_filter))
-  val writer = LazyModule(new StreamWriter(max_in_flight_reqs, dataBits, maxBytes,
-    if (acc_read_full_width) acc_w else spad_w, aligned_to, inputType, block_cols, use_tlb_register_filter))
+  val reader = LazyModule(new StreamReader(config, max_in_flight_mem_reqs, dataBits, maxBytes, spad_w, acc_w, aligned_to,
+    sp_banks * sp_bank_entries, acc_banks * acc_bank_entries, block_rows, use_tlb_register_filter,
+    use_firesim_simulation_counters))
+  val writer = LazyModule(new StreamWriter(max_in_flight_mem_reqs, dataBits, maxBytes,
+    if (acc_read_full_width) acc_w else spad_w, aligned_to, inputType, block_cols, use_tlb_register_filter,
+    use_firesim_simulation_counters))
 
   // TODO make a cross-bar vs two separate ports a config option
   // id_node :=* reader.node
@@ -191,7 +193,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
       // Accumulator ports
       val acc = new Bundle {
         val read_req = Flipped(Vec(acc_banks, Decoupled(new AccumulatorReadReq(
-          acc_bank_entries, log2Up(accType.getWidth), acc_scale_args.multiplicand_t
+          acc_bank_entries, log2Up(accType.getWidth), acc_scale_t.asInstanceOf[V]
         ))))
         val read_resp = Vec(acc_banks, Decoupled(new AccumulatorScaleResp(
           Vec(meshColumns, Vec(tileColumns, inputType)),
@@ -216,9 +218,9 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
     // Write scale queue is necessary to maintain in-order requests to accumulator scale unit
     // Writes from main SPAD just flow directly between scale_q and issue_q, while writes
     // From acc are ordered
-    val write_scale_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, acc_scale_t_bits), mem_pipeline))
-    val write_issue_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, acc_scale_t_bits), mem_pipeline+1, pipe=true))
-    val read_issue_q = Module(new Queue(new ScratchpadMemReadRequest(local_addr_t, mvin_scale_t_bits), mem_pipeline+1, pipe=true)) // TODO can't this just be a normal queue?
+    val write_scale_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, acc_scale_t_bits), spad_read_delay))
+    val write_issue_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, acc_scale_t_bits), spad_read_delay+1, pipe=true))
+    val read_issue_q = Module(new Queue(new ScratchpadMemReadRequest(local_addr_t, mvin_scale_t_bits), spad_read_delay+1, pipe=true)) // TODO can't this just be a normal queue?
 
     write_scale_q.io.enq.valid := false.B
     write_scale_q.io.enq.bits  := write_dispatch_q.bits
@@ -409,8 +411,8 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
         ex_read_resp.valid := bio.read.resp.valid && !bio.read.resp.bits.fromDMA
         ex_read_resp.bits := bio.read.resp.bits
 
-        val dma_read_pipe = Pipeline(dma_read_resp, mem_pipeline)
-        val ex_read_pipe = Pipeline(ex_read_resp, mem_pipeline)
+        val dma_read_pipe = Pipeline(dma_read_resp, spad_read_delay)
+        val ex_read_pipe = Pipeline(ex_read_resp, spad_read_delay)
 
 
         bio.read.resp.ready := Mux(bio.read.resp.bits.fromDMA, dma_read_resp.ready, ex_read_resp.ready)
@@ -478,11 +480,14 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
     val acc_scale_unit = Module(new AccumulatorScale(
       acc_row_t,
       spad_row_t,
-      acc_scale_args.multiplicand_t,
+      acc_scale_t.asInstanceOf[V],
       log2Up(accType.getWidth),
       acc_read_small_width,
       acc_read_full_width,
-      acc_scale_args
+      acc_scale_func,
+      acc_scale_num_units,
+      acc_scale_latency,
+      has_nonlinear_activations,
     ))
 
     acc_scale_unit.io.in.valid := false.B
@@ -511,8 +516,8 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
     {
 
       val banks = Seq.fill(acc_banks) { Module(new AccumulatorMem(
-        acc_bank_entries, acc_row_t, acc_scale_args,
-        acc_singleported, num_acc_sub_banks
+        acc_bank_entries, acc_row_t, acc_scale_func, acc_scale_t.asInstanceOf[V],
+        acc_singleported, acc_sub_banks
       )) }
       val bank_ios = VecInit(banks.map(_.io))
 
@@ -610,13 +615,13 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
           !((mvin_scale_out.valid && mvin_scale_out.bits.last) || (mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.last))
         val consecutive_write_block = RegInit(false.B)
         if (acc_singleported) {
-          val consecutive_write_sub_bank = RegInit(0.U((1 max log2Ceil(num_acc_sub_banks)).W))
+          val consecutive_write_sub_bank = RegInit(0.U((1 max log2Ceil(acc_sub_banks)).W))
           when (bio.write.fire() && bio.write.bits.acc &&
-            (bio.write.bits.addr(log2Ceil(num_acc_sub_banks)-1,0) === consecutive_write_sub_bank)) {
+            (bio.write.bits.addr(log2Ceil(acc_sub_banks)-1,0) === consecutive_write_sub_bank)) {
             consecutive_write_block := true.B
           } .elsewhen (bio.write.fire() && bio.write.bits.acc) {
             consecutive_write_block := false.B
-            consecutive_write_sub_bank := bio.write.bits.addr(log2Ceil(num_acc_sub_banks)-1,0)
+            consecutive_write_sub_bank := bio.write.bits.addr(log2Ceil(acc_sub_banks)-1,0)
           } .otherwise {
             consecutive_write_block := false.B
           }
diff --git a/src/main/scala/gemmini/StoreController.scala b/src/main/scala/gemmini/StoreController.scala
index 50efcfe5..28de72c3 100644
--- a/src/main/scala/gemmini/StoreController.scala
+++ b/src/main/scala/gemmini/StoreController.scala
@@ -7,6 +7,7 @@ import chisel3.experimental._
 import GemminiISA._
 import Util._
 import freechips.rocketchip.config.Parameters
+import midas.targetutils.PerfCounter
 
 // TODO this is almost a complete copy of LoadController. We should combine them into one class
 // TODO deal with errors when reading scratchpad responses
@@ -42,7 +43,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
   val max_blocks = (dma_maxbytes / (block_cols * inputType.getWidth / 8)) max 1
 
   val activation = Reg(UInt(GemminiISA.CONFIG_MVOUT_RS1_ACTIVATION_WIDTH.W))
-  val acc_scale = Reg(acc_scale_args.multiplicand_t)
+  val acc_scale = Reg(acc_scale_t)
 
   //val row_counter = RegInit(0.U(log2Ceil(block_rows).W))
   val row_counter = RegInit(0.U(12.W)) // TODO magic number
@@ -64,7 +65,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
   val wrow_counter = RegInit(0.U(pool_size.getWidth.W))
   val wcol_counter = RegInit(0.U(pool_size.getWidth.W))
 
-  val pooling_is_enabled = pool_stride =/= 0.U
+  val pooling_is_enabled = has_max_pool.B && pool_stride =/= 0.U
   val mvout_1d_enabled = pool_size =/= 0.U && !pooling_is_enabled //1-D move out enabled (no pooling)
 
   val orow = porow_counter * pool_stride +& wrow_counter - pool_upad // TODO get rid of this multiplication
@@ -118,7 +119,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
 
   val mvout_1d_rows = pool_orows * pool_ocols //for 1D mvout
   // Command tracker instantiation
-  val nCmds = (max_in_flight_reqs / block_rows) + 1
+  val nCmds = (max_in_flight_mem_reqs / block_rows) + 1
 
   val deps_t = new Bundle {
     val rob_id = UInt(log2Up(rob_entries).W)
@@ -200,7 +201,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
 
           activation := config_activation
           when (!config_acc_scale.asUInt().andR()) {
-            acc_scale := config_acc_scale.asTypeOf(acc_scale_args.multiplicand_t)
+            acc_scale := config_acc_scale.asTypeOf(acc_scale_t)
           }
 
           pool_size := config_pool_size
@@ -265,4 +266,9 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
   io.counter.connectEventSignal(CounterEvent.STORE_POOLING_CYCLE, pooling_is_enabled)
   io.counter.connectEventSignal(CounterEvent.STORE_DMA_WAIT_CYCLE, control_state === waiting_for_dma_req_ready)
   io.counter.connectEventSignal(CounterEvent.STORE_SCRATCHPAD_WAIT_CYCLE, io.dma.req.valid && !io.dma.req.ready)
+
+  if (use_firesim_simulation_counters) {
+    PerfCounter(pooling_is_enabled, "pooling_cycles", "cycles during which store controller is max-pooling")
+    PerfCounter(io.dma.req.valid && !io.dma.req.ready, "st_dma_wait_cycle", "cycles during which store controller is stalling for the DMA to be ready")
+  }
 }
diff --git a/src/main/scala/gemmini/TransposePreloadUnroller.scala b/src/main/scala/gemmini/TransposePreloadUnroller.scala
index 90a3394a..0bac0e5b 100644
--- a/src/main/scala/gemmini/TransposePreloadUnroller.scala
+++ b/src/main/scala/gemmini/TransposePreloadUnroller.scala
@@ -5,6 +5,7 @@ import chisel3.util._
 import chisel3.experimental.ChiselEnum
 import chipsalliance.rocketchip.config.Parameters
 import Util._
+import midas.targetutils.PerfCounter
 
 class TransposePreloadUnroller[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V])
                                                                  (implicit p: Parameters) extends Module {
diff --git a/src/main/scala/gemmini/XactTracker.scala b/src/main/scala/gemmini/XactTracker.scala
index efdd7636..e8581a26 100644
--- a/src/main/scala/gemmini/XactTracker.scala
+++ b/src/main/scala/gemmini/XactTracker.scala
@@ -3,6 +3,7 @@ package gemmini
 import chisel3._
 import chisel3.util._
 import gemmini.Util.UDValid
+import midas.targetutils.SynthesizePrintf
 
 class XactTrackerEntry[U <: Data](maxShift: Int, spadWidth: Int, accWidth: Int,
                                   spadRows: Int, accRows: Int, maxReqBytes: Int, mvin_scale_t_bits: Int,
@@ -53,11 +54,14 @@ class XactTrackerPeekIO[U <: Data](val nXacts: Int, val maxShift: Int, val spadW
     maxMatrices: the maximum number of rows from different matrices which can be packed into one request
  */
 class XactTracker[U <: Data](nXacts: Int, maxShift: Int, spadWidth: Int, accWidth: Int,
-                             spadRows: Int, accRows: Int, maxReqBytes: Int, mvin_scale_t_bits: Int, nCmds: Int) extends Module {
+                             spadRows: Int, accRows: Int, maxReqBytes: Int, mvin_scale_t_bits: Int, nCmds: Int,
+                             use_firesim_simulation_counters: Boolean) extends Module {
   val io = IO(new Bundle {
     val alloc = Flipped(new XactTrackerAllocIO(nXacts, maxShift, spadWidth, accWidth, spadRows, accRows, maxReqBytes, mvin_scale_t_bits, nCmds))
     val peek = new XactTrackerPeekIO(nXacts, maxShift, spadWidth, accWidth, spadRows, accRows, maxReqBytes, mvin_scale_t_bits, nCmds)
     val busy = Output(Bool())
+
+    val counter = new CounterEventIO()
   })
 
   val entries = Reg(Vec(nXacts, UDValid(new XactTrackerEntry(maxShift, spadWidth, accWidth, spadRows, accRows, maxReqBytes, mvin_scale_t_bits, nCmds))))
@@ -83,4 +87,23 @@ class XactTracker[U <: Data](nXacts: Int, maxShift: Int, spadWidth: Int, accWidt
   when (reset.asBool()) {
     entries.foreach(_.valid := false.B)
   }
+
+  // Performance counters
+  CounterEventIO.init(io.counter)
+
+  val total_latency = RegInit(0.U(CounterExternal.EXTERNAL_WIDTH.W))
+  when (io.counter.external_reset) {
+    total_latency := 0.U
+  }.otherwise {
+    total_latency := total_latency + PopCount(entries.map(_.valid))
+  }
+
+  io.counter.connectExternalCounter(CounterExternal.RDMA_TOTAL_LATENCY, total_latency)
+
+  if (use_firesim_simulation_counters) {
+    val cntr = Counter(500000)
+    when(cntr.inc()) {
+      printf(SynthesizePrintf("RDMA total latency: %d\n", total_latency))
+    }
+  }
 }

From 73484616cd70f75d05d824efe2f6604b88459f5d Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Fri, 12 Nov 2021 14:31:23 -0800
Subject: [PATCH 05/11] Add Option to Use Two Separate TLBs for Read and Write
 DMAs (#135)

By default, a single TLB is shared by the read and write DMAs; setting use_shared_tlb to false gives each DMA its own TLB and PTW port.
---
 src/main/scala/gemmini/Controller.scala     | 22 +++---
 src/main/scala/gemmini/FrontendTLB.scala    | 75 +++++++++++----------
 src/main/scala/gemmini/GemminiConfigs.scala |  1 +
 3 files changed, 54 insertions(+), 44 deletions(-)

diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala
index 08481a5c..f1de9486 100644
--- a/src/main/scala/gemmini/Controller.scala
+++ b/src/main/scala/gemmini/Controller.scala
@@ -24,7 +24,7 @@ class Gemmini[T <: Data : Arithmetic, U <: Data, V <: Data](val config: GemminiA
                                      (implicit p: Parameters)
   extends LazyRoCC (
     opcodes = config.opcodes,
-    nPTWPorts = 1) {
+    nPTWPorts = if (config.use_shared_tlb) 1 else 2) {
 
   Files.write(Paths.get(config.headerFilePath), config.generateHeader().getBytes(StandardCharsets.UTF_8))
   if (System.getenv("GEMMINI_ONLY_GENERATE_GEMMINI_H") == "1") {
@@ -62,15 +62,17 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
 
   // TLB
   implicit val edge = outer.node.edges.out.head
-  val tlb = Module(new FrontendTLB(2, tlb_size, dma_maxbytes, use_tlb_register_filter, use_firesim_simulation_counters))
+  val tlb = Module(new FrontendTLB(2, tlb_size, dma_maxbytes, use_tlb_register_filter, use_firesim_simulation_counters, use_shared_tlb))
   (tlb.io.clients zip outer.spad.module.io.tlb).foreach(t => t._1 <> t._2)
-  tlb.io.exp.flush_skip := false.B
-  tlb.io.exp.flush_retry := false.B
-  counters.io.event_io.collect(tlb.io.counter)
 
-  io.ptw.head <> tlb.io.ptw
+  tlb.io.exp.foreach(_.flush_skip := false.B)
+  tlb.io.exp.foreach(_.flush_retry := false.B)
+
+  io.ptw <> tlb.io.ptw
 
-  spad.module.io.flush := tlb.io.exp.flush()
+  counters.io.event_io.collect(tlb.io.counter)
+
+  spad.module.io.flush := tlb.io.exp.map(_.flush()).reduce(_ || _)
 
   /*
   //=========================================================================
@@ -311,7 +313,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
 
   // Wire up global RoCC signals
   io.busy := raw_cmd.valid || loop_conv_unroller_busy || loop_matmul_unroller_busy || reservation_station.io.busy || spad.module.io.busy || unrolled_cmd.valid || loop_cmd.valid || conv_cmd.valid
-  io.interrupt := tlb.io.exp.interrupt
+  io.interrupt := tlb.io.exp.map(_.interrupt).reduce(_ || _)
 
   reservation_station.io.solitary_preload := ex_controller.io.solitary_preload
 
@@ -356,8 +358,8 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
 
     when (is_flush) {
       val skip = unrolled_cmd.bits.rs1(0)
-      tlb.io.exp.flush_skip := skip
-      tlb.io.exp.flush_retry := !skip
+      tlb.io.exp.foreach(_.flush_skip := skip)
+      tlb.io.exp.foreach(_.flush_retry := !skip)
 
       unrolled_cmd.ready := true.B // TODO should we wait for an acknowledgement from the TLB?
     }
diff --git a/src/main/scala/gemmini/FrontendTLB.scala b/src/main/scala/gemmini/FrontendTLB.scala
index 50c393b5..269409fc 100644
--- a/src/main/scala/gemmini/FrontendTLB.scala
+++ b/src/main/scala/gemmini/FrontendTLB.scala
@@ -84,51 +84,66 @@ class FrontendTLBIO(implicit p: Parameters) extends CoreBundle {
   val resp = Flipped(new TLBResp)
 }
 
-class FrontendTLB(nClients: Int, entries: Int, maxSize: Int, use_tlb_register_filter: Boolean, use_firesim_simulation_counters: Boolean)
+class FrontendTLB(nClients: Int, entries: Int, maxSize: Int, use_tlb_register_filter: Boolean, use_firesim_simulation_counters: Boolean, use_shared_tlb: Boolean)
                  (implicit edge: TLEdgeOut, p: Parameters) extends CoreModule {
+
+  val num_tlbs = if (use_shared_tlb) 1 else nClients
+  val lgMaxSize = log2Ceil(coreDataBytes)
+
   val io = IO(new Bundle {
     val clients = Flipped(Vec(nClients, new FrontendTLBIO))
-    val ptw = new TLBPTWIO
-    val exp = new TLBExceptionIO
+    val ptw = Vec(num_tlbs, new TLBPTWIO)
+    val exp = Vec(num_tlbs, new TLBExceptionIO)
     val counter = new CounterEventIO()
   })
 
-  val lgMaxSize = log2Ceil(coreDataBytes)
-  val tlbArb = Module(new RRArbiter(new DecoupledTLBReq(lgMaxSize), nClients))
-  val tlb = Module(new DecoupledTLB(entries, maxSize, use_firesim_simulation_counters))
-  tlb.io.req.valid := tlbArb.io.out.valid
-  tlb.io.req.bits := tlbArb.io.out.bits
-  tlbArb.io.out.ready := true.B
+  val tlbs = Seq.fill(num_tlbs)(Module(new DecoupledTLB(entries, maxSize, use_firesim_simulation_counters)))
 
-  io.ptw <> tlb.io.ptw
-  io.exp <> tlb.io.exp
+  io.ptw <> VecInit(tlbs.map(_.io.ptw))
+  io.exp <> VecInit(tlbs.map(_.io.exp))
+
+  val tlbArbOpt = if (use_shared_tlb) Some(Module(new RRArbiter(new DecoupledTLBReq(lgMaxSize), nClients))) else None
+
+  if (use_shared_tlb) {
+    val tlbArb = tlbArbOpt.get
+    val tlb = tlbs.head
+    tlb.io.req.valid := tlbArb.io.out.valid
+    tlb.io.req.bits := tlbArb.io.out.bits
+    tlbArb.io.out.ready := true.B
+  }
 
-  io.clients.zip(tlbArb.io.in).foreach { case (client, req) =>
+  io.clients.zipWithIndex.foreach { case (client, i) =>
     val last_translated_valid = RegInit(false.B)
     val last_translated_vpn = RegInit(0.U(vaddrBits.W))
     val last_translated_ppn = RegInit(0.U(paddrBits.W))
 
-    val l0_tlb_hit = last_translated_valid && ((client.req.bits.tlb_req.vaddr >> pgIdxBits) === (last_translated_vpn >> pgIdxBits))
+    val l0_tlb_hit = last_translated_valid && ((client.req.bits.tlb_req.vaddr >> pgIdxBits).asUInt() === (last_translated_vpn >> pgIdxBits).asUInt())
     val l0_tlb_paddr = Cat(last_translated_ppn >> pgIdxBits, client.req.bits.tlb_req.vaddr(pgIdxBits-1,0))
 
-    when (req.fire() && !tlb.io.resp.miss) {
+    val tlb = if (use_shared_tlb) tlbs.head else tlbs(i)
+    val tlbReq = if (use_shared_tlb) tlbArbOpt.get.io.in(i).bits else tlb.io.req.bits
+    val tlbReqValid = if (use_shared_tlb) tlbArbOpt.get.io.in(i).valid else tlb.io.req.valid
+    val tlbReqFire = if (use_shared_tlb) tlbArbOpt.get.io.in(i).fire() else tlb.io.req.fire()
+
+    tlbReqValid := RegNext(client.req.valid && !l0_tlb_hit)
+    tlbReq := RegNext(client.req.bits)
+
+    when (tlbReqFire && !tlb.io.resp.miss) {
       last_translated_valid := true.B
-      last_translated_vpn := req.bits.tlb_req.vaddr
+      last_translated_vpn := tlbReq.tlb_req.vaddr
       last_translated_ppn := tlb.io.resp.paddr
     }
-    when (io.exp.flush()) {
+
+    when (tlb.io.exp.flush()) {
       last_translated_valid := false.B
     }
 
-    req.valid := RegNext(client.req.valid && !l0_tlb_hit)
-    req.bits := RegNext(client.req.bits)
-
-    when (!req.fire()) {
+    when (tlbReqFire) {
+      client.resp := tlb.io.resp
+    }.otherwise {
       client.resp := DontCare
       client.resp.paddr := RegNext(l0_tlb_paddr)
       client.resp.miss := !RegNext(l0_tlb_hit)
-    } .otherwise {
-      client.resp := tlb.io.resp
     }
 
     // If we're not using the TLB filter register, then we set this value to always be false
@@ -137,16 +152,8 @@ class FrontendTLB(nClients: Int, entries: Int, maxSize: Int, use_tlb_register_fi
     }
   }
 
-  io.counter.collect(tlb.io.counter)
+  // TODO Return the sum of the TLB counters, rather than just the counters of the first TLB. This only matters if we're
+  // not using the shared TLB
+  tlbs.foreach(_.io.counter.external_reset := false.B)
+  io.counter.collect(tlbs.head.io.counter)
 }
-
-/*class TLBArb (nClients: Int, lgMaxSize: Int)(implicit p: Parameters) extends CoreModule {
-  val io = IO(new Bundle {
-    val in_req = Vec(nClients, Flipped(Decoupled(new TLBReq(lgMaxSize))))
-    val in_resp = Vec(nClients, Flipped(Valid(new TLBResp)))
-    val out_req = Decoupled(new TLBReq(lgMaxSize))
-    val out_resp = Valid(new TLBResp)
-  })
-
-  val priority = Reg(UInt(log2Up(nClients).W))
-}*/
diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala
index 45c481ce..041bfcd0 100644
--- a/src/main/scala/gemmini/GemminiConfigs.scala
+++ b/src/main/scala/gemmini/GemminiConfigs.scala
@@ -73,6 +73,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
                                                                              ex_write_to_acc: Boolean = true,
 
                                                                              hardcode_d_to_garbage_addr: Boolean = false,
+                                                                             use_shared_tlb: Boolean = true,
 
                                                                              mesh_output_delay: Int = 1,
 

From b9ff1540c9043b7600be37cded897f4bb0a81897 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Tue, 30 Nov 2021 21:32:38 -0800
Subject: [PATCH 06/11] Chip optimizations (#83)

* Support single-porting accumulator through the use of accumulator "sub-banks"
* Support clock-gating Gemmini modules
* Support sharing SPAD/ACC between Int8 and FP gemminis
* Reduce bitwidths of loop unroller multipliers and adders
* Fix error where a small portion of the scratchpad was unusable when double-buffering in the loop unrollers

When single-porting the accumulator banks, input-dilated convs will sometimes fail because they keep writing to the same accumulator banks. A different write pattern will have to be found eventually for those cases, but that's outside the scope of this PR.
---
 src/main/scala/gemmini/AccumulatorMem.scala   | 253 ++++++++++++------
 src/main/scala/gemmini/Configs.scala          |  83 +++++-
 src/main/scala/gemmini/ConfigsFP.scala        |   5 +-
 src/main/scala/gemmini/Controller.scala       |  37 ++-
 src/main/scala/gemmini/DSEConfigs.scala       |   5 +-
 src/main/scala/gemmini/GemminiConfigs.scala   |  12 +-
 src/main/scala/gemmini/GemminiISA.scala       |   2 +
 src/main/scala/gemmini/LocalAddr.scala        |  10 +
 src/main/scala/gemmini/LoopConv.scala         |  27 +-
 src/main/scala/gemmini/LoopMatmul.scala       |  25 +-
 src/main/scala/gemmini/Scratchpad.scala       |  89 ++++--
 src/main/scala/gemmini/SharedExtMem.scala     |  80 ++++++
 .../gemmini/VectorScalarMultiplier.scala      |   4 +-
 13 files changed, 488 insertions(+), 144 deletions(-)
 create mode 100644 src/main/scala/gemmini/SharedExtMem.scala

diff --git a/src/main/scala/gemmini/AccumulatorMem.scala b/src/main/scala/gemmini/AccumulatorMem.scala
index 89a39182..8f3fbaf5 100644
--- a/src/main/scala/gemmini/AccumulatorMem.scala
+++ b/src/main/scala/gemmini/AccumulatorMem.scala
@@ -44,17 +44,59 @@ class AccumulatorWriteReq[T <: Data: Arithmetic](n: Int, t: Vec[Vec[T]]) extends
   override def cloneType: this.type = new AccumulatorWriteReq(n, t).asInstanceOf[this.type]
 }
 
-class AccumulatorMemIO [T <: Data: Arithmetic, U <: Data](n: Int, t: Vec[Vec[T]], scale_t: U) extends Bundle {
+
+class AccumulatorMemIO [T <: Data: Arithmetic, U <: Data](n: Int, t: Vec[Vec[T]], scale_t: U,
+  acc_sub_banks: Int, use_shared_ext_mem: Boolean // size of the ext_mem vector; whether ext_mem exists at all
+) extends Bundle {
   val read = Flipped(new AccumulatorReadIO(n, log2Ceil(t.head.head.getWidth), t, scale_t))
-  // val write = Flipped(new AccumulatorWriteIO(n, t))
   val write = Flipped(Decoupled(new AccumulatorWriteReq(n, t)))
 
-  override def cloneType: this.type = new AccumulatorMemIO(n, t, scale_t).asInstanceOf[this.type]
+  val ext_mem = if (use_shared_ext_mem) Some(Vec(acc_sub_banks, new ExtMemIO)) else None // acc_sub_banks ExtMemIO ports, or None when use_shared_ext_mem is false
+
+  val adder = new Bundle { // the two operands leave through this bundle as outputs; the sum comes back as an input
+    val valid = Output(Bool())
+    val op1 = Output(t.cloneType)
+    val op2 = Output(t.cloneType)
+    val sum = Input(t.cloneType)
+  }
+
+  override def cloneType: this.type = new AccumulatorMemIO(n, t, scale_t, acc_sub_banks, use_shared_ext_mem).asInstanceOf[this.type] // forwards every constructor argument
+}
+
+class AccPipe[T <: Data](latency: Int, t: T)(implicit ev: Arithmetic[T]) extends Module { // single Arithmetic evidence (ev); adds op1 and op2 with a `latency`-stage ShiftRegister on the result
+  val io = IO(new Bundle {
+    val op1 = Input(t.cloneType)
+    val op2 = Input(t.cloneType)
+    val sum = Output(t.cloneType) // op1 + op2 after passing through the ShiftRegister below
+  })
+  import ev._ // makes the Arithmetic operations of `ev` (the `+` used below) available on T
+  io.sum := ShiftRegister(io.op1 + io.op2, latency)
+}
+
+class AccPipeShared[T <: Data : Arithmetic](latency: Int, t: Vec[Vec[T]], banks: Int) extends Module { // one element-wise adder array fed by `banks` operand pairs
+  val io = IO(new Bundle {
+    val in_sel = Input(Vec(banks, Bool())) // select lines for Mux1H; presumably one-hot - confirm at the driver
+    val ina = Input(Vec(banks, t.cloneType))
+    val inb = Input(Vec(banks, t.cloneType))
+    val out = Output(t.cloneType)
+  })
+  val ina = Mux1H(io.in_sel, io.ina) // operand A of the selected bank
+  val inb = Mux1H(io.in_sel, io.inb) // operand B of the selected bank
+  io.out := VecInit((ina zip inb).map { case (rv, wv) => // pair up the outer Vec entries of A and B
+    VecInit((rv zip wv).map { case (re, we) => // pair up the individual elements
+      val m = Module(new AccPipe(latency, t.head.head.cloneType)) // a separate AccPipe instance per element
+      m.io.op1 := re
+      m.io.op2 := we
+      m.io.sum
+    })
+  })
 }
 
 class AccumulatorMem[T <: Data, U <: Data](
-                                            n: Int, t: Vec[Vec[T]], scale_func: (T, U) => T, scale_t: U,
-                                            acc_singleported: Boolean, acc_sub_banks: Int
+  n: Int, t: Vec[Vec[T]], scale_func: (T, U) => T, scale_t: U,
+  acc_singleported: Boolean, acc_sub_banks: Int,
+  use_shared_ext_mem: Boolean,
+  acc_latency: Int, acc_type: T
 )
   (implicit ev: Arithmetic[T]) extends Module {
   // TODO Do writes in this module work with matrices of size 2? If we try to read from an address right after writing
@@ -69,54 +111,91 @@ class AccumulatorMem[T <: Data, U <: Data](
   import ev._
 
   // TODO unify this with TwoPortSyncMemIO
-  val io = IO(new AccumulatorMemIO(n, t, scale_t))
-
-
-  // For any write operation, we spend 2 cycles reading the existing address out, buffering it in a register, and then
-  // accumulating on top of it (if necessary)
-  val wdata_buf = ShiftRegister(io.write.bits.data, 2)
-  val waddr_buf = ShiftRegister(io.write.bits.addr, 2)
-  val acc_buf = ShiftRegister(io.write.bits.acc, 2)
-  val mask_buf = ShiftRegister(io.write.bits.mask, 2)
-  val w_buf_valid = ShiftRegister(io.write.fire(), 2)
-  val acc_rdata = Wire(t)
-  acc_rdata := DontCare
-  val read_rdata = Wire(t)
-  read_rdata := DontCare
+  val io = IO(new AccumulatorMemIO(n, t, scale_t, acc_sub_banks, use_shared_ext_mem))
+
+  require (acc_latency >= 2)
+
+  val pipelined_writes = Reg(Vec(acc_latency, Valid(new AccumulatorWriteReq(n, t))))
+  val oldest_pipelined_write = pipelined_writes(acc_latency-1)
+  pipelined_writes(0).valid := io.write.fire()
+  pipelined_writes(0).bits  := io.write.bits
+  for (i <- 1 until acc_latency) {
+    pipelined_writes(i) := pipelined_writes(i-1)
+  }
+
+  val rdata_for_adder = Wire(t)
+  rdata_for_adder := DontCare
+  val rdata_for_read_resp = Wire(t)
+  rdata_for_read_resp := DontCare
+
+  val adder_sum = io.adder.sum
+  io.adder.valid := pipelined_writes(0).valid && pipelined_writes(0).bits.acc
+  io.adder.op1 := rdata_for_adder
+  io.adder.op2 := pipelined_writes(0).bits.data
+
   val block_read_req = WireInit(false.B)
-  val w_sum = VecInit((RegNext(acc_rdata) zip wdata_buf).map { case (rv, wv) =>
-    VecInit((rv zip wv).map(t => t._1 + t._2))
-  })
+  val block_write_req = WireInit(false.B)
+
+  val mask_len = t.getWidth / 8
+  val mask_elem = UInt((t.getWidth / mask_len).W)
 
   if (!acc_singleported) {
-    val mem = TwoPortSyncMem(n, t, t.getWidth / 8) // TODO We assume byte-alignment here. Use aligned_to instead
-    mem.io.waddr := waddr_buf
-    mem.io.wen := w_buf_valid
-    mem.io.wdata := Mux(acc_buf, w_sum, wdata_buf)
-    mem.io.mask := mask_buf
-    acc_rdata := mem.io.rdata
-    read_rdata := mem.io.rdata
+    require(!use_shared_ext_mem)
+    val mem = TwoPortSyncMem(n, t, mask_len) // TODO We assume byte-alignment here. Use aligned_to instead
+    mem.io.waddr := oldest_pipelined_write.bits.addr
+    mem.io.wen := oldest_pipelined_write.valid
+    mem.io.wdata := Mux(oldest_pipelined_write.bits.acc, adder_sum, oldest_pipelined_write.bits.data)
+    mem.io.mask := oldest_pipelined_write.bits.mask
+    rdata_for_adder := mem.io.rdata
+    rdata_for_read_resp := mem.io.rdata
     mem.io.raddr := Mux(io.write.fire() && io.write.bits.acc, io.write.bits.addr, io.read.req.bits.addr)
     mem.io.ren := io.read.req.fire() || (io.write.fire() && io.write.bits.acc)
   } else {
-    val mask_len = t.getWidth / 8
-    val mask_elem = UInt((t.getWidth / mask_len).W)
-    val reads = Wire(Vec(2, Decoupled(UInt())))
-    reads(0).valid := io.write.valid && io.write.bits.acc
-    reads(0).bits  := io.write.bits.addr
-    reads(0).ready := true.B
-    reads(1).valid := io.read.req.valid
-    reads(1).bits  := io.read.req.bits.addr
-    reads(1).ready := true.B
-    block_read_req := !reads(1).ready
+    val rmw_req = Wire(Decoupled(UInt()))
+    rmw_req.valid := io.write.valid && io.write.bits.acc
+    rmw_req.bits := io.write.bits.addr
+    rmw_req.ready := true.B
+
+    block_write_req := !rmw_req.ready
+
+    val only_read_req = Wire(Decoupled(UInt()))
+    only_read_req.valid := io.read.req.valid
+    only_read_req.bits := io.read.req.bits.addr
+    only_read_req.ready := true.B
+
+    block_read_req := !only_read_req.ready
+
     for (i <- 0 until acc_sub_banks) {
       def isThisBank(addr: UInt) = addr(log2Ceil(acc_sub_banks)-1,0) === i.U
-      def getBankIdx(addr: UInt): UInt = (addr >> log2Ceil(acc_sub_banks)).asUInt()
-      val mem = SyncReadMem(n / acc_sub_banks, Vec(mask_len, mask_elem))
+      def getBankIdx(addr: UInt) = addr >> log2Ceil(acc_sub_banks)
+      val (read, write) = if (use_shared_ext_mem) {
+        def read(addr: UInt, ren: Bool): Data = {
+          io.ext_mem.get(i).read_en := ren
+          io.ext_mem.get(i).read_addr := addr
+          io.ext_mem.get(i).read_data
+        }
+        io.ext_mem.get(i).write_en := false.B
+        io.ext_mem.get(i).write_addr := DontCare
+        io.ext_mem.get(i).write_data := DontCare
+        io.ext_mem.get(i).write_mask := DontCare
+        def write(addr: UInt, wdata: Vec[UInt], wmask: Vec[Bool]) = {
+          io.ext_mem.get(i).write_en := true.B
+          io.ext_mem.get(i).write_addr := addr
+          io.ext_mem.get(i).write_data := wdata.asUInt
+          io.ext_mem.get(i).write_mask := wmask.asUInt
+        }
+        (read _, write _)
+      } else {
+        val mem = SyncReadMem(n / acc_sub_banks, Vec(mask_len, mask_elem))
+        def read(addr: UInt, ren: Bool): Data = mem.read(addr, ren)
+        def write(addr: UInt, wdata: Vec[UInt], wmask: Vec[Bool]) = mem.write(addr, wdata, wmask)
+        (read _, write _)
+      }
 
       val ren = WireInit(false.B)
-      val raddr = WireInit(getBankIdx(reads(0).bits))
+      val raddr = WireInit(getBankIdx(rmw_req.bits))
       val nEntries = 3
+
       // Writes coming 2 cycles after read leads to bad bank behavior
       // Add another buffer here
       class W_Q_Entry[T <: Data](mask_len: Int, mask_elem: T) extends Bundle {
@@ -126,25 +205,32 @@ class AccumulatorMem[T <: Data, U <: Data](
         val addr = UInt(log2Ceil(n/acc_sub_banks).W)
         override def cloneType: this.type = new W_Q_Entry(mask_len, mask_elem).asInstanceOf[this.type]
       }
+
       val w_q = Reg(Vec(nEntries, new W_Q_Entry(mask_len, mask_elem)))
       for (e <- w_q) {
         when (e.valid) {
           assert(!(
-            io.write.valid && io.write.bits.acc &&
+            io.write.fire() && io.write.bits.acc &&
             isThisBank(io.write.bits.addr) && getBankIdx(io.write.bits.addr) === e.addr &&
             ((io.write.bits.mask.asUInt & e.mask.asUInt) =/= 0.U)
-          ))
+          ), "you cannot accumulate to an AccumulatorMem address until previous writes to that address have completed")
+
+          when (io.write.bits.acc && isThisBank(io.write.bits.addr) && getBankIdx(io.write.bits.addr) === e.addr) {
+            rmw_req.ready := false.B
+          }
 
-          when (io.read.req.valid && isThisBank(io.read.req.bits.addr) && getBankIdx(io.read.req.bits.addr) === e.addr) {
-            reads(1).ready := false.B
+          when (isThisBank(io.read.req.bits.addr) && getBankIdx(io.read.req.bits.addr) === e.addr) {
+            only_read_req.ready := false.B
           }
         }
       }
+
       val w_q_head = RegInit(1.U(nEntries.W))
       val w_q_tail = RegInit(1.U(nEntries.W))
-      when (reset.asBool) {
-        w_q.foreach(_.valid := false.B)
-      }
+
+      val w_q_full = (w_q_tail.asBools zip w_q.map(_.valid)).map({ case (h,v) => h && v }).reduce(_||_)
+      val w_q_empty = !(w_q_head.asBools zip w_q.map(_.valid)).map({ case (h,v) => h && v }).reduce(_||_)
+
       val wen = WireInit(false.B)
       val wdata = Mux1H(w_q_head.asBools, w_q.map(_.data))
       val wmask = Mux1H(w_q_head.asBools, w_q.map(_.mask))
@@ -158,49 +244,61 @@ class AccumulatorMem[T <: Data, U <: Data](
         }
       }
 
-      when (w_buf_valid && isThisBank(waddr_buf)) {
-        assert(!((w_q_tail.asBools zip w_q.map(_.valid)).map({ case (h,v) => h && v }).reduce(_||_)))
+      val w_q_push = oldest_pipelined_write.valid && isThisBank(oldest_pipelined_write.bits.addr)
+
+      when (w_q_push) {
+        assert(!w_q_full || wen, "we ran out of acc-sub-bank write q entries")
+
         w_q_tail := (w_q_tail << 1).asUInt() | w_q_tail(nEntries-1)
         for (i <- 0 until nEntries) {
           when (w_q_tail(i)) {
             w_q(i).valid := true.B
-            w_q(i).data  := Mux(acc_buf, w_sum, wdata_buf).asTypeOf(Vec(mask_len, mask_elem))
-            w_q(i).mask  := mask_buf
-            w_q(i).addr  := getBankIdx(waddr_buf)
+            w_q(i).data  := Mux(oldest_pipelined_write.bits.acc, adder_sum, oldest_pipelined_write.bits.data).asTypeOf(Vec(mask_len, mask_elem))
+            w_q(i).mask  := oldest_pipelined_write.bits.mask
+            w_q(i).addr  := getBankIdx(oldest_pipelined_write.bits.addr)
           }
         }
-
       }
-      val bank_rdata = mem.read(raddr, ren && !wen).asTypeOf(t)
-      when (RegNext(ren && reads(0).valid && isThisBank(reads(0).bits))) {
-        acc_rdata := bank_rdata
+
+      val bank_rdata = read(raddr, ren && !wen).asTypeOf(t)
+      when (RegNext(ren && rmw_req.valid && isThisBank(rmw_req.bits))) {
+        rdata_for_adder := bank_rdata
       } .elsewhen (RegNext(ren)) {
-        read_rdata := bank_rdata
+        rdata_for_read_resp := bank_rdata
       }
+
       when (wen) {
-        mem.write(waddr, wdata, wmask)
+        write(waddr, wdata, wmask)
       }
+
       // Three requestors, 1 slot
-      // Priority is incoming reads for RMW > writes from RMW > incoming reads
-      when (reads(0).valid && isThisBank(reads(0).bits)) {
+      // Priority is (in descending order):
+      //   1. incoming reads for RMW
+      //   2. writes from RMW
+      //   3. incoming reads
+      when (rmw_req.fire() && isThisBank(rmw_req.bits)) {
         ren := true.B
-        when (isThisBank(reads(1).bits)) {
-          reads(1).ready := false.B
+        when (isThisBank(only_read_req.bits)) {
+          only_read_req.ready := false.B
         }
-      } .elsewhen ((w_q_head.asBools zip w_q.map(_.valid)).map({ case (h,v) => h && v }).reduce(_||_)) {
+      } .elsewhen (!w_q_empty) {
         wen := true.B
-        when (isThisBank(reads(1).bits)) {
-          reads(1).ready := false.B
+        when (isThisBank(only_read_req.bits)) {
+          only_read_req.ready := false.B
         }
       } .otherwise {
-        ren := isThisBank(reads(1).bits)
-        raddr := getBankIdx(reads(1).bits)
+        ren := isThisBank(only_read_req.bits) && only_read_req.fire()
+        raddr := getBankIdx(only_read_req.bits)
+      }
+
+      when (reset.asBool) {
+        w_q.foreach(_.valid := false.B)
       }
     }
   }
 
   val q = Module(new Queue(new AccumulatorReadResp(t, scale_t, log2Ceil(t.head.head.getWidth)),  1, true, true))
-  q.io.enq.bits.data := read_rdata
+  q.io.enq.bits.data := rdata_for_read_resp
   q.io.enq.bits.scale := RegNext(io.read.req.bits.scale)
   q.io.enq.bits.relu6_shift := RegNext(io.read.req.bits.relu6_shift)
   q.io.enq.bits.act := RegNext(io.read.req.bits.act)
@@ -222,17 +320,18 @@ class AccumulatorMem[T <: Data, U <: Data](
   val q_will_be_empty = (q.io.count +& q.io.enq.fire()) - q.io.deq.fire() === 0.U
   io.read.req.ready := q_will_be_empty && (
       // Make sure we aren't accumulating, which would take over both ports
-      !(io.write.fire() && io.write.bits.acc) &&
-      // Make sure we aren't reading something that is still being written
-      !(RegNext(io.write.fire()) && RegNext(io.write.bits.addr) === io.read.req.bits.addr) &&
-      !(w_buf_valid && waddr_buf === io.read.req.bits.addr) &&
+      !(io.write.valid && io.write.bits.acc) &&
+      !pipelined_writes.map(r => r.valid && r.bits.addr === io.read.req.bits.addr).reduce(_||_)  &&
       !block_read_req
   )
 
-  io.write.ready := !io.write.bits.acc || (!(io.write.bits.addr === waddr_buf && w_buf_valid) &&
-    !(io.write.bits.addr === RegNext(io.write.bits.addr) && RegNext(io.write.fire())))
+  io.write.ready := !block_write_req &&
+    !pipelined_writes.map(r => r.valid && r.bits.addr === io.write.bits.addr && io.write.bits.acc).reduce(_||_)
+
+  when (reset.asBool()) {
+    pipelined_writes.foreach(_.valid := false.B)
+  }
 
   // assert(!(io.read.req.valid && io.write.en && io.write.acc), "reading and accumulating simultaneously is not supported")
   assert(!(io.read.req.fire() && io.write.fire() && io.read.req.bits.addr === io.write.bits.addr), "reading from and writing to same address is not supported")
-  assert(!(io.read.req.fire() && w_buf_valid && waddr_buf === io.read.req.bits.addr), "reading from an address immediately after writing to it is not supported")
 }
diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala
index 2e172adf..7ceffcfe 100644
--- a/src/main/scala/gemmini/Configs.scala
+++ b/src/main/scala/gemmini/Configs.scala
@@ -5,7 +5,12 @@ import chisel3._
 import freechips.rocketchip.config.{Config, Parameters}
 import freechips.rocketchip.diplomacy.LazyModule
 import freechips.rocketchip.subsystem._
-import freechips.rocketchip.tile.{BuildRoCC, OpcodeSet}
+import freechips.rocketchip.tile.{BuildRoCC, OpcodeSet, XLen}
+import freechips.rocketchip.rocket._
+import freechips.rocketchip.tile._
+import freechips.rocketchip.system._
+import freechips.rocketchip.diplomacy._
+
 import gemmini.Arithmetic.SIntArithmetic
 import hardfloat._
 
@@ -162,8 +167,10 @@ object GemminiConfigs {
     acc_scale_args=Some(defaultConfig.acc_scale_args.get.copy(latency=4)),
     acc_singleported=true,
     acc_sub_banks=2,
+    mesh_output_delay = 2,
     ex_read_from_acc=false,
-    ex_write_to_spad=false
+    ex_write_to_spad=false,
+    hardcode_d_to_garbage_addr = true
   )
 
   val largeChipConfig = chipConfig.copy(sp_capacity=CapacityInKilobytes(128), acc_capacity=CapacityInKilobytes(64),
@@ -190,3 +197,75 @@ class DefaultGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
   )
   case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
+
+// This Gemmini config places an Int and an FP Gemmini side-by-side; both are built with
+// use_shared_ext_mem = true and their ext_mem_io ports are wired to a single SharedExtMem.
+class DualGemminiConfig extends Config((site, here, up) => {
+  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
+  case BuildRoCC => {
+    var int_gemmini: Gemmini[_,_,_] = null // assigned by int_fn; read inside fp_fn's InModuleBody
+    var fp_gemmini: Gemmini[_,_,_] = null // assigned by fp_fn
+    val int_fn = (p: Parameters) => {
+      implicit val q: Parameters = p
+      int_gemmini = LazyModule(new Gemmini(GemminiConfigs.chipConfig.copy(
+        opcodes = OpcodeSet.custom3,
+        use_shared_ext_mem = true,
+        clock_gate = true
+      )))
+      int_gemmini
+    }
+    val fp_fn = (p: Parameters) => {
+      implicit val q: Parameters = p
+      fp_gemmini = LazyModule(new Gemmini(GemminiFPConfigs.BF16DefaultConfig.copy(
+        opcodes = OpcodeSet.custom2,
+        sp_capacity=CapacityInKilobytes(64), acc_capacity=CapacityInKilobytes(32),
+        meshColumns = 8, meshRows = 8,
+        acc_singleported = true, acc_banks = 2, acc_sub_banks = 2,
+        use_shared_ext_mem = true,
+        ex_read_from_acc=false,
+        ex_write_to_spad=false,
+        hardcode_d_to_garbage_addr = true,
+        headerFileName = "gemmini_params_bf16.h",
+        acc_latency = 3,
+        dataflow = Dataflow.WS,
+        mesh_output_delay = 3,
+        clock_gate = true
+      )))
+      InModuleBody {
+        require(int_gemmini != null, "DualGemminiConfig: int_fn must have run before the shared memory is wired up")
+        require(int_gemmini.config.sp_banks == fp_gemmini.config.sp_banks)
+        require(int_gemmini.config.acc_banks == fp_gemmini.config.acc_banks)
+        require(int_gemmini.config.acc_sub_banks == fp_gemmini.config.acc_sub_banks)
+        require(int_gemmini.config.sp_singleported && fp_gemmini.config.sp_singleported)
+        require(int_gemmini.config.acc_singleported && fp_gemmini.config.acc_singleported)
+
+        require(int_gemmini.config.sp_bank_entries == fp_gemmini.config.sp_bank_entries)
+        require(int_gemmini.spad.module.spad_mems(0).mask_len == fp_gemmini.spad.module.spad_mems(0).mask_len)
+        require(int_gemmini.spad.module.spad_mems(0).mask_elem.getWidth == fp_gemmini.spad.module.spad_mems(0).mask_elem.getWidth)
+
+        println((int_gemmini.config.acc_bank_entries, fp_gemmini.config.acc_bank_entries))
+        println((int_gemmini.spad.module.acc_mems(0).mask_len, fp_gemmini.spad.module.acc_mems(0).mask_len))
+        println((int_gemmini.spad.module.acc_mems(0).mask_elem.getWidth, fp_gemmini.spad.module.acc_mems(0).mask_elem.getWidth))
+
+        require(int_gemmini.config.acc_bank_entries == fp_gemmini.config.acc_bank_entries / 2)
+        require(int_gemmini.spad.module.acc_mems(0).mask_len == fp_gemmini.spad.module.acc_mems(0).mask_len * 2)
+        require(int_gemmini.spad.module.acc_mems(0).mask_elem.getWidth == fp_gemmini.spad.module.acc_mems(0).mask_elem.getWidth)
+
+        val spad_mask_len = int_gemmini.spad.module.spad_mems(0).mask_len
+        val spad_data_len = int_gemmini.spad.module.spad_mems(0).mask_elem.getWidth
+        val acc_mask_len = int_gemmini.spad.module.acc_mems(0).mask_len
+        val acc_data_len = int_gemmini.spad.module.acc_mems(0).mask_elem.getWidth
+
+        val shared_mem = Module(new SharedExtMem( // sized from the Int Gemmini's parameters
+          int_gemmini.config.sp_banks, int_gemmini.config.acc_banks, int_gemmini.config.acc_sub_banks,
+          int_gemmini.config.sp_bank_entries, spad_mask_len, spad_data_len,
+          int_gemmini.config.acc_bank_entries / int_gemmini.config.acc_sub_banks, acc_mask_len, acc_data_len
+        ))
+        shared_mem.io.in(0) <> int_gemmini.module.ext_mem_io.get
+        shared_mem.io.in(1) <> fp_gemmini.module.ext_mem_io.get
+      }
+      fp_gemmini
+    }
+    up(BuildRoCC) ++ Seq(int_fn, fp_fn)
+  }
+})
diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala
index a54c2853..91a4dbd2 100644
--- a/src/main/scala/gemmini/ConfigsFP.scala
+++ b/src/main/scala/gemmini/ConfigsFP.scala
@@ -30,6 +30,7 @@ object GemminiFPConfigs {
     sp_banks = 4,
     sp_singleported = true,
     acc_banks = 1,
+    acc_latency = 2,
     acc_singleported = false,
     acc_sub_banks = -1,
     sp_capacity = CapacityInKilobytes(256),
@@ -45,7 +46,7 @@ object GemminiFPConfigs {
     use_tlb_register_filter = true,
     max_in_flight_mem_reqs = 16,
     use_dedicated_tl_port = false,
-
+    use_shared_ext_mem = false,
     inputType = Float(8, 24),
     spatialArrayOutputType = Float(8, 24),
     accType = Float(8, 24),
@@ -84,7 +85,7 @@ object GemminiFPConfigs {
                                                mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                                mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                               )
-  
+ 
   //FP16 Half Precision Configuration
   val FP16DefaultConfig = defaultFPConfig.copy(inputType = Float(5, 11), spatialArrayOutputType = Float(5, 11), accType = Float(8, 24),
                                                pe_latency = 2,
diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala
index f1de9486..3e74af93 100644
--- a/src/main/scala/gemmini/Controller.scala
+++ b/src/main/scala/gemmini/Controller.scala
@@ -9,6 +9,7 @@ import chisel3.util._
 import freechips.rocketchip.config._
 import freechips.rocketchip.diplomacy._
 import freechips.rocketchip.tile._
+import freechips.rocketchip.util.ClockGate
 import freechips.rocketchip.tilelink.TLIdentityNode
 import GemminiISA._
 import Util._
@@ -49,6 +50,9 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   import outer.config._
   import outer.spad
 
+  val ext_mem_io = if (use_shared_ext_mem) Some(IO(new ExtSpadMemIO(sp_banks, acc_banks, acc_sub_banks))) else None
+  ext_mem_io.foreach(_ <> outer.spad.module.io.ext_mem.get)
+
   val tagWidth = 32
 
   // Counters
@@ -74,6 +78,10 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
 
   spad.module.io.flush := tlb.io.exp.map(_.flush()).reduce(_ || _)
 
+  val clock_en_reg = RegInit(true.B)
+  val gated_clock = if (clock_gate) ClockGate(clock, clock_en_reg, "gemmini_clock_gate") else clock
+  outer.spad.module.clock := gated_clock
+
   /*
   //=========================================================================
   // Frontends: Incoming commands and ROB
@@ -113,10 +121,12 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   val unrolled_cmd = LoopUnroller(raw_risc_cmd, outer.config.meshRows * outer.config.tileRows)
   */
 
-  // Incoming commands and reservation station
-  val reservation_station = Module(new ReservationStation(outer.config, new RoCCCommand))
+  val reservation_station = withClock (gated_clock) { Module(new ReservationStation(outer.config, new RoCCCommand)) }
   counters.io.event_io.collect(reservation_station.io.counter)
 
+  when (io.cmd.valid && io.cmd.bits.inst.funct === CLKGATE_EN && !io.busy) {
+    clock_en_reg := io.cmd.bits.rs1(0)
+  }
   val raw_cmd = Queue(io.cmd)
 
   val max_lds = reservation_station_partial_entries
@@ -124,7 +134,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   val max_sts = reservation_station_partial_entries / 2
 
   // TODO replace 4,12,2 with parameters based on ROB size
-  val (conv_cmd, loop_conv_unroller_busy) = LoopConv(raw_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization,
+  val (conv_cmd, loop_conv_unroller_busy) = withClock (gated_clock) { LoopConv(raw_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization,
     meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries,
     inputType.getWidth, accType.getWidth, dma_maxbytes,
     new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits), new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t),
@@ -132,14 +142,14 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
     new ConfigExRs1(acc_scale_t_bits), new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t),
     new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t),
     new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t),
-    has_training_convs, has_max_pool)
+    has_training_convs, has_max_pool) }
 
-  val (loop_cmd, loop_matmul_unroller_busy) = LoopMatmul(conv_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization,
+  val (loop_cmd, loop_matmul_unroller_busy) = withClock (gated_clock) { LoopMatmul(conv_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization,
     meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries,
     inputType.getWidth, accType.getWidth, dma_maxbytes, new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t),
     new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t),
     new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t),
-    new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t))
+    new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t)) }
 
   val unrolled_cmd = Queue(loop_cmd)
   unrolled_cmd.ready := false.B
@@ -167,9 +177,9 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   //=========================================================================
   // Controllers
   //=========================================================================
-  val load_controller = Module(new LoadController(outer.config, coreMaxAddrBits, local_addr_t))
-  val store_controller = Module(new StoreController(outer.config, coreMaxAddrBits, local_addr_t))
-  val ex_controller = Module(new ExecuteController(xLen, tagWidth, outer.config))
+  val load_controller = withClock (gated_clock) { Module(new LoadController(outer.config, coreMaxAddrBits, local_addr_t)) }
+  val store_controller = withClock (gated_clock) { Module(new StoreController(outer.config, coreMaxAddrBits, local_addr_t)) }
+  val ex_controller = withClock (gated_clock) { Module(new ExecuteController(xLen, tagWidth, outer.config)) }
 
   counters.io.event_io.collect(load_controller.io.counter)
   counters.io.event_io.collect(store_controller.io.counter)
@@ -240,7 +250,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   ex_controller.io.acc.write <> spad.module.io.acc.write
 
   // Im2Col unit
-  val im2col = Module(new Im2Col(outer.config))
+  val im2col = withClock (gated_clock) { Module(new Im2Col(outer.config)) }
 
   // Wire up Im2col
   counters.io.event_io.collect(im2col.io.counter)
@@ -313,6 +323,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
 
   // Wire up global RoCC signals
   io.busy := raw_cmd.valid || loop_conv_unroller_busy || loop_matmul_unroller_busy || reservation_station.io.busy || spad.module.io.busy || unrolled_cmd.valid || loop_cmd.valid || conv_cmd.valid
+
   io.interrupt := tlb.io.exp.map(_.interrupt).reduce(_ || _)
 
   reservation_station.io.solitary_preload := ex_controller.io.solitary_preload
@@ -349,6 +360,8 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
 
     val is_flush = risc_funct === FLUSH_CMD
     val is_counter_op = risc_funct === COUNTER_OP
+    val is_clock_gate_en = risc_funct === CLKGATE_EN
+
     /*
     val is_load = (funct === LOAD_CMD) || (funct === CONFIG_CMD && config_cmd_type === CONFIG_LOAD)
     val is_store = (funct === STORE_CMD) || (funct === CONFIG_CMD && config_cmd_type === CONFIG_STORE)
@@ -369,6 +382,10 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
       counters.io.in <> unrolled_cmd
     }
 
+    .elsewhen (is_clock_gate_en) {
+      unrolled_cmd.ready := true.B
+    }
+
     .otherwise {
       reservation_station.io.alloc.valid := true.B
 
diff --git a/src/main/scala/gemmini/DSEConfigs.scala b/src/main/scala/gemmini/DSEConfigs.scala
index 37fc70f4..0d4681b5 100644
--- a/src/main/scala/gemmini/DSEConfigs.scala
+++ b/src/main/scala/gemmini/DSEConfigs.scala
@@ -27,7 +27,7 @@ object DSEBaseConfig {
     sp_banks = 4, // TODO support one-bank designs
     acc_banks = 1,
     acc_singleported = false,
-    acc_sub_banks = -1,
+    acc_latency = 2,
     sp_capacity = CapacityInKilobytes(64),
     sp_singleported = false,
     shifter_banks = 1, // TODO add separate parameters for left and up shifter banks
@@ -59,6 +59,7 @@ object DSEBaseConfig {
     acc_read_full_width = true,
     acc_read_small_width = true,
     use_dedicated_tl_port = false,
+    use_shared_ext_mem = true,
     pe_latency = 0,
 
     ex_read_from_spad = true,
@@ -79,6 +80,8 @@ object DSEBaseConfig {
     has_nonlinear_activations = true,
 
     num_counter = 8,
+
+    clock_gate = false,
   )
 }
 
diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala
index 041bfcd0..beb46c71 100644
--- a/src/main/scala/gemmini/GemminiConfigs.scala
+++ b/src/main/scala/gemmini/GemminiConfigs.scala
@@ -15,12 +15,12 @@ case class ScaleArguments[T <: Data, U <: Data](scale_func: (T, U) => T, latency
                                                 identity: String="0", c_str: String="ROUNDING_RIGHT_SHIFT(x, scale)")
 
 case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
+                                                                             opcodes: OpcodeSet = OpcodeSet.custom3,
+
                                                                              inputType: T,
                                                                              spatialArrayOutputType: T,
                                                                              accType: T,
 
-                                                                             opcodes: OpcodeSet = OpcodeSet.custom3,
-
                                                                              dataflow: Dataflow.Value = Dataflow.BOTH,
 
                                                                              tileRows: Int = 1,
@@ -44,6 +44,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
                                                                              acc_singleported: Boolean = false,
                                                                              acc_sub_banks: Int = -1,
                                                                              acc_capacity: GemminiMemCapacity = CapacityInKilobytes(64),
+                                                                             acc_latency: Int = 2,
 
                                                                              dma_maxbytes: Int = 64, // TODO get this from cacheblockbytes
                                                                              dma_buswidth: Int = 128, // TODO get this from SystemBusKey
@@ -85,6 +86,9 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
 
                                                                              use_firesim_simulation_counters: Boolean = false,
 
+                                                                             use_shared_ext_mem: Boolean = false,
+                                                                             clock_gate: Boolean = false,
+
                                                                              headerFileName: String = "gemmini_params.h"
                                                        ) {
   val sp_width = meshColumns * tileColumns * inputType.getWidth
@@ -261,7 +265,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
           (dt.expWidth, dt.sigWidth) match {
             case (8, 24) => (scala.Float.MinValue.toString, scala.Float.MaxValue.toString)
             case (11, 53) => (scala.Double.MinValue.toString, scala.Double.MaxValue.toString)
-            case _ => (((Range(-1,-(dt.sigWidth),-1).map(-Math.pow(2, _)).foldLeft(-1.0)(_ + _)) * Math.pow(2, Math.pow(2, dt.expWidth - 1) - 1)).toString, ((Range(-1,-(dt.sigWidth),-1).map(Math.pow(2, _)).foldLeft(1.0)(_ + _)) * Math.pow(2, Math.pow(2, dt.expWidth - 1) - 1)).toString)
+            case (e, s) => (((Range(-1,-(s),-1).map(-Math.pow(2, _)).foldLeft(-1.0)(_ + _)) * Math.pow(2, Math.pow(2, e - 1) - 1)).toString, ((Range(-1,-(s),-1).map(Math.pow(2, _)).foldLeft(1.0)(_ + _)) * Math.pow(2, Math.pow(2, e - 1) - 1)).toString)
           }
         case dt => ("0", BigInt(2).pow(dt.getWidth).-(1).toString)
         // case _ => throw new IllegalArgumentException(s"Data type $dataType is unknown")
@@ -275,7 +279,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
           (dt.expWidth, dt.sigWidth) match {
             case (8, 24) => "float"
             case (11, 53) => "double"
-            case _ => s"uint" + (Math.pow(2, Math.ceil(Math.log(dt.expWidth + dt.sigWidth)/Math.log(2.0)))).toInt.toString + s"_t"
+            case (e, s) => s"uint" + (Math.pow(2, Math.ceil(Math.log(e + s)/Math.log(2.0)))).toInt.toString + s"_t"
           }
         case dt => s"uint${dt.getWidth}_t"
       }
diff --git a/src/main/scala/gemmini/GemminiISA.scala b/src/main/scala/gemmini/GemminiISA.scala
index 554bcdeb..c85b6816 100644
--- a/src/main/scala/gemmini/GemminiISA.scala
+++ b/src/main/scala/gemmini/GemminiISA.scala
@@ -32,6 +32,8 @@ object GemminiISA {
   val LOOP_CONV_WS_CONFIG_5 = 20.U // *weights | *output
   val LOOP_CONV_WS_CONFIG_6 = 21.U // *bias, *input
 
+  val CLKGATE_EN = 22.U
+
   // rs1[2:0] values
   val CONFIG_EX = 0.U
   val CONFIG_LOAD = 1.U
diff --git a/src/main/scala/gemmini/LocalAddr.scala b/src/main/scala/gemmini/LocalAddr.scala
index b003fd7b..cce6bcae 100644
--- a/src/main/scala/gemmini/LocalAddr.scala
+++ b/src/main/scala/gemmini/LocalAddr.scala
@@ -81,3 +81,13 @@ class LocalAddr(sp_banks: Int, sp_bank_entries: Int, acc_banks: Int, acc_bank_en
 
   override def cloneType: LocalAddr.this.type = new LocalAddr(sp_banks, sp_bank_entries, acc_banks, acc_bank_entries).asInstanceOf[this.type]
 }
+
+object LocalAddr {
+  def cast_to_local_addr[T <: Data](local_addr_t: LocalAddr, t: T): LocalAddr = {
+    // Equivalent to "t.asTypeOf(local_addr_t)", except that the result's "garbage" field is additionally driven to 0
+    // whenever "garbage_bit" has a non-zero width. Zeroing these unused bits may help reduce multiplier/adder bitwidths
+    val result = WireInit(t.asTypeOf(local_addr_t)) // reinterpret the bits of "t" using the layout of "local_addr_t"
+    if (result.garbage_bit.getWidth > 0) result.garbage := 0.U // Scala-level "if": decided at elaboration, not a hardware mux
+    result
+  }
+}
diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala
index 47cd5a39..1f27f3ff 100644
--- a/src/main/scala/gemmini/LoopConv.scala
+++ b/src/main/scala/gemmini/LoopConv.scala
@@ -6,6 +6,7 @@ import chisel3.experimental._
 import freechips.rocketchip.tile.RoCCCommand
 import freechips.rocketchip.config.Parameters
 import GemminiISA._
+import LocalAddr.cast_to_local_addr
 import Util._
 
 class LoopConvOuterBounds(val large_iterator_bitwidth: Int, val small_iterator_bitwidth: Int, val tiny_iterator_bitwidth: Int) extends Bundle {
@@ -172,7 +173,7 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi
     mvin_cmd_rs2 := DontCare
     mvin_cmd_rs2.num_rows := o.I.asUInt()
     mvin_cmd_rs2.num_cols := o.J.asUInt()
-    mvin_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvin_cmd_rs2.local_addr)
+    mvin_cmd_rs2.local_addr := cast_to_local_addr(mvin_cmd_rs2.local_addr, o.spad_addr)
     io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt()
   }
 
@@ -343,7 +344,7 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
     mvin_cmd_rs2 := DontCare
     mvin_cmd_rs2.num_rows := (o.I >> req.downsample).asUInt()
     mvin_cmd_rs2.num_cols := o.K.asUInt()
-    mvin_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvin_cmd_rs2.local_addr)
+    mvin_cmd_rs2.local_addr := cast_to_local_addr(mvin_cmd_rs2.local_addr, o.spad_addr)
     io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt()
   }
 
@@ -388,7 +389,7 @@ class LoopConvLdWeightReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth:
   val outer_bounds = new LoopConvOuterBounds(large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth)
   val inner_bounds = new LoopConvInnerBounds(large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth)
   val derived_params = new LoopConvDerivedParams(large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth)
-  val addr_end = UInt(log2Up(max_addr).W)
+  val addr_end = UInt(log2Up(max_addr+1).W)
   val dram_addr = UInt(coreMaxAddrBits.W)
   val trans_weight_1203 = Bool()
   val trans_weight_0132 = Bool()
@@ -513,7 +514,7 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit
     mvin_cmd_rs2 := DontCare
     mvin_cmd_rs2.num_rows := o.K
     mvin_cmd_rs2.num_cols := o.J
-    mvin_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvin_cmd_rs2.local_addr)
+    mvin_cmd_rs2.local_addr := cast_to_local_addr(mvin_cmd_rs2.local_addr, o.spad_addr)
     io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt()
   }
 
@@ -556,7 +557,7 @@ class LoopConvExecuteReq(val large_iterator_bitwidth: Int, val small_iterator_bi
   val inner_bounds = new LoopConvInnerBounds(large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth)
   val derived_params = new LoopConvDerivedParams(large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth)
   val a_addr_start = UInt(log2Up(max_addr).W)
-  val b_addr_end = UInt(log2Up(max_addr).W)
+  val b_addr_end = UInt(log2Up(max_addr+1).W)
   val c_addr_start = UInt(log2Up(max_acc_addr).W)
   val wrot180 = Bool()
   val downsample = Bool()
@@ -719,13 +720,13 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
     pre_cmd_rs1 := DontCare
     pre_cmd_rs1.num_rows := o.K.asUInt()
     pre_cmd_rs1.num_cols := o.J.asUInt()
-    pre_cmd_rs1.local_addr := o.pre_addr.asTypeOf(pre_cmd_rs1.local_addr)
+    pre_cmd_rs1.local_addr := cast_to_local_addr(pre_cmd_rs1.local_addr, o.pre_addr)
 
     val pre_cmd_rs2 = Wire(preload_rs2_t.cloneType)
     pre_cmd_rs2 := DontCare
     pre_cmd_rs2.num_rows := o.I.asUInt()
     pre_cmd_rs2.num_cols := o.J.asUInt()
-    pre_cmd_rs2.local_addr := o.c_addr.asTypeOf(pre_cmd_rs2.local_addr)
+    pre_cmd_rs2.local_addr := cast_to_local_addr(pre_cmd_rs2.local_addr, o.c_addr)
 
     io.cmd.bits.rs1 := pre_cmd_rs1.asUInt()
     io.cmd.bits.rs2 := pre_cmd_rs2.asUInt()
@@ -735,13 +736,13 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
     comp_cmd_rs1 := DontCare
     comp_cmd_rs1.num_rows := o.I.asUInt()
     comp_cmd_rs1.num_cols := o.K.asUInt()
-    comp_cmd_rs1.local_addr := o.a_addr.asTypeOf(comp_cmd_rs1.local_addr)
+    comp_cmd_rs1.local_addr := cast_to_local_addr(comp_cmd_rs1.local_addr, o.a_addr)
 
     val comp_cmd_rs2 = Wire(compute_rs2_t.cloneType)
     comp_cmd_rs2 := DontCare
     comp_cmd_rs2.num_rows := o.I.asUInt()
     comp_cmd_rs2.num_cols := o.J.asUInt()
-    comp_cmd_rs2.local_addr := GARBAGE_ADDR.asTypeOf(comp_cmd_rs2.local_addr)
+    comp_cmd_rs2.local_addr := cast_to_local_addr(comp_cmd_rs2.local_addr, GARBAGE_ADDR)
 
     io.cmd.bits.rs1 := comp_cmd_rs1.asUInt()
     io.cmd.bits.rs2 := comp_cmd_rs2.asUInt()
@@ -967,7 +968,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
       val pool_mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType)
       pool_mvout_cmd_rs2 := DontCare
       pool_mvout_cmd_rs2.num_cols := o.channels
-      pool_mvout_cmd_rs2.local_addr := o.pool_spad_addr.asTypeOf(pool_mvout_cmd_rs2.local_addr)
+      pool_mvout_cmd_rs2.local_addr := cast_to_local_addr(pool_mvout_cmd_rs2.local_addr, o.pool_spad_addr)
 
       io.cmd.bits.rs1 := o.pool_dram_addr
       io.cmd.bits.rs2 := pool_mvout_cmd_rs2.asUInt()
@@ -976,7 +977,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
       mvout_cmd_rs2 := DontCare
       mvout_cmd_rs2.num_rows := o.I.asUInt()
       mvout_cmd_rs2.num_cols := o.J.asUInt()
-      mvout_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvout_cmd_rs2.local_addr)
+      mvout_cmd_rs2.local_addr := cast_to_local_addr(mvout_cmd_rs2.local_addr, o.spad_addr)
 
       io.cmd.bits.rs1 := o.dram_addr
       io.cmd.bits.rs2 := mvout_cmd_rs2.asUInt()
@@ -1067,7 +1068,7 @@ class LoopConvState(val block_size: Int, val large_iterator_bitwidth: Int, val s
   def all_completed(dummy: Int=0): Bool = ld_bias_completed && ld_input_completed && ld_weights_completed && ex_completed && st_completed
 
   val a_addr_start = UInt(log2Up(max_addr).W)
-  val b_addr_end = UInt(log2Up(max_addr).W)
+  val b_addr_end = UInt(log2Up(max_addr+1).W)
 
   def derived_params(dummy: Int=0): LoopConvDerivedParams = {
     import outer_bounds.{stride, kernel_dilation}
@@ -1453,7 +1454,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I
     loops.zipWithIndex.foreach { case (l, i) =>
       l.reset()
       l.a_addr_start := (i * (max_addr / concurrent_loops)).U
-      l.b_addr_end := ((i+1) * (max_addr / concurrent_loops) - block_size).U
+      l.b_addr_end := ((i+1) * (max_addr / concurrent_loops)).U
     }
   }
 }
diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala
index ea1c3ed6..791b43d5 100644
--- a/src/main/scala/gemmini/LoopMatmul.scala
+++ b/src/main/scala/gemmini/LoopMatmul.scala
@@ -6,6 +6,7 @@ import chisel3.experimental._
 import freechips.rocketchip.tile.RoCCCommand
 import freechips.rocketchip.config.Parameters
 import GemminiISA._
+import LocalAddr.cast_to_local_addr
 import Util._
 
 // LdA
@@ -75,7 +76,7 @@ class LoopMatmulLdA(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   mvin_cmd_rs2 := DontCare
   mvin_cmd_rs2.num_rows := rows.asUInt()
   mvin_cmd_rs2.num_cols := cols.asUInt()
-  mvin_cmd_rs2.local_addr := sp_addr.asTypeOf(mvin_cmd_rs2.local_addr)
+  mvin_cmd_rs2.local_addr := cast_to_local_addr(mvin_cmd_rs2.local_addr, sp_addr)
   mvin_cmd.rs2 := mvin_cmd_rs2.asUInt()
 
   io.req.ready := state === idle
@@ -122,7 +123,7 @@ class LoopMatmulLdBReq(val block_size: Int, val coreMaxAddrBits: Int, val iterat
   val dram_addr = UInt(coreMaxAddrBits.W)
   val dram_stride = UInt(coreMaxAddrBits.W)
   val transpose = Bool()
-  val addr_end = UInt(log2Up(max_addr).W)
+  val addr_end = UInt(log2Up(max_addr+1).W)
   val loop_id = UInt(log2Up(concurrent_loops).W)
 }
 
@@ -182,7 +183,7 @@ class LoopMatmulLdB(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   mvin_cmd_rs2 := DontCare
   mvin_cmd_rs2.num_rows := rows.asUInt()
   mvin_cmd_rs2.num_cols := cols.asUInt()
-  mvin_cmd_rs2.local_addr := sp_addr.asTypeOf(mvin_cmd_rs2.local_addr)
+  mvin_cmd_rs2.local_addr := cast_to_local_addr(mvin_cmd_rs2.local_addr, sp_addr)
   mvin_cmd.rs2 := mvin_cmd_rs2.asUInt()
 
   io.req.ready := state === idle
@@ -278,7 +279,7 @@ class LoopMatmulLdD(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   mvin_cmd_rs2 := DontCare
   mvin_cmd_rs2.num_rows := rows.asUInt()
   mvin_cmd_rs2.num_cols := cols.asUInt()
-  mvin_cmd_rs2.local_addr := sp_addr.asTypeOf(mvin_cmd_rs2.local_addr)
+  mvin_cmd_rs2.local_addr := cast_to_local_addr(mvin_cmd_rs2.local_addr, sp_addr)
   mvin_cmd.rs2 := mvin_cmd_rs2.asUInt()
 
   io.req.ready := state === idle
@@ -325,7 +326,7 @@ class LoopMatmulExecuteReq(val block_size: Int, val coreMaxAddrBits: Int, val it
   val b_tranpose = Bool()
   val accumulate = Bool()
   val a_addr_start = UInt(log2Up(max_addr).W)
-  val b_addr_end = UInt(log2Up(max_addr).W)
+  val b_addr_end = UInt(log2Up(max_addr+1).W)
   val c_addr_start = UInt(log2Up(max_acc_addr).W)
   val loop_id = UInt(log2Up(concurrent_loops).W)
 }
@@ -405,13 +406,13 @@ class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth
   pre_cmd_rs1 := DontCare
   pre_cmd_rs1.num_rows := b_rows.asUInt()
   pre_cmd_rs1.num_cols := b_cols.asUInt()
-  pre_cmd_rs1.local_addr := pre_addr.asTypeOf(pre_cmd_rs1.local_addr)
+  pre_cmd_rs1.local_addr := cast_to_local_addr(pre_cmd_rs1.local_addr, pre_addr)
 
   val pre_cmd_rs2 = Wire(preload_rs2_t.cloneType)
   pre_cmd_rs2 := DontCare
   pre_cmd_rs2.num_rows := c_rows.asUInt()
   pre_cmd_rs2.num_cols := c_cols.asUInt()
-  pre_cmd_rs2.local_addr := out_addr.asTypeOf(pre_cmd_rs2.local_addr)
+  pre_cmd_rs2.local_addr := cast_to_local_addr(pre_cmd_rs2.local_addr, out_addr)
 
   pre_cmd.rs1 := pre_cmd_rs1.asUInt()
   pre_cmd.rs2 := pre_cmd_rs2.asUInt()
@@ -424,13 +425,13 @@ class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth
   comp_cmd_rs1 := DontCare
   comp_cmd_rs1.num_rows := a_rows.asUInt()
   comp_cmd_rs1.num_cols := a_cols.asUInt()
-  comp_cmd_rs1.local_addr := a_addr.asTypeOf(comp_cmd_rs1.local_addr)
+  comp_cmd_rs1.local_addr := cast_to_local_addr(comp_cmd_rs1.local_addr, a_addr)
 
   val comp_cmd_rs2 = Wire(compute_rs2_t.cloneType)
   comp_cmd_rs2 := DontCare
   comp_cmd_rs2.num_rows := block_size.U
   comp_cmd_rs2.num_cols := block_size.U
-  comp_cmd_rs2.local_addr := GARBAGE_ADDR.asTypeOf(comp_cmd_rs2.local_addr)
+  comp_cmd_rs2.local_addr := cast_to_local_addr(comp_cmd_rs2.local_addr, GARBAGE_ADDR)
 
   comp_cmd.rs1 := comp_cmd_rs1.asUInt()
   comp_cmd.rs2 := comp_cmd_rs2.asUInt()
@@ -545,7 +546,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   mvout_cmd_rs2 := DontCare
   mvout_cmd_rs2.num_rows := rows.asUInt()
   mvout_cmd_rs2.num_cols := cols.asUInt()
-  mvout_cmd_rs2.local_addr := sp_addr.asTypeOf(mvout_cmd_rs2.local_addr)
+  mvout_cmd_rs2.local_addr := cast_to_local_addr(mvout_cmd_rs2.local_addr, sp_addr)
   mvout_cmd.rs2 := mvout_cmd_rs2.asUInt()
 
   io.req.ready := state === idle
@@ -636,7 +637,7 @@ class LoopMatmulState(val iterator_bitwidth: Int, val coreMaxAddrBits: Int, val
   def all_completed(dummy: Int=0): Bool = lda_completed && ldb_completed && ldd_completed && ex_completed && st_completed
 
   val a_addr_start = UInt(log2Up(max_addr).W)
-  val b_addr_end = UInt(log2Up(max_addr).W)
+  val b_addr_end = UInt(log2Up(max_addr+1).W)
 
   def reset(): Unit = {
     configured := false.B
@@ -958,7 +959,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds:
     loops.zipWithIndex.foreach { case (l, i) =>
       l.reset()
       l.a_addr_start := (i * (max_addr / concurrent_loops)).U
-      l.b_addr_end := ((i+1) * (max_addr / concurrent_loops) - block_size).U
+      l.b_addr_end := ((i+1) * (max_addr / concurrent_loops)).U
     }
   }
 }
diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala
index e3289b7f..0d76758d 100644
--- a/src/main/scala/gemmini/Scratchpad.scala
+++ b/src/main/scala/gemmini/Scratchpad.scala
@@ -6,12 +6,11 @@ import freechips.rocketchip.config.Parameters
 import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp}
 import freechips.rocketchip.rocket._
 import freechips.rocketchip.tile._
-import freechips.rocketchip.tilelink.{TLIdentityNode, TLXbar}
+import freechips.rocketchip.tilelink.{TLIdentityNode, TLXbar, TLBuffer}
 
 import Util._
 
-class ScratchpadMemReadRequest[U <: Data](local_addr_t: LocalAddr, scale_t_bits: Int)
-                              (implicit p: Parameters) extends CoreBundle {
+class ScratchpadMemReadRequest[U <: Data](local_addr_t: LocalAddr, scale_t_bits: Int)(implicit p: Parameters) extends CoreBundle {
   val vaddr = UInt(coreMaxAddrBits.W)
   val laddr = local_addr_t.cloneType
 
@@ -57,15 +56,13 @@ class ScratchpadMemReadResponse extends Bundle {
   val cmd_id = UInt(8.W) // TODO don't use a magic number here
 }
 
-class ScratchpadReadMemIO[U <: Data](local_addr_t: LocalAddr, scale_t_bits: Int)
-                         (implicit p: Parameters) extends CoreBundle {
+class ScratchpadReadMemIO[U <: Data](local_addr_t: LocalAddr, scale_t_bits: Int)(implicit p: Parameters) extends CoreBundle {
   val req = Decoupled(new ScratchpadMemReadRequest(local_addr_t, scale_t_bits))
   val resp = Flipped(Valid(new ScratchpadMemReadResponse))
 
   override def cloneType: this.type = new ScratchpadReadMemIO(local_addr_t, scale_t_bits).asInstanceOf[this.type]
 }
 
-// class ScratchpadWriteMemIO(val nBanks: Int, val nRows: Int, val acc_rows: Int)
 class ScratchpadWriteMemIO(local_addr_t: LocalAddr, scale_t_bits: Int)
                          (implicit p: Parameters) extends CoreBundle {
   val req = Decoupled(new ScratchpadMemWriteRequest(local_addr_t, scale_t_bits))
@@ -96,7 +93,7 @@ class ScratchpadWriteIO(val n: Int, val w: Int, val mask_len: Int) extends Bundl
   val data = Output(UInt(w.W))
 }
 
-class ScratchpadBank(n: Int, w: Int, aligned_to: Int, single_ported: Boolean) extends Module {
+class ScratchpadBank(n: Int, w: Int, aligned_to: Int, single_ported: Boolean, use_shared_ext_mem: Boolean) extends Module {
   // This is essentially a pipelined SRAM with the ability to stall pipeline stages
 
   require(w % aligned_to == 0 || w < aligned_to)
@@ -106,27 +103,50 @@ class ScratchpadBank(n: Int, w: Int, aligned_to: Int, single_ported: Boolean) ex
   val io = IO(new Bundle {
     val read = Flipped(new ScratchpadReadIO(n, w))
     val write = Flipped(new ScratchpadWriteIO(n, w, mask_len))
+    val ext_mem = if (use_shared_ext_mem) Some(new ExtMemIO) else None
   })
 
-  val mem = SyncReadMem(n, Vec(mask_len, mask_elem))
+  val (read, write) = if (use_shared_ext_mem) {
+    def read(addr: UInt, ren: Bool): Data = {
+      io.ext_mem.get.read_en := ren
+      io.ext_mem.get.read_addr := addr
+      io.ext_mem.get.read_data
+    }
+    io.ext_mem.get.write_en := false.B
+    io.ext_mem.get.write_addr := DontCare
+    io.ext_mem.get.write_data := DontCare
+    io.ext_mem.get.write_mask := DontCare
+    def write(addr: UInt, wdata: Vec[UInt], wmask: Vec[Bool]) = {
+      io.ext_mem.get.write_en := true.B
+      io.ext_mem.get.write_addr := addr
+      io.ext_mem.get.write_data := wdata.asUInt
+      io.ext_mem.get.write_mask := wmask.asUInt
+    }
+    (read _, write _)
+  } else {
+    val mem = SyncReadMem(n, Vec(mask_len, mask_elem))
+    def read(addr: UInt, ren: Bool): Data = mem.read(addr, ren)
+    def write(addr: UInt, wdata: Vec[UInt], wmask: Vec[Bool]) = mem.write(addr, wdata, wmask)
+    (read _, write _)
+  }
 
   // When the scratchpad is single-ported, the writes take precedence
   val singleport_busy_with_write = single_ported.B && io.write.en
 
   when (io.write.en) {
     if (aligned_to >= w)
-      mem.write(io.write.addr, io.write.data.asTypeOf(Vec(mask_len, mask_elem)))
+      write(io.write.addr, io.write.data.asTypeOf(Vec(mask_len, mask_elem)), VecInit((~(0.U(mask_len.W))).asBools))
     else
-      mem.write(io.write.addr, io.write.data.asTypeOf(Vec(mask_len, mask_elem)), io.write.mask)
+      write(io.write.addr, io.write.data.asTypeOf(Vec(mask_len, mask_elem)), io.write.mask)
   }
 
   val raddr = io.read.req.bits.addr
   val ren = io.read.req.fire()
   val rdata = if (single_ported) {
     assert(!(ren && io.write.en))
-    mem.read(raddr, ren && !io.write.en).asUInt()
+    read(raddr, ren && !io.write.en).asUInt()
   } else {
-    mem.read(raddr, ren).asUInt()
+    read(raddr, ren).asUInt()
   }
 
   val fromDMA = io.read.req.bits.fromDMA
@@ -143,6 +163,7 @@ class ScratchpadBank(n: Int, w: Int, aligned_to: Int, single_ported: Boolean) ex
   io.read.resp <> q.io.deq
 }
 
+
 class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V])
     (implicit p: Parameters, ev: Arithmetic[T]) extends LazyModule {
 
@@ -171,9 +192,9 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
   // id_node :=* reader.node
   // id_node :=* writer.node
 
-  xbar_node := reader.node // TODO
-  xbar_node := writer.node
-  id_node := xbar_node
+  xbar_node := TLBuffer() := reader.node // TODO
+  xbar_node := TLBuffer() := writer.node
+  id_node := TLBuffer() := xbar_node
 
   lazy val module = new LazyModuleImp(this) with HasCoreParameters {
 
@@ -204,6 +225,12 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
         ))))
       }
 
+      val ext_mem = if (use_shared_ext_mem) {
+        Some(new ExtSpadMemIO(sp_banks, acc_banks, acc_sub_banks))
+      } else {
+        None
+      }
+
       // TLB ports
       val tlb = Vec(2, new FrontendTLBIO)
 
@@ -368,12 +395,19 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
 
     io.busy := writer.module.io.busy || reader.module.io.busy || write_issue_q.io.deq.valid || write_scale_q.io.deq.valid || write_dispatch_q.valid
 
-    {
-      val banks = Seq.fill(sp_banks) { Module(new ScratchpadBank(sp_bank_entries, spad_w, aligned_to, config.sp_singleported)) }
+    val spad_mems = {
+      val banks = Seq.fill(sp_banks) { Module(new ScratchpadBank(
+        sp_bank_entries, spad_w,
+        aligned_to, config.sp_singleported,
+        use_shared_ext_mem
+      )) }
       val bank_ios = VecInit(banks.map(_.io))
-
       // Reading from the SRAM banks
       bank_ios.zipWithIndex.foreach { case (bio, i) =>
+        if (use_shared_ext_mem) {
+          io.ext_mem.get.spad(i) <> bio.ext_mem.get
+        }
+
         val ex_read_req = io.srams.read(i).req
         val exread = ex_read_req.valid
 
@@ -414,7 +448,6 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
         val dma_read_pipe = Pipeline(dma_read_resp, spad_read_delay)
         val ex_read_pipe = Pipeline(ex_read_resp, spad_read_delay)
 
-
         bio.read.resp.ready := Mux(bio.read.resp.bits.fromDMA, dma_read_resp.ready, ex_read_resp.ready)
 
         dma_read_pipe.ready := writer.module.io.req.ready &&
@@ -472,6 +505,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
           bio.write.mask := DontCare
         }
       }
+      banks
     }
 
     val acc_row_t = Vec(meshColumns, Vec(tileColumns, accType))
@@ -513,11 +547,14 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
       }
     }
 
-    {
+    val acc_adders = Module(new AccPipeShared(acc_latency-1, acc_row_t, acc_banks))
 
+    val acc_mems = {
       val banks = Seq.fill(acc_banks) { Module(new AccumulatorMem(
         acc_bank_entries, acc_row_t, acc_scale_func, acc_scale_t.asInstanceOf[V],
-        acc_singleported, acc_sub_banks
+        acc_singleported, acc_sub_banks,
+        use_shared_ext_mem,
+        acc_latency, accType,
       )) }
       val bank_ios = VecInit(banks.map(_.io))
 
@@ -526,6 +563,15 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
 
       // Reading from the Accumulator banks
       bank_ios.zipWithIndex.foreach { case (bio, i) =>
+        if (use_shared_ext_mem) {
+          io.ext_mem.get.acc(i) <> bio.ext_mem.get
+        }
+
+        acc_adders.io.in_sel(i) := bio.adder.valid
+        acc_adders.io.ina(i) := bio.adder.op1
+        acc_adders.io.inb(i) := bio.adder.op2
+        bio.adder.sum := acc_adders.io.out
+
         val ex_read_req = io.acc.read_req(i)
         val exread = ex_read_req.valid
 
@@ -677,6 +723,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
           bio.write.bits.mask := DontCare
         }
       }
+      banks
     }
 
     // Counter connection
diff --git a/src/main/scala/gemmini/SharedExtMem.scala b/src/main/scala/gemmini/SharedExtMem.scala
new file mode 100644
index 00000000..9d0e1802
--- /dev/null
+++ b/src/main/scala/gemmini/SharedExtMem.scala
@@ -0,0 +1,80 @@
+package gemmini
+
+import chisel3._
+import chisel3.util._
+
+import Util._
+
+
+class ExtMemIO extends Bundle { // Read/write port used to reach a memory that is instantiated outside of the requester
+  val read_en = Output(Bool()) // requester asserts this to issue a read
+  val read_addr = Output(UInt()) // no explicit width is given for any of the UInt fields in this bundle
+  val read_data = Input(UInt()) // the only Input: data coming back from the memory
+  // Write port (every field below is driven by the requester)
+  val write_en = Output(Bool())
+  val write_addr = Output(UInt())
+  val write_data = Output(UInt())
+  val write_mask = Output(UInt())
+}
+
+class ExtSpadMemIO(sp_banks: Int, acc_banks: Int, acc_sub_banks: Int) extends Bundle { // groups the ExtMemIO ports of all banks
+  val spad = Vec(sp_banks, new ExtMemIO) // one port per scratchpad bank
+  val acc = Vec(acc_banks, Vec(acc_sub_banks, new ExtMemIO)) // one port per accumulator sub-bank, indexed as acc(bank)(sub_bank)
+  override def cloneType: this.type = new ExtSpadMemIO(sp_banks, acc_banks, acc_sub_banks).asInstanceOf[this.type] // re-create with the same constructor arguments
+}
+
+
+class SharedSyncReadMem(nSharers: Int, depth: Int, mask_len: Int, data_len: Int) extends Module { // a single SyncReadMem reachable through nSharers ExtMemIO ports
+  val io = IO(new Bundle {
+    val in = Vec(nSharers, Flipped(new ExtMemIO())) // Flipped: this module receives the requests and drives read_data
+  })
+  val mem = SyncReadMem(depth, Vec(mask_len, UInt(data_len.W))) // "depth" rows, each made of mask_len elements of data_len bits
+  val wens = io.in.map(_.write_en)
+  val wen = wens.reduce(_||_) // a write is performed if any sharer requests one
+  val waddr = Mux1H(wens, io.in.map(_.write_addr)) // Mux1H relies on at most one write_en being high (checked by the assert below)
+  val wmask = Mux1H(wens, io.in.map(_.write_mask))
+  val wdata = Mux1H(wens, io.in.map(_.write_data))
+  assert(PopCount(wens) <= 1.U) // two sharers must never write in the same cycle
+  val rens = io.in.map(_.read_en)
+  assert(PopCount(rens) <= 1.U) // two sharers must never read in the same cycle
+  val ren = rens.reduce(_||_)
+  val raddr = Mux1H(rens, io.in.map(_.read_addr)) // address of the (single) sharer that is reading
+  val rdata = mem.read(raddr, ren && !wen) // the read enable is forced low in any cycle in which a write is performed
+  io.in.foreach(_.read_data := rdata.asUInt) // the same read data is broadcast to every sharer
+  when (wen) {
+    mem.write(waddr, wdata.asTypeOf(Vec(mask_len, UInt(data_len.W))), wmask.asTypeOf(Vec(mask_len, Bool()))) // flat UInts reshaped into per-element data and mask bits
+  }
+
+}
+
+class SharedExtMem( // instantiates one SharedSyncReadMem per scratchpad bank and per accumulator sub-bank, shared by two requesters
+  sp_banks: Int, acc_banks: Int, acc_sub_banks: Int, // number of memories to create
+  sp_depth: Int, sp_mask_len: Int, sp_data_len: Int, // geometry of every scratchpad memory
+  acc_depth: Int, acc_mask_len: Int, acc_data_len: Int // geometry of every accumulator memory
+) extends Module {
+  val nSharers = 2 // fixed: the accumulator wiring below handles sharer 0 and sharer 1 explicitly
+  val io = IO(new Bundle {
+    val in = Vec(nSharers, Flipped(new ExtSpadMemIO(sp_banks, acc_banks, acc_sub_banks))) // one full set of bank ports per sharer
+  })
+  for (i <- 0 until sp_banks) {
+    val spad_mem = Module(new SharedSyncReadMem(nSharers, sp_depth, sp_mask_len, sp_data_len))
+    for (w <- 0 until nSharers) {
+      spad_mem.io.in(w) <> io.in(w).spad(i) // scratchpad ports of both sharers are connected unmodified
+    }
+  }
+  for (i <- 0 until acc_banks) {
+    for (s <- 0 until acc_sub_banks) {
+      val acc_mem = Module(new SharedSyncReadMem(nSharers, acc_depth, acc_mask_len, acc_data_len))
+      // Sharer 0 is connected unmodified
+      acc_mem.io.in(0) <> io.in(0).acc(i)(s)
+      // The FP gemmini expects a taller, skinnier accumulator mem
+      acc_mem.io.in(1) <> io.in(1).acc(i)(s) // bulk connection; the ":=" assignments below override individual fields of it
+      acc_mem.io.in(1).read_addr := io.in(1).acc(i)(s).read_addr >> 1 // two consecutive sharer-1 addresses map onto one memory row
+      io.in(1).acc(i)(s).read_data := acc_mem.io.in(1).read_data.asTypeOf(Vec(2, UInt((acc_data_len * acc_mask_len / 2).W)))(RegNext(io.in(1).acc(i)(s).read_addr(0))) // address bit 0, delayed by one cycle, picks which half of the row is returned
+      // Writes: data is duplicated into both halves of the row, and the mask selects the half to update (presumably acc_mask_len is even; TODO confirm)
+      acc_mem.io.in(1).write_addr := io.in(1).acc(i)(s).write_addr >> 1
+      acc_mem.io.in(1).write_data := Cat(io.in(1).acc(i)(s).write_data, io.in(1).acc(i)(s).write_data)
+      acc_mem.io.in(1).write_mask := Mux(io.in(1).acc(i)(s).write_addr(0), io.in(1).acc(i)(s).write_mask << (acc_mask_len / 2), io.in(1).acc(i)(s).write_mask) // odd address: mask moved to the upper half
+    }
+  }
+}
diff --git a/src/main/scala/gemmini/VectorScalarMultiplier.scala b/src/main/scala/gemmini/VectorScalarMultiplier.scala
index d1cefcb3..05480e09 100644
--- a/src/main/scala/gemmini/VectorScalarMultiplier.scala
+++ b/src/main/scala/gemmini/VectorScalarMultiplier.scala
@@ -120,7 +120,7 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data](
       head_oh := (head_oh << 1) | head_oh(nEntries-1)
     }
     in_fire := (in.valid &&
-      (!Mux1H(tail_oh.asBools, regs.map(_.valid)) || (tail_oh === head_oh && io.resp.fire()))
+      (!Mux1H(tail_oh.asBools, regs.map(_.valid)))
     )
     when (in_fire) {
       for (i <- 0 until nEntries) {
@@ -193,7 +193,7 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data](
 
 
   }
-  
+
 
 }
 

From f0419e7f308884570a61973ea7b483ef5a306bcf Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Thu, 2 Dec 2021 16:39:08 -0800
Subject: [PATCH 07/11] Add option to pick between mac chains and mac trees
 (#167)

Also, make it easier to pipeline reduction trees by pipelining tiles rather than PEs
---
 src/main/scala/gemmini/Configs.scala          |  4 ++-
 src/main/scala/gemmini/ConfigsFP.scala        | 10 +++---
 src/main/scala/gemmini/DSEConfigs.scala       |  3 +-
 .../scala/gemmini/ExecuteController.scala     |  6 ++--
 src/main/scala/gemmini/GemminiConfigs.scala   |  7 ++--
 src/main/scala/gemmini/Mesh.scala             | 33 ++++++++++++-------
 src/main/scala/gemmini/MeshWithDelays.scala   | 29 ++++++++--------
 src/main/scala/gemmini/PE.scala               | 20 +++++------
 src/main/scala/gemmini/Tile.scala             | 21 +++++++++---
 src/main/scala/gemmini/Util.scala             | 16 +++++++++
 10 files changed, 96 insertions(+), 53 deletions(-)

diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala
index 7ceffcfe..a4094d88 100644
--- a/src/main/scala/gemmini/Configs.scala
+++ b/src/main/scala/gemmini/Configs.scala
@@ -174,6 +174,7 @@ object GemminiConfigs {
   )
 
   val largeChipConfig = chipConfig.copy(sp_capacity=CapacityInKilobytes(128), acc_capacity=CapacityInKilobytes(64),
+    tileRows=1, tileColumns=1,
     meshRows=32, meshColumns=32
   )
 
@@ -219,6 +220,7 @@ class DualGemminiConfig extends Config((site, here, up) => {
       fp_gemmini = LazyModule(new Gemmini(GemminiFPConfigs.BF16DefaultConfig.copy(
         opcodes = OpcodeSet.custom2,
         sp_capacity=CapacityInKilobytes(64), acc_capacity=CapacityInKilobytes(32),
+        tileColumns = 1, tileRows = 1,
         meshColumns = 8, meshRows = 8,
         acc_singleported = true, acc_banks = 2, acc_sub_banks = 2,
         use_shared_ext_mem = true,
@@ -226,7 +228,7 @@ class DualGemminiConfig extends Config((site, here, up) => {
         ex_write_to_spad=false,
         hardcode_d_to_garbage_addr = true,
         headerFileName = "gemmini_params_bf16.h",
-	acc_latency = 3,
+        acc_latency = 3,
         dataflow = Dataflow.WS,
         mesh_output_delay = 3,
         clock_gate = true
diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala
index 91a4dbd2..35ecf821 100644
--- a/src/main/scala/gemmini/ConfigsFP.scala
+++ b/src/main/scala/gemmini/ConfigsFP.scala
@@ -61,7 +61,7 @@ object GemminiFPConfigs {
     acc_read_full_width = true,
     acc_read_small_width = true,
 
-    pe_latency = 1,
+    tile_latency = 1,
 
     ex_read_from_spad = true,
     ex_read_from_acc = true,
@@ -81,21 +81,21 @@ object GemminiFPConfigs {
   
   //FP32 Single Precision Configuration
   val FP32DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 24), spatialArrayOutputType = Float(8, 24), accType = Float(8, 24),
-                                               pe_latency = 2,
+                                               tile_latency = 2,
                                                mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                                mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                               )
  
   //FP16 Half Precision Configuration
   val FP16DefaultConfig = defaultFPConfig.copy(inputType = Float(5, 11), spatialArrayOutputType = Float(5, 11), accType = Float(8, 24),
-                                               pe_latency = 2,
+                                               tile_latency = 2,
                                                mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(5, 11), -1, identity = "1.0", c_str="((x) * (scale))")),
                                                mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(5, 11), -1, identity = "1.0", c_str="((x) * (scale))")),
                                               )
   
   //Bfloat16 Brain-half Precision Configuration
   val BF16DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 8), spatialArrayOutputType = Float(8, 8), accType = Float(8, 24),
-                                               pe_latency = 2,
+                                               tile_latency = 2,
                                                mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                                mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                               )
@@ -103,7 +103,7 @@ object GemminiFPConfigs {
   //Bfloat16 Brain-half Precision Configuration 8x8 array
   val BF16Default8Config = defaultFPConfig.copy(inputType = Float(8, 8), spatialArrayOutputType = Float(8, 8), accType = Float(8, 24),
                                                meshRows = 8, meshColumns = 8,
-                                               pe_latency = 2,
+                                               tile_latency = 2,
                                                mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                                mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                               )
diff --git a/src/main/scala/gemmini/DSEConfigs.scala b/src/main/scala/gemmini/DSEConfigs.scala
index 0d4681b5..f00297e3 100644
--- a/src/main/scala/gemmini/DSEConfigs.scala
+++ b/src/main/scala/gemmini/DSEConfigs.scala
@@ -59,8 +59,9 @@ object DSEBaseConfig {
     acc_read_full_width = true,
     acc_read_small_width = true,
     use_dedicated_tl_port = false,
+
     use_shared_ext_mem = true,
-    pe_latency = 0,
+    tile_latency = 0,
 
     ex_read_from_spad = true,
     ex_read_from_acc = true,
diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala
index 9d1cf094..6891c09b 100644
--- a/src/main/scala/gemmini/ExecuteController.scala
+++ b/src/main/scala/gemmini/ExecuteController.scala
@@ -187,7 +187,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
   val cntl = mesh_cntl_signals_q.io.deq.bits
 
   // Instantiate the actual mesh
-  val mesh = Module(new MeshWithDelays(inputType, spatialArrayOutputType, accType, mesh_tag, dataflow, pe_latency, mesh_output_delay,
+  val mesh = Module(new MeshWithDelays(inputType, spatialArrayOutputType, accType, mesh_tag, dataflow, tree_reduction, tile_latency, mesh_output_delay,
     tileRows, tileColumns, meshRows, meshColumns, shifter_banks, shifter_banks))
 
   mesh.io.a.valid := false.B
@@ -891,12 +891,12 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
 
   when (cntl_valid && cntl.perform_single_preload) {
     mesh.io.a.bits := Mux(a_should_be_fed_into_transposer, dataA.asUInt, 0.U).asTypeOf(Vec(meshRows, Vec(tileRows, inputType)))
-    mesh.io.b.bits := Mux(b_should_be_fed_into_transposer, dataB.asUInt, 0.U).asTypeOf(Vec(meshRows, Vec(tileRows, inputType)))
+    mesh.io.b.bits := Mux(b_should_be_fed_into_transposer, dataB.asUInt, 0.U).asTypeOf(Vec(meshColumns, Vec(tileColumns, inputType)))
   }
 
   when (cntl_valid && cntl.perform_single_mul) {
     mesh.io.a.bits := Mux(a_should_be_fed_into_transposer, 0.U, dataA.asUInt).asTypeOf(Vec(meshRows, Vec(tileRows, inputType)))
-    mesh.io.b.bits := Mux(b_should_be_fed_into_transposer, 0.U, dataB.asUInt).asTypeOf(Vec(meshRows, Vec(tileRows, inputType)))
+    mesh.io.b.bits := Mux(b_should_be_fed_into_transposer, 0.U, dataB.asUInt).asTypeOf(Vec(meshColumns, Vec(tileColumns, inputType)))
     mesh.io.req.bits.tag.addr.make_this_garbage()
   }
 
diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala
index beb46c71..45b6a778 100644
--- a/src/main/scala/gemmini/GemminiConfigs.scala
+++ b/src/main/scala/gemmini/GemminiConfigs.scala
@@ -58,8 +58,6 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
                                                                              mvin_scale_shared: Boolean = false,
                                                                              acc_scale_args: Option[ScaleArguments[T, V]] = None,
 
-                                                                             pe_latency: Int = 0,
-
                                                                              acc_read_full_width: Boolean = true,
                                                                              acc_read_small_width: Boolean = true,
                                                                              use_dedicated_tl_port: Boolean = true,
@@ -76,8 +74,11 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
                                                                              hardcode_d_to_garbage_addr: Boolean = false,
                                                                              use_shared_tlb: Boolean = true,
 
+                                                                             tile_latency: Int = 0,
                                                                              mesh_output_delay: Int = 1,
 
+                                                                             use_tree_reduction_if_possible: Boolean = true,
+
                                                                              num_counter: Int = 8,
 
                                                                              has_training_convs: Boolean = true,
@@ -162,6 +163,8 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
 
   val hasIm2Col = false
 
+  val tree_reduction = use_tree_reduction_if_possible && dataflow == Dataflow.WS && tileRows > 1
+
   //==========================================================================
   // sanity check mesh size
   //==========================================================================
diff --git a/src/main/scala/gemmini/Mesh.scala b/src/main/scala/gemmini/Mesh.scala
index 5bb924c5..cd056658 100644
--- a/src/main/scala/gemmini/Mesh.scala
+++ b/src/main/scala/gemmini/Mesh.scala
@@ -15,7 +15,7 @@ import chisel3.experimental._
   * @param meshColumns
   */
 class Mesh[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T,
-                                   df: Dataflow.Value, pe_latency: Int,
+                                   df: Dataflow.Value, tree_reduction: Boolean, tile_latency: Int,
                                    max_simultaneous_matmuls: Int, output_delay: Int,
                                    val tileRows: Int, val tileColumns: Int,
                                    val meshRows: Int, val meshColumns: Int) extends Module {
@@ -34,43 +34,54 @@ class Mesh[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T,
     val out_id = Output(Vec(meshColumns, Vec(tileColumns, UInt(log2Up(max_simultaneous_matmuls).W))))
     val out_last = Output(Vec(meshColumns, Vec(tileColumns, Bool())))
   })
+
   // mesh(r)(c) => Tile at row r, column c
-  val mesh: Seq[Seq[Tile[T]]] = Seq.fill(meshRows, meshColumns)(Module(new Tile(inputType, outputType, accType, df, pe_latency, max_simultaneous_matmuls, tileRows, tileColumns)))
+  val mesh: Seq[Seq[Tile[T]]] = Seq.fill(meshRows, meshColumns)(Module(new Tile(inputType, outputType, accType, df, tree_reduction, max_simultaneous_matmuls, tileRows, tileColumns)))
   val meshT = mesh.transpose
+
+  // Sends `t` through a `latency`-stage Pipe driven by `valid`, keeping only the data bits. By default, Pipe resets
+  // its valid signals to false.B; tying reset to false.B here makes it clear that no global reset enters the Mesh.
+  def pipe[T <: Data](valid: Bool, t: T, latency: Int): T = chisel3.withReset(false.B) {
+    Pipe(valid, t, latency).bits
+  }
+
   // Chain tile_a_out -> tile_a_in (pipeline a across each row)
   // TODO clock-gate A signals with in_garbage
   for (r <- 0 until meshRows) {
     mesh(r).foldLeft(io.in_a(r)) {
       case (in_a, tile) =>
-        tile.io.in_a := RegNext(in_a)
+        tile.io.in_a := ShiftRegister(in_a, tile_latency+1)
         tile.io.out_a
     }
   }
+
   // Chain tile_out_b -> tile_b_in (pipeline b across each column)
   for (c <- 0 until meshColumns) {
     meshT(c).foldLeft((io.in_b(c), io.in_valid(c))) {
       case ((in_b, valid), tile) =>
-        tile.io.in_b := RegEnable(in_b, valid.head)
+        tile.io.in_b := pipe(valid.head, in_b, tile_latency+1)
         (tile.io.out_b, tile.io.out_valid)
     }
   }
+
   // Chain tile_out -> tile_propag (pipeline output across each column)
   for (c <- 0 until meshColumns) {
     meshT(c).foldLeft((io.in_d(c), io.in_valid(c))) {
       case ((in_propag, valid), tile) =>
-        tile.io.in_d := RegEnable(in_propag, valid.head)
+        tile.io.in_d := pipe(valid.head, in_propag, tile_latency+1)
         (tile.io.out_c, tile.io.out_valid)
     }
   }
+
   // Chain control signals (pipeline across each column)
   assert(!(mesh.map(_.map(_.io.bad_dataflow).reduce(_||_)).reduce(_||_)))
   for (c <- 0 until meshColumns) {
     meshT(c).foldLeft((io.in_control(c), io.in_valid(c))) {
       case ((in_ctrl, valid), tile) =>
         (tile.io.in_control, in_ctrl, valid).zipped.foreach { case (tile_ctrl, ctrl, v) =>
-          tile_ctrl.shift := RegEnable(ctrl.shift, v)
-          tile_ctrl.dataflow := RegEnable(ctrl.dataflow, v)
-          tile_ctrl.propagate := RegEnable(ctrl.propagate, v)
+          tile_ctrl.shift := pipe(v, ctrl.shift, tile_latency+1)
+          tile_ctrl.dataflow := pipe(v, ctrl.dataflow, tile_latency+1)
+          tile_ctrl.propagate := pipe(v, ctrl.propagate, tile_latency+1)
         }
         (tile.io.out_control, tile.io.out_valid)
     }
@@ -80,7 +91,7 @@ class Mesh[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T,
   for (c <- 0 until meshColumns) {
     meshT(c).foldLeft(io.in_valid(c)) {
       case (in_v, tile) =>
-        tile.io.in_valid := RegNext(in_v)
+        tile.io.in_valid := ShiftRegister(in_v, tile_latency+1)
         tile.io.out_valid
     }
   }
@@ -89,7 +100,7 @@ class Mesh[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T,
   for (c <- 0 until meshColumns) {
     meshT(c).foldLeft(io.in_id(c)) {
       case (in_id, tile) =>
-        tile.io.in_id := RegNext(in_id)
+        tile.io.in_id := ShiftRegister(in_id, tile_latency+1)
         tile.io.out_id
     }
   }
@@ -98,7 +109,7 @@ class Mesh[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T,
   for (c <- 0 until meshColumns) {
     meshT(c).foldLeft(io.in_last(c)) {
       case (in_last, tile) =>
-        tile.io.in_last := RegNext(in_last)
+        tile.io.in_last := ShiftRegister(in_last, tile_latency+1)
         tile.io.out_last
     }
   }
diff --git a/src/main/scala/gemmini/MeshWithDelays.scala b/src/main/scala/gemmini/MeshWithDelays.scala
index acab135d..db40debf 100644
--- a/src/main/scala/gemmini/MeshWithDelays.scala
+++ b/src/main/scala/gemmini/MeshWithDelays.scala
@@ -33,7 +33,7 @@ class MeshWithDelaysResp[T <: Data: Arithmetic, TagT <: TagQueueTag with Data](o
 
 class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
   (inputType: T, val outputType: T, accType: T,
-   tagType: U, df: Dataflow.Value, pe_latency: Int, output_delay: Int,
+   tagType: U, df: Dataflow.Value, tree_reduction: Boolean, tile_latency: Int, output_delay: Int,
    tileRows: Int, tileColumns: Int, meshRows: Int, meshColumns: Int,
    leftBanks: Int, upBanks: Int, outBanks: Int = 1, n_simultaneous_matmuls: Int = -1)
   extends Module {
@@ -47,12 +47,13 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
   assert(meshRows*tileRows == meshColumns*tileColumns)
   val block_size = meshRows*tileRows
 
+  val latency_per_pe = (tile_latency + 1).toFloat / (tileRows min tileColumns)
   val max_simultaneous_matmuls = if (n_simultaneous_matmuls == -1) {
-    5 * (pe_latency + 1)
+    (5 * latency_per_pe).ceil.toInt
   } else {
     n_simultaneous_matmuls
   }
-  assert(max_simultaneous_matmuls >= 5 * (pe_latency + 1))
+  assert(max_simultaneous_matmuls >= 5 * latency_per_pe)
 
   val tagqlen = max_simultaneous_matmuls+1
 
@@ -70,7 +71,6 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
 
   def shifted[T <: Data](x: Vec[Vec[T]], banks: Int, reverse: Boolean = false) = {
     assert(x.size % banks == 0, "cannot bank without clean divisors")
-    assert(pe_latency == 0 || (tileRows == 1 && tileColumns == 1), "If tiles are larger than 1x1, then PEs must have 0 latency")
 
     val banked_len = x.size / banks
     val banked_x = x.grouped(banked_len).toSeq
@@ -79,13 +79,13 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
 
     (banked_x zip indexes).flatMap { case (bx, i) =>
       val bxVec = VecInit(bx)
-      val sram_shift = i * banked_len * (pe_latency+1)
+      val sram_shift = i * banked_len * (tile_latency+1)
 
       val SRAMShifted = Shifter(bxVec, sram_shift, true.B, true)
 
       val indexes = if (reverse) SRAMShifted.indices.reverse else SRAMShifted.indices
       val RegShifted = (SRAMShifted zip indexes).map { case (srs, j) =>
-        ShiftRegister(srs, j*(pe_latency+1))
+        ShiftRegister(srs, j*(tile_latency+1))
       }
 
       RegShifted
@@ -166,25 +166,25 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
   val transposer_out = VecInit(transposer.io.outCol.bits.grouped(tileRows).map(t => VecInit(t)).toSeq)
 
   // Wire up mesh's IO to this module's IO
-  val mesh = Module(new Mesh(inputType, outputType, accType, df, pe_latency, max_simultaneous_matmuls, output_delay, tileRows, tileColumns, meshRows, meshColumns))
+  val mesh = Module(new Mesh(inputType, outputType, accType, df, tree_reduction, tile_latency, max_simultaneous_matmuls, output_delay, tileRows, tileColumns, meshRows, meshColumns))
 
   // TODO wire only to *_buf here, instead of io.*.bits
-  val a_shifter_in = WireInit(Mux(a_is_from_transposer, transposer_out, a_buf))
-  val b_shifter_in = WireInit(Mux(b_is_from_transposer, transposer_out, b_buf))
+  val a_shifter_in = WireInit(Mux(a_is_from_transposer, transposer_out.asTypeOf(A_TYPE), a_buf))
+  val b_shifter_in = WireInit(Mux(b_is_from_transposer, transposer_out.asTypeOf(B_TYPE), b_buf))
   val d_shifter_in = WireInit(Mux(d_is_from_transposer,
-    VecInit(transposer_out.flatten.reverse.grouped(tileRows).map(VecInit(_)).toSeq), d_buf))
+    VecInit(transposer_out.flatten.reverse.grouped(tileRows).map(VecInit(_)).toSeq).asTypeOf(D_TYPE), d_buf))
 
   mesh.io.in_a := shifted(a_shifter_in, leftBanks)
   mesh.io.in_b := shifted(b_shifter_in, upBanks)
   mesh.io.in_d := shifted(d_shifter_in, upBanks)
 
   mesh.io.in_control.zipWithIndex.foreach { case (ss, i) =>
-    ss.foreach(_.dataflow := ShiftRegister(req.bits.pe_control.dataflow, i * (pe_latency + 1)))
-    ss.foreach(_.propagate := ShiftRegister(in_prop, i * (pe_latency + 1)))
+    ss.foreach(_.dataflow := ShiftRegister(req.bits.pe_control.dataflow, i * (tile_latency + 1)))
+    ss.foreach(_.propagate := ShiftRegister(in_prop, i * (tile_latency + 1)))
   }
   val result_shift = RegNext(req.bits.pe_control.shift) // TODO will this arrive at the right time if memory isn't pipelined?
   mesh.io.in_control.zipWithIndex.foreach { case (ctrl, i) =>
-    ctrl.foreach(_.shift := ShiftRegister(result_shift, i * (pe_latency + 1)))
+    ctrl.foreach(_.shift := ShiftRegister(result_shift, i * (tile_latency + 1)))
   }
 
   val not_paused_vec = VecInit(Seq.fill(meshColumns)(VecInit(Seq.fill(tileColumns)(!pause))))
@@ -198,8 +198,7 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
 
   // We want to output C when we're output-stationary, but B when we're weight-stationary
   // TODO these would actually overlap when we switch from output-stationary to weight-stationary
-  val out_pe_control = shifted(mesh.io.out_control, outBanks, reverse = true)(0)(0)
-  io.resp.bits.data := shifted(Mux(out_pe_control.dataflow === Dataflow.OS.id.U, mesh.io.out_c, mesh.io.out_b), outBanks, true)
+  io.resp.bits.data := shifted(Mux(mesh.io.out_control(0)(0).dataflow === Dataflow.OS.id.U, mesh.io.out_c, mesh.io.out_b), outBanks, true)
 
   io.resp.valid := shifted(mesh.io.out_valid, outBanks, reverse = true)(0)(0)
 
diff --git a/src/main/scala/gemmini/PE.scala b/src/main/scala/gemmini/PE.scala
index 79944b72..e10318a3 100644
--- a/src/main/scala/gemmini/PE.scala
+++ b/src/main/scala/gemmini/PE.scala
@@ -17,7 +17,7 @@ class PEControl[T <: Data : Arithmetic](accType: T) extends Bundle {
   * A PE implementing a MAC operation. Configured as fully combinational when integrated into a Mesh.
   * @param width Data width of operands
   */
-class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, latency: Int, max_simultaneous_matmuls: Int)
+class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, max_simultaneous_matmuls: Int)
                    (implicit ev: Arithmetic[T]) extends Module { // Debugging variables
   import ev._
 
@@ -46,17 +46,17 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value,
 
   val cType = if (df == Dataflow.WS) inputType else accType
 
-  val a  = ShiftRegister(io.in_a, latency)
-  val b  = ShiftRegister(io.in_b, latency)
-  val d  = ShiftRegister(io.in_d, latency)
+  val a  = io.in_a
+  val b  = io.in_b
+  val d  = io.in_d
   val c1 = Reg(cType)
   val c2 = Reg(cType)
-  val dataflow = ShiftRegister(io.in_control.dataflow, latency)
-  val prop  = ShiftRegister(io.in_control.propagate, latency)
-  val shift = ShiftRegister(io.in_control.shift, latency)
-  val id = ShiftRegister(io.in_id, latency)
-  val last = ShiftRegister(io.in_last, latency)
-  val valid = ShiftRegister(io.in_valid, latency) // TODO should we clockgate the rest of the ShiftRegisters based on the values in this ShiftRegisters
+  val dataflow = io.in_control.dataflow
+  val prop  = io.in_control.propagate
+  val shift = io.in_control.shift
+  val id = io.in_id
+  val last = io.in_last
+  val valid = io.in_valid
 
   io.out_a := a
   io.out_control.dataflow := dataflow
diff --git a/src/main/scala/gemmini/Tile.scala b/src/main/scala/gemmini/Tile.scala
index 59807893..9c2a418c 100644
--- a/src/main/scala/gemmini/Tile.scala
+++ b/src/main/scala/gemmini/Tile.scala
@@ -4,6 +4,7 @@ package gemmini
 
 import chisel3._
 import chisel3.util._
+import Util._
 
 /**
   * A Tile is a purely combinational 2D array of passThrough PEs.
@@ -12,7 +13,7 @@ import chisel3.util._
   * @param rows Number of PEs on each row
   * @param columns Number of PEs on each column
   */
-class Tile[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T, df: Dataflow.Value, pe_latency: Int, max_simultaneous_matmuls: Int, val rows: Int, val columns: Int) extends Module {
+class Tile[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, tree_reduction: Boolean, max_simultaneous_matmuls: Int, val rows: Int, val columns: Int)(implicit ev: Arithmetic[T]) extends Module {
   val io = IO(new Bundle {
     val in_a        = Input(Vec(rows, inputType))
     val in_b        = Input(Vec(columns, outputType)) // This is the output of the tile next to it
@@ -32,11 +33,13 @@ class Tile[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T, df:
 
     val in_valid = Input(Vec(columns, Bool()))
     val out_valid = Output(Vec(columns, Bool()))
-    
+
     val bad_dataflow = Output(Bool())
   })
 
-  val tile = Seq.fill(rows, columns)(Module(new PE(inputType, outputType, accType, df, pe_latency, max_simultaneous_matmuls)))
+  import ev._
+
+  val tile = Seq.fill(rows, columns)(Module(new PE(inputType, outputType, accType, df, max_simultaneous_matmuls)))
   val tileT = tile.transpose
 
   // TODO: abstract hori/vert broadcast, all these connections look the same
@@ -53,7 +56,7 @@ class Tile[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T, df:
   for (c <- 0 until columns) {
     tileT(c).foldLeft(io.in_b(c)) {
       case (in_b, pe) =>
-        pe.io.in_b := in_b
+        pe.io.in_b := (if (tree_reduction) in_b.zero else in_b)
         pe.io.out_b
     }
   }
@@ -106,11 +109,19 @@ class Tile[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T, df:
   // Drive the Tile's bottom IO
   for (c <- 0 until columns) {
     io.out_c(c) := tile(rows-1)(c).io.out_c
-    io.out_b(c) := tile(rows-1)(c).io.out_b
     io.out_control(c) := tile(rows-1)(c).io.out_control
     io.out_id(c) := tile(rows-1)(c).io.out_id
     io.out_last(c) := tile(rows-1)(c).io.out_last
     io.out_valid(c) := tile(rows-1)(c).io.out_valid
+
+    io.out_b(c) := {
+      if (tree_reduction) {
+        val prods = tileT(c).map(_.io.out_b)
+        accumulateTree(prods :+ io.in_b(c))
+      } else {
+        tile(rows - 1)(c).io.out_b
+      }
+    }
   }
   io.bad_dataflow := tile.map(_.map(_.io.bad_dataflow).reduce(_||_)).reduce(_||_)
 
diff --git a/src/main/scala/gemmini/Util.scala b/src/main/scala/gemmini/Util.scala
index 511cfee2..907c4ad2 100644
--- a/src/main/scala/gemmini/Util.scala
+++ b/src/main/scala/gemmini/Util.scala
@@ -109,6 +109,22 @@ object Util {
     Mux(u1 < u2, u1, u2)
   }
 
+  /** Sums `xs` with a binary adder tree, zero-padding to a power-of-two length; `xs` must be non-empty. */
+  def accumulateTree[T <: Data](xs: Seq[T])(implicit ev: Arithmetic[T]): T = {
+    import ev._
+    assert(xs.nonEmpty, "can't accumulate 0 elements")
+
+    xs match {
+      case Seq(only) => only
+      case _ =>
+        // Pad with zeros so that every element has a partner, then add neighbours pairwise
+        val padded = xs.padTo(1 << log2Ceil(xs.length), xs.head.zero)
+        val sums = padded.grouped(2).map { case Seq(lhs, rhs) => lhs + rhs }.toSeq
+        // Recurse on the half-length row until a single value is left
+        accumulateTree(sums)
+    }
+  }
+
   // An undirectioned Valid bundle
   class UDValid[T <: Data](t: T) extends Bundle {
     val valid = Bool()

From 6f45fcc027442087ffbe8aaacd151e171b60b0c0 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Sat, 4 Dec 2021 22:04:52 -0800
Subject: [PATCH 08/11] Optimize conv layers with few input channels (#92)

We add a new experimental "pixel_repeats" feature to optimize conv layers with few input channels (like the first layer of most CNNs).
---
 SPIKE.hash                                    |   2 +-
 software/gemmini-rocc-tests                   |   2 +-
 src/main/scala/gemmini/Controller.scala       |   4 +-
 src/main/scala/gemmini/DMA.scala              |   7 +
 src/main/scala/gemmini/GemminiConfigs.scala   |  10 +-
 src/main/scala/gemmini/GemminiISA.scala       |  15 +-
 src/main/scala/gemmini/LoadController.scala   |  15 +-
 src/main/scala/gemmini/LocalAddr.scala        |  15 ++
 src/main/scala/gemmini/LoopConv.scala         |  31 ++++-
 src/main/scala/gemmini/PixelRepeater.scala    |  95 +++++++++++++
 .../scala/gemmini/ReservationStation.scala    |  17 ++-
 src/main/scala/gemmini/Scratchpad.scala       | 130 +++++++++++++-----
 .../gemmini/VectorScalarMultiplier.scala      |  10 +-
 src/main/scala/gemmini/XactTracker.scala      |   2 +
 14 files changed, 294 insertions(+), 61 deletions(-)
 create mode 100644 src/main/scala/gemmini/PixelRepeater.scala

diff --git a/SPIKE.hash b/SPIKE.hash
index ce15e697..a96811da 100644
--- a/SPIKE.hash
+++ b/SPIKE.hash
@@ -1 +1 @@
-34741e07bc6b56f1762ce579537948d58e28cd5a
+02e2d983cc8e2c385ebe920302c427b9167bd76e
diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index 3aaa2307..5fa954ee 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit 3aaa230733a9eba6edf4d14243d84595e017522f
+Subproject commit 5fa954ee9cf97483cd9c765d9f4c664d1701090d
diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala
index 3e74af93..74f23b4c 100644
--- a/src/main/scala/gemmini/Controller.scala
+++ b/src/main/scala/gemmini/Controller.scala
@@ -137,12 +137,12 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   val (conv_cmd, loop_conv_unroller_busy) = withClock (gated_clock) { LoopConv(raw_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization,
     meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries,
     inputType.getWidth, accType.getWidth, dma_maxbytes,
-    new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits), new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t),
+    new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits, pixel_repeats_bits), new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t),
     new ConfigMvoutRs2(acc_scale_t_bits, 32), new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t),
     new ConfigExRs1(acc_scale_t_bits), new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t),
     new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t),
     new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t),
-    has_training_convs, has_max_pool) }
+    has_training_convs, has_max_pool, has_first_layer_optimizations) }
 
   val (loop_cmd, loop_matmul_unroller_busy) = withClock (gated_clock) { LoopMatmul(conv_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization,
     meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries,
diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala
index c1cb51ef..5952be5b 100644
--- a/src/main/scala/gemmini/DMA.scala
+++ b/src/main/scala/gemmini/DMA.scala
@@ -27,6 +27,7 @@ class StreamReadRequest[U <: Data](spad_rows: Int, acc_rows: Int, mvin_scale_t_b
   val status = new MStatus
   val len = UInt(16.W) // TODO magic number
   val repeats = UInt(16.W) // TODO magic number
+  val pixel_repeats = UInt(8.W) // TODO magic number
   val block_stride = UInt(16.W) // TODO magic number
   val cmd_id = UInt(8.W) // TODO magic number
 
@@ -43,6 +44,8 @@ class StreamReadResponse[U <: Data](spadWidth: Int, accWidth: Int, spad_rows: In
   val has_acc_bitwidth = Bool()
   val scale = UInt(mvin_scale_t_bits.W)
   val repeats = UInt(16.W) // TODO magic number
+  val pixel_repeats = UInt(16.W) // TODO magic number
+  val len = UInt(16.W) // TODO magic number
   val last = Bool()
   val bytes_read = UInt(8.W) // TODO magic number
   val cmd_id = UInt(8.W) // TODO magic number
@@ -100,6 +103,8 @@ class StreamReader[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T
     io.resp.bits.has_acc_bitwidth := beatPacker.io.out.bits.has_acc_bitwidth
     io.resp.bits.scale := RegEnable(xactTracker.io.peek.entry.scale, beatPacker.io.req.fire())
     io.resp.bits.repeats := RegEnable(xactTracker.io.peek.entry.repeats, beatPacker.io.req.fire())
+    io.resp.bits.pixel_repeats := RegEnable(xactTracker.io.peek.entry.pixel_repeats, beatPacker.io.req.fire())
+    io.resp.bits.len := RegEnable(xactTracker.io.peek.entry.len, beatPacker.io.req.fire())
     io.resp.bits.cmd_id := RegEnable(xactTracker.io.peek.entry.cmd_id, beatPacker.io.req.fire())
     io.resp.bits.bytes_read := RegEnable(xactTracker.io.peek.entry.bytes_to_read, beatPacker.io.req.fire())
     io.resp.bits.last := beatPacker.io.out.bits.last
@@ -250,6 +255,8 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf
     io.reserve.entry.has_acc_bitwidth := req.has_acc_bitwidth
     io.reserve.entry.scale := req.scale
     io.reserve.entry.repeats := req.repeats
+    io.reserve.entry.pixel_repeats := req.pixel_repeats
+    io.reserve.entry.len := req.len
     io.reserve.entry.block_stride := req.block_stride
     io.reserve.entry.lg_len_req := DontCare // TODO just remove this from the IO completely
     io.reserve.entry.bytes_to_read := read_bytes_read
diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala
index 45b6a778..be1b084f 100644
--- a/src/main/scala/gemmini/GemminiConfigs.scala
+++ b/src/main/scala/gemmini/GemminiConfigs.scala
@@ -85,6 +85,8 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
                                                                              has_max_pool: Boolean = true,
                                                                              has_nonlinear_activations: Boolean = true,
 
+                                                                             has_first_layer_optimizations: Boolean = true,
+
                                                                              use_firesim_simulation_counters: Boolean = false,
 
                                                                              use_shared_ext_mem: Boolean = false,
@@ -159,7 +161,9 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
   val mvout_rows_bits = log2Up(meshRows * tileRows + 1)
 
   val load_states = 3
-  val block_stride_bits = 16
+  val block_stride_bits = 16 min (log2Up(acc_banks * acc_bank_entries) max log2Up(sp_banks * sp_bank_entries))
+
+  val pixel_repeats_bits = 8 min log2Up(meshColumns * tileColumns + 1)
 
   val hasIm2Col = false
 
@@ -471,6 +475,10 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
       header ++= s"#define ACC_READ_FULL_WIDTH\n"
     header ++= s"\n"
 
+    if (has_first_layer_optimizations) {
+      header ++= "#define HAS_FIRST_LAYER_OPTIMIZATIONS\n\n"
+    }
+
     header ++= s"#endif // $guard\n"
     header.toString()
   }
diff --git a/src/main/scala/gemmini/GemminiISA.scala b/src/main/scala/gemmini/GemminiISA.scala
index c85b6816..0b28316d 100644
--- a/src/main/scala/gemmini/GemminiISA.scala
+++ b/src/main/scala/gemmini/GemminiISA.scala
@@ -24,7 +24,7 @@ object GemminiISA {
   val LOAD3_CMD = 14.U
 
   // TODO add orows and ocols to this as well
-  val LOOP_CONV_WS = 15.U // no_bias, wrot180, trans_output_1203, trans_weight_1203, trans_input_3120 | no_pool, downsample, input_dilated, act
+  val LOOP_CONV_WS = 15.U // no_bias, wrot180, trans_output_1203, trans_weight_1203, trans_input_3120, max_pixels_per_row | no_pool, downsample, input_dilated, act
   val LOOP_CONV_WS_CONFIG_1 = 16.U // batch_size, in_dim, in_channels, out_channels | out_dim, pool_out_dim, stride, padding
   val LOOP_CONV_WS_CONFIG_2 = 17.U // kernel_dim, pool_size, pool_stride, pool_padding | batches, porows, pocols, pochs
   val LOOP_CONV_WS_CONFIG_3 = 18.U // krows, kcols, kchs, lpad | rpad, upad, dpad, plpad
@@ -95,22 +95,25 @@ object GemminiISA {
   val CONFIG_MVIN_RS1_UNUSED_WIDTH = 2
   val CONFIG_MVIN_RS1_SHRINK_WIDTH = 1
   val CONFIG_MVIN_RS1_STATE_ID_WIDTH = 2
-  val CONFIG_MVIN_RS1_SPACER_WIDTH = (16 - 2 - 1 - 2)
+  val CONFIG_MVIN_RS1_SPACER_WIDTH = 8 - 2 - 1 - 2
+  val CONFIG_MVIN_RS1_PIXEL_REPEAT_WIDTH = 8
   val CONFIG_MVIN_RS1_STRIDE_WIDTH = 16
   val CONFIG_MVIN_RS1_SCALE_WIDTH = 32
 
-  class ConfigMvinRs1(scale_bits: Int, stride_bits: Int) extends Bundle {
-    val _spacer2 = UInt((CONFIG_MVIN_RS1_SCALE_WIDTH - scale_bits).W)
+  class ConfigMvinRs1(scale_bits: Int, stride_bits: Int, pixel_repeat_bits: Int) extends Bundle { // Field view of rs1 for a mvin config command; each field is padded by a spacer to a fixed CONFIG_MVIN_RS1_*_WIDTH
+    val _spacer3 = UInt((CONFIG_MVIN_RS1_SCALE_WIDTH - scale_bits).W) // pads scale up to CONFIG_MVIN_RS1_SCALE_WIDTH bits
     val scale = UInt(scale_bits.W)
-    val _spacer1 = UInt((CONFIG_MVIN_RS1_STRIDE_WIDTH - stride_bits).W)
+    val _spacer2 = UInt((CONFIG_MVIN_RS1_STRIDE_WIDTH - stride_bits).W) // pads stride up to CONFIG_MVIN_RS1_STRIDE_WIDTH bits
     val stride = UInt(stride_bits.W)
+    val _spacer1 = UInt((CONFIG_MVIN_RS1_PIXEL_REPEAT_WIDTH - pixel_repeat_bits).W) // pads pixel_repeats up to CONFIG_MVIN_RS1_PIXEL_REPEAT_WIDTH bits
+    val pixel_repeats = UInt(pixel_repeat_bits.W) // pixel-repeat count; LoadController treats a value of 0 as 1 for backwards compatibility
     val _spacer0 = UInt(CONFIG_MVIN_RS1_SPACER_WIDTH.W)
     val state_id = UInt(CONFIG_MVIN_RS1_STATE_ID_WIDTH.W)
     val shrink = UInt(CONFIG_MVIN_RS1_SHRINK_WIDTH.W)
     val _unused = UInt(CONFIG_MVIN_RS1_UNUSED_WIDTH.W)
 
     override def cloneType: ConfigMvinRs1.this.type =
-      (new ConfigMvinRs1(scale_bits, stride_bits)).asInstanceOf[this.type]
+      (new ConfigMvinRs1(scale_bits, stride_bits, pixel_repeat_bits)).asInstanceOf[this.type] // forward every width argument so the clone has identical field widths
   }
 
   val CONFIG_MVOUT_RS1_UNUSED_WIDTH = 2
diff --git a/src/main/scala/gemmini/LoadController.scala b/src/main/scala/gemmini/LoadController.scala
index 89f7be7c..49d7b409 100644
--- a/src/main/scala/gemmini/LoadController.scala
+++ b/src/main/scala/gemmini/LoadController.scala
@@ -34,6 +34,7 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
   val scales = Reg(Vec(load_states, UInt(mvin_scale_t_bits.W)))
   val shrinks = Reg(Vec(load_states, Bool())) // Shrink inputs to accumulator
   val block_strides = Reg(Vec(load_states, UInt(block_stride_bits.W))) // Spad stride during block move-ins
+  val pixel_repeats = Reg(Vec(load_states, UInt(pixel_repeats_bits.W)))
   val block_rows = meshRows * tileRows
   val block_cols = meshColumns * tileColumns
   val row_counter = RegInit(0.U(log2Ceil(block_rows).W))
@@ -47,11 +48,13 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
   val rows = mvin_rs2.num_rows
 
   val config_stride = cmd.bits.cmd.rs2
-  val config_mvin_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits))
 
-  val config_scale = config_mvin_rs1.scale // maybe limit width to `mvin_scale_t_bits`?
+  val config_mvin_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits, pixel_repeats_bits))
+
+  val config_scale = config_mvin_rs1.scale
   val config_shrink = config_mvin_rs1.shrink
   val config_block_stride = config_mvin_rs1.stride
+  val config_pixel_repeats = config_mvin_rs1.pixel_repeats
 
   val mstatus = cmd.bits.cmd.status
 
@@ -64,6 +67,7 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
   val scale = scales(state_id)
   val shrink = shrinks(state_id)
   val block_stride = block_strides(state_id)
+  val pixel_repeat = pixel_repeats(state_id)
 
   val all_zeros = vaddr === 0.U
 
@@ -104,6 +108,7 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
   io.dma.req.bits.has_acc_bitwidth := localaddr_plus_row_counter.is_acc_addr && !shrink
   io.dma.req.bits.all_zeros := all_zeros
   io.dma.req.bits.status := mstatus
+  io.dma.req.bits.pixel_repeats := pixel_repeat
 
   // Command tracker IO
   cmd_tracker.io.alloc.valid := control_state === waiting_for_command && cmd.valid && DoLoad
@@ -140,6 +145,7 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
           scale := config_scale
           shrink := config_shrink
           block_stride := config_block_stride
+          pixel_repeat := Mux(config_pixel_repeats === 0.U, 1.U, config_pixel_repeats) // TODO this default value was just added to maintain backwards compatibility. we should deprecate and remove it later
           cmd.ready := true.B
         }
 
@@ -165,6 +171,10 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
     }
   }
 
+  // Optimizations based on config parameters
+  if (!has_first_layer_optimizations)
+    pixel_repeats.foreach(_ := 1.U)
+
   // Performance counter
   CounterEventIO.init(io.counter)
   io.counter.connectEventSignal(CounterEvent.LOAD_ACTIVE_CYCLE, control_state === sending_rows)
@@ -177,4 +187,5 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
 
   // Assertions
   assert(!(cmd_tracker.io.alloc.fire() && cmd_tracker.io.alloc.bits.bytes_to_read === 0.U), "A single mvin instruction must load more than 0 bytes")
+  assert(has_first_layer_optimizations.B || !(cmd.valid && DoConfig && config_pixel_repeats > 1.U), "If first-layer optimizations are not enabled, then pixel-repeats cannot be greater than 1")
 }
diff --git a/src/main/scala/gemmini/LocalAddr.scala b/src/main/scala/gemmini/LocalAddr.scala
index cce6bcae..ac5a1f4a 100644
--- a/src/main/scala/gemmini/LocalAddr.scala
+++ b/src/main/scala/gemmini/LocalAddr.scala
@@ -16,6 +16,8 @@ class LocalAddr(sp_banks: Int, sp_bank_entries: Int, acc_banks: Int, acc_bank_en
   private val accBankBits = log2Up(acc_banks)
   val accBankRowBits = log2Up(acc_bank_entries)
 
+  val spRows = sp_banks * sp_bank_entries
+
   val is_acc_addr = Bool()
   val accumulate = Bool()
   val read_full_acc_row = Bool()
@@ -71,6 +73,19 @@ class LocalAddr(sp_banks: Int, sp_bank_entries: Int, acc_banks: Int, acc_bank_en
     (result, overflow)
   }
 
+  // This function can only be used with non-accumulator addresses. Returns both new address and underflow
+  def floorSub(other: UInt, floor: UInt): (LocalAddr, Bool) = { // Subtracts `other` from `data`, clamping at `floor`; returns (new address, whether the clamp was applied)
+    require(isPow2(sp_bank_entries)) // TODO remove this requirement
+    require(isPow2(acc_bank_entries)) // TODO remove this requirement
+
+    val underflow = data < (floor +& other) // `+&` is a width-expanding add, so floor + other cannot wrap before the comparison
+
+    val result = WireInit(this) // start from a copy of every field of this address; only `data` is overridden below
+    result.data := Mux(underflow, floor, data - other) // clamp to `floor` rather than going below it
+
+    (result, underflow)
+  }
+
   def make_this_garbage(dummy: Int = 0): Unit = {
     is_acc_addr := true.B
     accumulate := true.B
diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala
index 1f27f3ff..d2775a9c 100644
--- a/src/main/scala/gemmini/LoopConv.scala
+++ b/src/main/scala/gemmini/LoopConv.scala
@@ -138,6 +138,7 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi
   config_cmd_rs1 := DontCare
   config_cmd_rs1.scale := MVIN_SCALE_IDENTITY
   config_cmd_rs1.stride := req.derived_params.bias_spad_stride
+  config_cmd_rs1.pixel_repeats := 1.U
   config_cmd_rs1.state_id := 2.U
   config_cmd_rs1.shrink := 0.U
   config_cmd_rs1._unused := 1.U
@@ -217,6 +218,7 @@ class LoopConvLdInputReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth:
   val addr_start = UInt(log2Up(max_acc_addr).W)
   val dram_addr = UInt(coreMaxAddrBits.W)
   val downsample = Bool()
+  val max_pixels_per_row = UInt(small_iterator_bitwidth.W)
   val input_dilated = Bool()
   val trans_input_3120 = Bool()
   val loop_id = UInt(log2Up(concurrent_loops).W)
@@ -310,10 +312,12 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
   config_cmd_rs1 := DontCare
   config_cmd_rs1.scale := MVIN_SCALE_IDENTITY
   config_cmd_rs1.stride := input_spad_stride
+  config_cmd_rs1.pixel_repeats := req.max_pixels_per_row
   config_cmd_rs1.state_id := 0.U
   config_cmd_rs1.shrink := 0.U
   config_cmd_rs1._unused := 1.U
   config_cmd.rs1 := config_cmd_rs1.asUInt()
+
   config_cmd.rs2 := dram_stride << req.downsample
 
   val mvin_cmd = Wire(new RoCCCommand)
@@ -476,14 +480,17 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit
   val config_cmd = Wire(new RoCCCommand)
   config_cmd := DontCare
   config_cmd.inst.funct := CONFIG_CMD
+
   val config_cmd_rs1 = Wire(config_mvin_rs1_t.cloneType)
   config_cmd_rs1 := DontCare
   config_cmd_rs1.scale := MVIN_SCALE_IDENTITY
   config_cmd_rs1.stride := req.derived_params.weight_spad_stride
+  config_cmd_rs1.pixel_repeats := 1.U
   config_cmd_rs1.state_id := 1.U
   config_cmd_rs1.shrink := 0.U
   config_cmd_rs1._unused := 1.U
   config_cmd.rs1 := config_cmd_rs1.asUInt
+
   config_cmd.rs2 := dram_stride
 
   val mvin_cmd = Wire(new RoCCCommand)
@@ -561,6 +568,7 @@ class LoopConvExecuteReq(val large_iterator_bitwidth: Int, val small_iterator_bi
   val c_addr_start = UInt(log2Up(max_acc_addr).W)
   val wrot180 = Bool()
   val downsample = Bool()
+  val max_pixels_per_row = UInt(small_iterator_bitwidth.W)
   val input_dilated = Bool()
   val trans_weight_0132 = Bool()
   val trans_input_3120 = Bool()
@@ -623,6 +631,8 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
   val skip_iteration = state >= pre && req.input_dilated && (((krow * kernel_dilation +& orow -& upad)(0) & req.input_dilated).asBool() ||
     ((kcol * kernel_dilation +& ocol -& lpad)(0) & req.input_dilated).asBool())
 
+  val pixels = Mux(kcols - kcol > req.max_pixels_per_row, req.max_pixels_per_row, kcols - kcol)
+
   val irow = undilated(orow * stride +& krow * kernel_dilation)
   val icol = undilated(ocol * stride +& kcol * kernel_dilation)
 
@@ -630,7 +640,7 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
     Mux(batches - b > block_size.U, block_size.U, batches - b),
     undilated(Mux(ocols - ocol > (block_size.U << req.input_dilated).asUInt(), (block_size.U << req.input_dilated).asUInt(), ocols - ocol)))
   val J = Mux(ochs - och > block_size.U, block_size.U, ochs - och)
-  val K = Mux(kchs - kch > block_size.U, block_size.U, kchs - kch)
+  val K = pixels * Mux(kchs - kch > block_size.U, block_size.U, kchs - kch)
 
   // Addresses
   val a_addr = Mux(req.trans_input_3120,
@@ -768,7 +778,7 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
       val next_b = floorAdd(b, b_it, batches, next_orow === 0.U && next_ocol === 0.U)
       val next_kch = floorAdd(kch, block_size.U, kchs,
         next_b === 0.U && next_orow === 0.U && next_ocol === 0.U)
-      val next_kcol = floorAdd(kcol, 1.U, kcols,
+      val next_kcol = floorAdd(kcol, req.max_pixels_per_row, kcols,
         next_kch === 0.U && next_b === 0.U && next_orow === 0.U && next_ocol === 0.U)
       val next_krow = floorAdd(krow, 1.U, krows,
         next_kcol === 0.U && next_kch === 0.U && next_b === 0.U && next_orow === 0.U && next_ocol === 0.U)
@@ -1049,6 +1059,8 @@ class LoopConvState(val block_size: Int, val large_iterator_bitwidth: Int, val s
   val trans_weight_0132 = Bool()
   val trans_input_3120 = Bool()
 
+  val max_pixels_per_row = UInt(small_iterator_bitwidth.W)
+
   val configured = Bool()
 
   val running = Bool()
@@ -1137,7 +1149,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I
   config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2, mvout_rs2_t: MvoutRs2,
   config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs,
   compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs,
-  has_training_convs: Boolean, has_max_pool: Boolean)
+  has_training_convs: Boolean, has_max_pool: Boolean, has_first_layer_optimizations: Boolean)
   (implicit p: Parameters) extends Module {
   val large_iterator_bitwidth = 16
   val small_iterator_bitwidth = 16 // 8
@@ -1289,6 +1301,12 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I
       is (LOOP_CONV_WS) {
         loop_being_configured.no_bias := cmd.bits.rs1(0)
 
+        // TODO: The default value for max_pixels_per_row exists only to maintain backwards compatibility. We should deprecate and remove it later.
+        val config_max_pixels_per_row = cmd.bits.rs1(15, 8)
+        loop_being_configured.max_pixels_per_row := Mux(
+          !has_first_layer_optimizations.B || config_max_pixels_per_row === 0.U,
+          1.U, config_max_pixels_per_row)
+
         loop_being_configured.wrot180 := has_training_convs.B && cmd.bits.rs1(1)
         loop_being_configured.input_dilated := has_training_convs.B && cmd.bits.rs2(2)
         loop_being_configured.trans_output_1203 := has_training_convs.B && cmd.bits.rs1(2)
@@ -1344,6 +1362,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I
   ld_input.io.req.bits.addr_start := loop_requesting_ld_input.a_addr_start
   ld_input.io.req.bits.dram_addr := loop_requesting_ld_input.input_dram_addr
   ld_input.io.req.bits.downsample := loop_requesting_ld_input.downsample
+  ld_input.io.req.bits.max_pixels_per_row := loop_requesting_ld_input.max_pixels_per_row
   ld_input.io.req.bits.input_dilated := loop_requesting_ld_input.input_dilated
   ld_input.io.req.bits.trans_input_3120 := loop_requesting_ld_input.trans_input_3120
   ld_input.io.req.bits.loop_id := loop_requesting_ld_input_id
@@ -1383,6 +1402,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I
   ex.io.req.bits.c_addr_start := ex_c_addr_start
   ex.io.req.bits.wrot180 := loop_requesting_ex.wrot180
   ex.io.req.bits.downsample := loop_requesting_ex.downsample
+  ex.io.req.bits.max_pixels_per_row := loop_requesting_ex.max_pixels_per_row
   ex.io.req.bits.input_dilated := loop_requesting_ex.input_dilated
   ex.io.req.bits.trans_weight_0132 := loop_requesting_ex.trans_weight_0132
   ex.io.req.bits.trans_input_3120 := loop_requesting_ex.trans_input_3120
@@ -1465,13 +1485,14 @@ object LoopConv {
             max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int,
             config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2,
             mvout_rs2_t: MvoutRs2, config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs,
-            compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, has_training_convs: Boolean, has_max_pool: Boolean)
+            compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, has_training_convs: Boolean, has_max_pool: Boolean,
+            has_first_layer_optimizations: Boolean)
            (implicit p: Parameters): Tuple2[DecoupledIO[RoCCCommand], Bool] = {
 
     val mod = Module(new LoopConv(block_size, coreMaxAddrBits, rob_size, max_lds, max_exs, max_sts,
       max_addr, max_acc_addr, input_w, acc_w, dma_max_bytes,
       config_mvin_rs1_t, mvin_rs2_t, config_mvout_rs2_t, mvout_rs2_t, config_ex_rs1_t, preload_rs1_t, preload_rs2_t,
-      compute_rs1_t, compute_rs2_t, has_training_convs, has_max_pool))
+      compute_rs1_t, compute_rs2_t, has_training_convs, has_max_pool, has_first_layer_optimizations))
 
     mod.io.in <> in
     mod.io.ld_utilization := ld_utilization
diff --git a/src/main/scala/gemmini/PixelRepeater.scala b/src/main/scala/gemmini/PixelRepeater.scala
new file mode 100644
index 00000000..0413304e
--- /dev/null
+++ b/src/main/scala/gemmini/PixelRepeater.scala
@@ -0,0 +1,95 @@
+package gemmini
+
+import chisel3._
+import chisel3.util._
+
+import Util._
+
+class PixelRepeaterReq[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_cols: Int, tag_t: Tag) extends Bundle { // Request to PixelRepeater: one row of elements plus how many shifted copies of it to emit
+  val in: Vec[T] = Vec(block_cols, t.cloneType) // the row of elements to be repeated
+  val mask: Vec[Bool] = Vec(block_cols, Bool()) // write mask that PixelRepeater shifts alongside `in`
+  val laddr: LocalAddr = laddr_t.cloneType // local address of the row; PixelRepeater subtracts the repeat counter from it
+  val len: UInt = UInt(log2Up(block_cols+1).W) // per-repeat shift in elements (PixelRepeater shifts by pixel_repeats * len); width is derived from block_cols
+  val pixel_repeats: UInt = UInt(8.W) // TODO magic number; number of repeats requested, PixelRepeater stores it minus one and counts down
+  val last: Bool = Bool() // marks the last request; PixelRepeater forwards it only on the final repeat
+  val tag: Tag = tag_t.cloneType // opaque payload copied unchanged into the response
+
+  assert(block_cols <= 255, "len must be longer") // NOTE(review): Boolean condition, so presumably an elaboration-time check; the message mentions len, whose width already scales with block_cols — confirm what this bound protects
+
+  override def cloneType: PixelRepeaterReq.this.type = new PixelRepeaterReq(t, laddr_t, block_cols, tag_t).asInstanceOf[this.type]
+}
+
+class PixelRepeaterResp[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_cols: Int, tag_t: Tag) extends Bundle { // Response from PixelRepeater: one (possibly shifted) copy of a requested row
+  val out: Vec[T] = Vec(block_cols, t.cloneType) // row data after the per-repeat shift
+  val mask: Vec[Bool] = Vec(block_cols, Bool()) // write mask after the per-repeat shift
+  val laddr: LocalAddr = laddr_t.cloneType // local address this copy is destined for
+  val last: Bool = Bool() // set only on the final repeat of a request that had `last` set
+  val tag: Tag = tag_t.cloneType // the request's tag, passed through unchanged
+
+  override def cloneType: PixelRepeaterResp.this.type = new PixelRepeaterResp(t, laddr_t, block_cols, tag_t).asInstanceOf[this.type]
+}
+
+class PixelRepeater[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_cols: Int, aligned_to: Int, tag_t: Tag, passthrough: Boolean) extends Module { // Replays each request once per repeat, shifting data/mask by pixel_repeats * len and lowering the address by pixel_repeats
+  val io = IO(new Bundle {
+    val req = Flipped(Decoupled(new PixelRepeaterReq(t, laddr_t, block_cols, tag_t)))
+    val resp = Decoupled(new PixelRepeaterResp(t, laddr_t, block_cols, tag_t))
+  })
+
+  if (passthrough) { // no repeat hardware is generated: requests are forwarded combinationally and len / pixel_repeats are ignored
+    io.resp.valid := io.req.valid
+    io.resp.bits.out := io.req.bits.in
+    io.resp.bits.mask := io.req.bits.mask
+    io.resp.bits.laddr := io.req.bits.laddr
+    io.resp.bits.last := io.req.bits.last
+    io.resp.bits.tag := io.req.bits.tag
+
+    io.req.ready := io.resp.ready
+  } else {
+    val req = Reg(UDValid(io.req.bits.cloneType)) // the request currently being repeated; its pixel_repeats field doubles as the down-counter
+
+    io.req.ready := !req.valid || (io.resp.ready && req.bits.pixel_repeats === 0.U) // accept when idle, or while the final repeat (counter 0) can be handed to the consumer
+
+    val out_shift = Wire(UInt(log2Up(block_cols / 2 + 1).W)) // shift for the current repeat, in elements
+    out_shift := req.bits.pixel_repeats * req.bits.len // NOTE(review): the product is presumably truncated to out_shift's width — confirm this bound is intended
+
+    io.resp.bits.out := (req.bits.in.asUInt() << (out_shift * t.getWidth.U)).asTypeOf(io.resp.bits.out) // shift the whole row left by out_shift elements
+    io.resp.bits.mask := (req.bits.mask.asUInt() << (out_shift * ((t.getWidth / 8) / aligned_to).U)).asTypeOf(io.resp.bits.mask) // NOTE(review): assumes one mask bit per aligned_to bytes — confirm
+
+    io.resp.bits.last := req.bits.last && (req.bits.pixel_repeats === 0.U) // only the final repeat carries `last`
+    io.resp.bits.tag := req.bits.tag
+
+    val is_acc_addr = req.bits.laddr.is_acc_addr
+    assert(!(req.valid && is_acc_addr && req.bits.pixel_repeats > 0.U)) // accumulator addresses must not have any remaining repeats
+
+    val sp_addr = Mux(req.bits.laddr.full_sp_addr() < (laddr_t.spRows / 2).U, // floor is 0 for the lower half of the scratchpad rows, spRows / 2 for the upper half
+      req.bits.laddr.floorSub(req.bits.pixel_repeats, 0.U)._1,
+      req.bits.laddr.floorSub(req.bits.pixel_repeats, (laddr_t.spRows / 2).U)._1,
+    )
+
+    val underflow = !is_acc_addr && Mux(req.bits.laddr.full_sp_addr() < (laddr_t.spRows / 2).U, // true when this repeat's address would fall below its floor
+      req.bits.laddr.floorSub(req.bits.pixel_repeats, 0.U)._2,
+      req.bits.laddr.floorSub(req.bits.pixel_repeats, (laddr_t.spRows / 2).U)._2,
+    )
+
+    io.resp.bits.laddr := Mux(is_acc_addr, req.bits.laddr, sp_addr)
+
+    io.resp.valid := req.valid && !underflow // underflowing repeats are never presented downstream
+
+    when(io.resp.fire() || underflow) { // advance the counter; an underflowing repeat is skipped without waiting for resp.ready
+      req.bits.pixel_repeats := req.bits.pixel_repeats - 1.U
+
+      when(req.bits.pixel_repeats === 0.U) { // counter was already 0, so this was the final repeat
+        req.pop()
+      }
+    }
+
+    when(io.req.fire()) { // written after the block above so these connections take priority when both fire in the same cycle
+      req.push(io.req.bits)
+      req.bits.pixel_repeats := io.req.bits.pixel_repeats - 1.U // counter is stored as (repeats - 1) and reaches 0 on the final repeat
+    }
+
+    when(reset.toBool()) { // presumably clears the valid bit so the module is idle after reset
+      req.pop()
+    }
+  }
+}
diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala
index 929685f6..44b992ae 100644
--- a/src/main/scala/gemmini/ReservationStation.scala
+++ b/src/main/scala/gemmini/ReservationStation.scala
@@ -115,7 +115,6 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
   val solitary_preload = utilization === 1.U && entries.map(e => e.valid && e.bits.cmd.inst.funct === PRELOAD_CMD).reduce(_ || _)
   io.busy := !empty && !(solitary_preload && io.solitary_preload)
 
-
   // Config values set by programmer
   val a_stride = Reg(UInt(16.W)) // TODO magic numbers
   val c_stride = Reg(UInt(16.W)) // TODO magic numbers
@@ -123,6 +122,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
   val ld_block_strides = Reg(Vec(load_states, UInt(block_stride_bits.W)))
   val st_block_stride = block_rows.U
   val pooling_is_enabled = Reg(Bool())
+  val ld_pixel_repeats = Reg(Vec(load_states, UInt(8.W))) // This is the ld_pixel_repeat MINUS ONE // TODO magic numbers
 
   val new_entry = Wire(new Entry)
   new_entry := DontCare
@@ -245,6 +245,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
       val id = MuxCase(0.U, Seq((new_entry.cmd.inst.funct === LOAD2_CMD) -> 1.U,
         (new_entry.cmd.inst.funct === LOAD3_CMD) -> 2.U))
       val block_stride = ld_block_strides(id)
+      val pixel_repeats = ld_pixel_repeats(id)
 
       val mvin_cols = cmd.rs2(32 + mvin_cols_bits - 1, 32)
       val mvin_rows = cmd.rs2(48 + mvin_rows_bits - 1, 48)
@@ -252,6 +253,18 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
       val mvin_mats = mvin_cols / block_cols.U + (mvin_cols % block_cols.U =/= 0.U)
       val total_mvin_rows = ((mvin_mats - 1.U) * block_stride) + mvin_rows
 
+      // TODO We have to know how the LoopConv's internals work here. Our abstractions are leaking
+      if (has_first_layer_optimizations) {
+        val start = cmd.rs2(31, 0).asTypeOf(local_addr_t)
+        // TODO instead of using a floor-sub that's hardcoded to the Scratchpad bank boundaries, we should find some way of letting the programmer specify the start address
+        dst.bits.start := Mux(start.is_acc_addr, start,
+          Mux(start.full_sp_addr() > (local_addr_t.spRows / 2).U,
+            start.floorSub(pixel_repeats, (local_addr_t.spRows / 2).U)._1,
+            start.floorSub(pixel_repeats, 0.U)._1,
+          )
+        )
+      }
+
       dst.bits.end := dst.bits.start + total_mvin_rows
       dst.bits.wraps_around := dst.bits.start.add_with_overflow(total_mvin_rows)._2
     }
@@ -365,7 +378,9 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
       }.elsewhen(new_entry.is_config && new_entry.q === ldq) {
         val id = new_entry.cmd.rs1(4,3) // TODO magic numbers
         val block_stride = new_entry.cmd.rs1(31, 16) // TODO magic numbers
+        val repeat_pixels = new_entry.cmd.rs1(15, 8) // TODO magic numbers; 0 is the legacy encoding, which LoadController treats as 1 repeat
         ld_block_strides(id) := block_stride
+        ld_pixel_repeats(id) := Mux(repeat_pixels === 0.U, 0.U, repeat_pixels - 1.U) // stored MINUS ONE; a legacy 0 must map to 0 here instead of wrapping around to 255
       }.elsewhen(new_entry.is_config && new_entry.q === stq) {
         val pool_stride = new_entry.cmd.rs1(5, 4) // TODO magic numbers
         pooling_is_enabled := pool_stride =/= 0.U
diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala
index 0d76758d..764b5d5a 100644
--- a/src/main/scala/gemmini/Scratchpad.scala
+++ b/src/main/scala/gemmini/Scratchpad.scala
@@ -20,6 +20,7 @@ class ScratchpadMemReadRequest[U <: Data](local_addr_t: LocalAddr, scale_t_bits:
   val has_acc_bitwidth = Bool()
   val all_zeros = Bool()
   val block_stride = UInt(16.W) // TODO magic numbers
+  val pixel_repeats = UInt(8.W) // TODO magic numbers
   val cmd_id = UInt(8.W) // TODO don't use a magic number here
   val status = new MStatus
 
@@ -256,7 +257,6 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
     write_issue_q.io.enq.valid := false.B
     write_issue_q.io.enq.bits := write_scale_q.io.deq.bits
 
-
     // Garbage can immediately fire between dispatch_q and scale_q
     when (write_dispatch_q.bits.laddr.is_garbage()) {
       write_scale_q.io.enq <> write_dispatch_q
@@ -266,7 +266,6 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
       write_issue_q.io.enq <> write_scale_q.io.deq
     }
 
-
     val writeData = Wire(Valid(UInt((spad_w max acc_w).W)))
     writeData.valid := write_issue_q.io.deq.bits.laddr.is_garbage()
     writeData.bits := DontCare
@@ -312,7 +311,20 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
     zero_writer.io.req.bits.block_stride := io.dma.read.req.bits.block_stride
     zero_writer.io.req.bits.tag := io.dma.read.req.bits
 
-    zero_writer.io.resp.ready := false.B
+    // zero_writer.io.resp.ready := false.B
+
+    val zero_writer_pixel_repeater = Module(new PixelRepeater(inputType, local_addr_t, block_cols, aligned_to, new ScratchpadMemReadRequest(local_addr_t, mvin_scale_t_bits), passthrough = !has_first_layer_optimizations))
+    zero_writer_pixel_repeater.io.req.valid := zero_writer.io.resp.valid
+    zero_writer_pixel_repeater.io.req.bits.in := 0.U.asTypeOf(Vec(block_cols, inputType))
+    zero_writer_pixel_repeater.io.req.bits.mask := zero_writer.io.resp.bits.mask
+    zero_writer_pixel_repeater.io.req.bits.laddr := zero_writer.io.resp.bits.laddr
+    zero_writer_pixel_repeater.io.req.bits.len := zero_writer.io.resp.bits.tag.cols
+    zero_writer_pixel_repeater.io.req.bits.pixel_repeats := zero_writer.io.resp.bits.tag.pixel_repeats
+    zero_writer_pixel_repeater.io.req.bits.last := zero_writer.io.resp.bits.last
+    zero_writer_pixel_repeater.io.req.bits.tag := zero_writer.io.resp.bits.tag
+
+    zero_writer.io.resp.ready := zero_writer_pixel_repeater.io.req.ready
+    zero_writer_pixel_repeater.io.resp.ready := false.B
 
     reader.module.io.req.valid := read_issue_q.io.deq.valid
     read_issue_q.io.deq.ready := reader.module.io.req.ready
@@ -321,6 +333,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
       read_issue_q.io.deq.bits.laddr.full_acc_addr(), read_issue_q.io.deq.bits.laddr.full_sp_addr())
     reader.module.io.req.bits.len := read_issue_q.io.deq.bits.cols
     reader.module.io.req.bits.repeats := read_issue_q.io.deq.bits.repeats
+    reader.module.io.req.bits.pixel_repeats := read_issue_q.io.deq.bits.pixel_repeats
     reader.module.io.req.bits.scale := read_issue_q.io.deq.bits.scale
     reader.module.io.req.bits.is_acc := read_issue_q.io.deq.bits.laddr.is_acc_addr
     reader.module.io.req.bits.accumulate := read_issue_q.io.deq.bits.laddr.accumulate
@@ -348,10 +361,22 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
     mvin_scale_in.bits.in := reader.module.io.resp.bits.data.asTypeOf(chiselTypeOf(mvin_scale_in.bits.in))
     mvin_scale_in.bits.scale := reader.module.io.resp.bits.scale.asTypeOf(mvin_scale_t)
     mvin_scale_in.bits.repeats := reader.module.io.resp.bits.repeats
+    mvin_scale_in.bits.pixel_repeats := reader.module.io.resp.bits.pixel_repeats
     mvin_scale_in.bits.last := reader.module.io.resp.bits.last
     mvin_scale_in.bits.tag := reader.module.io.resp.bits
 
-    mvin_scale_out.ready := false.B
+    val mvin_scale_pixel_repeater = Module(new PixelRepeater(inputType, local_addr_t, block_cols, aligned_to, mvin_scale_out.bits.tag.cloneType, passthrough = !has_first_layer_optimizations))
+    mvin_scale_pixel_repeater.io.req.valid := mvin_scale_out.valid
+    mvin_scale_pixel_repeater.io.req.bits.in := mvin_scale_out.bits.out
+    mvin_scale_pixel_repeater.io.req.bits.mask := mvin_scale_out.bits.tag.mask take mvin_scale_pixel_repeater.io.req.bits.mask.size
+    mvin_scale_pixel_repeater.io.req.bits.laddr := mvin_scale_out.bits.tag.addr.asTypeOf(local_addr_t) + mvin_scale_out.bits.row
+    mvin_scale_pixel_repeater.io.req.bits.len := mvin_scale_out.bits.tag.len
+    mvin_scale_pixel_repeater.io.req.bits.pixel_repeats := mvin_scale_out.bits.tag.pixel_repeats
+    mvin_scale_pixel_repeater.io.req.bits.last := mvin_scale_out.bits.last
+    mvin_scale_pixel_repeater.io.req.bits.tag := mvin_scale_out.bits.tag
+
+    mvin_scale_out.ready := mvin_scale_pixel_repeater.io.req.ready
+    mvin_scale_pixel_repeater.io.resp.ready := false.B
 
     if (!mvin_scale_shared) {
       mvin_scale_acc_in.valid := reader.module.io.resp.valid &&
@@ -359,6 +384,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
       mvin_scale_acc_in.bits.in := reader.module.io.resp.bits.data.asTypeOf(chiselTypeOf(mvin_scale_acc_in.bits.in))
       mvin_scale_acc_in.bits.scale := reader.module.io.resp.bits.scale.asTypeOf(mvin_scale_acc_t)
       mvin_scale_acc_in.bits.repeats := reader.module.io.resp.bits.repeats
+      mvin_scale_acc_in.bits.pixel_repeats := 1.U
       mvin_scale_acc_in.bits.last := reader.module.io.resp.bits.last
       mvin_scale_acc_in.bits.tag := reader.module.io.resp.bits
 
@@ -368,23 +394,33 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
     reader.module.io.resp.ready := Mux(reader.module.io.resp.bits.is_acc && reader.module.io.resp.bits.has_acc_bitwidth,
       mvin_scale_acc_in.ready, mvin_scale_in.ready)
 
-    val mvin_scale_finished = mvin_scale_out.fire() && mvin_scale_out.bits.last
+    // val mvin_scale_finished = mvin_scale_out.fire() && mvin_scale_out.bits.last
+    val mvin_scale_finished = mvin_scale_pixel_repeater.io.resp.fire() && mvin_scale_pixel_repeater.io.resp.bits.last
     val mvin_scale_acc_finished = mvin_scale_acc_out.fire() && mvin_scale_acc_out.bits.last
-    val zero_writer_finished = zero_writer.io.resp.fire() && zero_writer.io.resp.bits.last
+    // val zero_writer_finished = zero_writer.io.resp.fire() && zero_writer.io.resp.bits.last
+    val zero_writer_finished = zero_writer_pixel_repeater.io.resp.fire() && zero_writer_pixel_repeater.io.resp.bits.last
 
+    /*
     val zero_writer_bytes_read = Mux(zero_writer.io.resp.bits.laddr.is_acc_addr,
       zero_writer.io.resp.bits.tag.cols * (accType.getWidth / 8).U,
       zero_writer.io.resp.bits.tag.cols * (inputType.getWidth / 8).U)
+    */
+    val zero_writer_bytes_read = Mux(zero_writer_pixel_repeater.io.resp.bits.laddr.is_acc_addr,
+      zero_writer_pixel_repeater.io.resp.bits.tag.cols * (accType.getWidth / 8).U,
+      zero_writer_pixel_repeater.io.resp.bits.tag.cols * (inputType.getWidth / 8).U)
 
     // For DMA read responses, mvin_scale gets first priority, then mvin_scale_acc, and then zero_writer
     io.dma.read.resp.valid := mvin_scale_finished || mvin_scale_acc_finished || zero_writer_finished
 
-    io.dma.read.resp.bits.cmd_id := MuxCase(zero_writer.io.resp.bits.tag.cmd_id, Seq(
-      mvin_scale_finished -> mvin_scale_out.bits.tag.cmd_id,
+    // io.dma.read.resp.bits.cmd_id := MuxCase(zero_writer.io.resp.bits.tag.cmd_id, Seq(
+    io.dma.read.resp.bits.cmd_id := MuxCase(zero_writer_pixel_repeater.io.resp.bits.tag.cmd_id, Seq(
+      // mvin_scale_finished -> mvin_scale_out.bits.tag.cmd_id,
+      mvin_scale_finished -> mvin_scale_pixel_repeater.io.resp.bits.tag.cmd_id,
       mvin_scale_acc_finished -> mvin_scale_acc_out.bits.tag.cmd_id))
 
     io.dma.read.resp.bits.bytesRead := MuxCase(zero_writer_bytes_read, Seq(
-      mvin_scale_finished -> mvin_scale_out.bits.tag.bytes_read,
+      // mvin_scale_finished -> mvin_scale_out.bits.tag.bytes_read,
+      mvin_scale_finished -> mvin_scale_pixel_repeater.io.resp.bits.tag.bytes_read,
       mvin_scale_acc_finished -> mvin_scale_acc_out.bits.tag.bytes_read))
 
     io.tlb(0) <> writer.module.io.tlb
@@ -465,16 +501,21 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
       bank_ios.zipWithIndex.foreach { case (bio, i) =>
         val exwrite = io.srams.write(i).en
 
-        val laddr = mvin_scale_out.bits.tag.addr.asTypeOf(local_addr_t) + mvin_scale_out.bits.row
+        // val laddr = mvin_scale_out.bits.tag.addr.asTypeOf(local_addr_t) + mvin_scale_out.bits.row
+        val laddr = mvin_scale_pixel_repeater.io.resp.bits.laddr
 
-        val dmaread = mvin_scale_out.valid && !mvin_scale_out.bits.tag.is_acc &&
+        // val dmaread = mvin_scale_out.valid && !mvin_scale_out.bits.tag.is_acc &&
+        val dmaread = mvin_scale_pixel_repeater.io.resp.valid && !mvin_scale_pixel_repeater.io.resp.bits.tag.is_acc &&
           laddr.sp_bank() === i.U
 
         // We need to make sure that we don't try to return a dma read resp from both zero_writer and either mvin_scale
         // or mvin_acc_scale at the same time. The scalers always get priority in those cases
-        val zerowrite = zero_writer.io.resp.valid && !zero_writer.io.resp.bits.laddr.is_acc_addr &&
-          zero_writer.io.resp.bits.laddr.sp_bank() === i.U &&
-          !((mvin_scale_out.valid && mvin_scale_out.bits.last) || (mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.last))
+        /* val zerowrite = zero_writer.io.resp.valid && !zero_writer.io.resp.bits.laddr.is_acc_addr &&
+          zero_writer.io.resp.bits.laddr.sp_bank() === i.U && */
+        val zerowrite = zero_writer_pixel_repeater.io.resp.valid && !zero_writer_pixel_repeater.io.resp.bits.laddr.is_acc_addr &&
+          zero_writer_pixel_repeater.io.resp.bits.laddr.sp_bank() === i.U &&
+          // !((mvin_scale_out.valid && mvin_scale_out.bits.last) || (mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.last))
+          !((mvin_scale_pixel_repeater.io.resp.valid && mvin_scale_pixel_repeater.io.resp.bits.last) || (mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.last))
 
         bio.write.en := exwrite || dmaread || zerowrite
 
@@ -484,21 +525,27 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
           bio.write.mask := io.srams.write(i).mask
         }.elsewhen (dmaread) {
           bio.write.addr := laddr.sp_row()
-          bio.write.data := mvin_scale_out.bits.out.asUInt()
-          bio.write.mask := mvin_scale_out.bits.tag.mask take ((spad_w / (aligned_to * 8)) max 1)
+          // bio.write.data := mvin_scale_out.bits.out.asUInt()
+          // bio.write.mask := mvin_scale_out.bits.tag.mask take ((spad_w / (aligned_to * 8)) max 1)
+          bio.write.data := mvin_scale_pixel_repeater.io.resp.bits.out.asUInt()
+          bio.write.mask := mvin_scale_pixel_repeater.io.resp.bits.mask take ((spad_w / (aligned_to * 8)) max 1)
 
-          mvin_scale_out.ready := true.B // TODO we combinationally couple valid and ready signals
+          // mvin_scale_out.ready := true.B // TODO we combinationally couple valid and ready signals
+          mvin_scale_pixel_repeater.io.resp.ready := true.B // TODO we combinationally couple valid and ready signals
         }.elsewhen (zerowrite) {
-          bio.write.addr := zero_writer.io.resp.bits.laddr.sp_row()
+          // bio.write.addr := zero_writer.io.resp.bits.laddr.sp_row()
+          bio.write.addr := zero_writer_pixel_repeater.io.resp.bits.laddr.sp_row()
           bio.write.data := 0.U
           bio.write.mask := {
             val n = inputType.getWidth / 8
-            val mask = zero_writer.io.resp.bits.mask
+            // val mask = zero_writer.io.resp.bits.mask
+            val mask = zero_writer_pixel_repeater.io.resp.bits.mask
             val expanded = VecInit(mask.flatMap(e => Seq.fill(n)(e)))
             expanded
           }
 
-          zero_writer.io.resp.ready := true.B // TODO we combinationally couple valid and ready signals
+          // zero_writer.io.resp.ready := true.B // TODO we combinationally couple valid and ready signals
+          zero_writer_pixel_repeater.io.resp.ready := true.B // TODO we combinationally couple valid and ready signals
         }.otherwise {
           bio.write.addr := DontCare
           bio.write.data := DontCare
@@ -636,10 +683,12 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
         io.acc.write(i).ready := true.B
         assert(!(exwrite && !bio.write.ready), "Execute controller write to AccumulatorMem was skipped")
 
-        val from_mvin_scale = mvin_scale_out.valid && mvin_scale_out.bits.tag.is_acc
+        // val from_mvin_scale = mvin_scale_out.valid && mvin_scale_out.bits.tag.is_acc
+        val from_mvin_scale = mvin_scale_pixel_repeater.io.resp.valid && mvin_scale_pixel_repeater.io.resp.bits.tag.is_acc
         val from_mvin_scale_acc = mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.tag.is_acc
 
-        val mvin_scale_laddr = mvin_scale_out.bits.tag.addr.asTypeOf(local_addr_t) + mvin_scale_out.bits.row
+        // val mvin_scale_laddr = mvin_scale_out.bits.tag.addr.asTypeOf(local_addr_t) + mvin_scale_out.bits.row
+        val mvin_scale_laddr = mvin_scale_pixel_repeater.io.resp.bits.laddr
         val mvin_scale_acc_laddr = mvin_scale_acc_out.bits.tag.addr.asTypeOf(local_addr_t) + mvin_scale_acc_out.bits.row
 
         val dmaread_bank = Mux(from_mvin_scale, mvin_scale_laddr.acc_bank(),
@@ -648,7 +697,8 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
 
         // We need to make sure that we don't try to return a dma read resp from both mvin_scale and mvin_scale_acc
         // at the same time. mvin_scale always gets priority in this cases
-        val spad_last = mvin_scale_out.valid && mvin_scale_out.bits.last && !mvin_scale_out.bits.tag.is_acc
+        // val spad_last = mvin_scale_out.valid && mvin_scale_out.bits.last && !mvin_scale_out.bits.tag.is_acc
+        val spad_last = mvin_scale_pixel_repeater.io.resp.valid && mvin_scale_pixel_repeater.io.resp.bits.last && !mvin_scale_pixel_repeater.io.resp.bits.tag.is_acc
 
         val dmaread = (from_mvin_scale || from_mvin_scale_acc) &&
           dmaread_bank === i.U /* &&
@@ -656,9 +706,13 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
 
         // We need to make sure that we don't try to return a dma read resp from both zero_writer and either mvin_scale
         // or mvin_acc_scale at the same time. The scalers always get priority in those cases
-        val zerowrite = zero_writer.io.resp.valid && zero_writer.io.resp.bits.laddr.is_acc_addr &&
-          zero_writer.io.resp.bits.laddr.acc_bank() === i.U &&
-          !((mvin_scale_out.valid && mvin_scale_out.bits.last) || (mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.last))
+        /* val zerowrite = zero_writer.io.resp.valid && zero_writer.io.resp.bits.laddr.is_acc_addr &&
+          zero_writer.io.resp.bits.laddr.acc_bank() === i.U && */
+        val zerowrite = zero_writer_pixel_repeater.io.resp.valid && zero_writer_pixel_repeater.io.resp.bits.laddr.is_acc_addr &&
+          zero_writer_pixel_repeater.io.resp.bits.laddr.acc_bank() === i.U &&
+          // !((mvin_scale_out.valid && mvin_scale_out.bits.last) || (mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.last))
+          !((mvin_scale_pixel_repeater.io.resp.valid && mvin_scale_pixel_repeater.io.resp.bits.last) || (mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.last))
+
         val consecutive_write_block = RegInit(false.B)
         if (acc_singleported) {
           val consecutive_write_sub_bank = RegInit(0.U((1 max log2Ceil(acc_sub_banks)).W))
@@ -674,12 +728,15 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
         }
         bio.write.valid := false.B
 
-        bio.write.bits.acc := MuxCase(zero_writer.io.resp.bits.laddr.accumulate,
+        // bio.write.bits.acc := MuxCase(zero_writer.io.resp.bits.laddr.accumulate,
+        bio.write.bits.acc := MuxCase(zero_writer_pixel_repeater.io.resp.bits.laddr.accumulate,
           Seq(exwrite -> io.acc.write(i).bits.acc,
-            from_mvin_scale -> mvin_scale_out.bits.tag.accumulate,
+            // from_mvin_scale -> mvin_scale_out.bits.tag.accumulate,
+            from_mvin_scale -> mvin_scale_pixel_repeater.io.resp.bits.tag.accumulate,
             from_mvin_scale_acc -> mvin_scale_acc_out.bits.tag.accumulate))
 
-        bio.write.bits.addr := MuxCase(zero_writer.io.resp.bits.laddr.acc_row(),
+        // bio.write.bits.addr := MuxCase(zero_writer.io.resp.bits.laddr.acc_row(),
+        bio.write.bits.addr := MuxCase(zero_writer_pixel_repeater.io.resp.bits.laddr.acc_row(),
           Seq(exwrite -> io.acc.write(i).bits.addr,
             (from_mvin_scale || from_mvin_scale_acc) -> dmaread_row))
 
@@ -690,20 +747,23 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
         }.elsewhen (dmaread && !spad_last && !consecutive_write_block) {
           bio.write.valid := true.B
           bio.write.bits.data := Mux(from_mvin_scale,
-            VecInit(mvin_scale_out.bits.out.map(e => e.withWidthOf(accType))).asTypeOf(acc_row_t),
+            // VecInit(mvin_scale_out.bits.out.map(e => e.withWidthOf(accType))).asTypeOf(acc_row_t),
+            VecInit(mvin_scale_pixel_repeater.io.resp.bits.out.map(e => e.withWidthOf(accType))).asTypeOf(acc_row_t),
             mvin_scale_acc_out.bits.out.asTypeOf(acc_row_t))
           bio.write.bits.mask :=
             Mux(from_mvin_scale,
               {
                 val n = accType.getWidth / inputType.getWidth
-                val mask = mvin_scale_out.bits.tag.mask take ((spad_w / (aligned_to * 8)) max 1)
+                // val mask = mvin_scale_out.bits.tag.mask take ((spad_w / (aligned_to * 8)) max 1)
+                val mask = mvin_scale_pixel_repeater.io.resp.bits.mask take ((spad_w / (aligned_to * 8)) max 1)
                 val expanded = VecInit(mask.flatMap(e => Seq.fill(n)(e)))
                 expanded
               },
               mvin_scale_acc_out.bits.tag.mask)
 
           when(from_mvin_scale) {
-            mvin_scale_out.ready := bio.write.ready
+            // mvin_scale_out.ready := bio.write.ready
+            mvin_scale_pixel_repeater.io.resp.ready := bio.write.ready
           }.otherwise {
             mvin_scale_acc_out.ready := bio.write.ready
           }
@@ -712,12 +772,14 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
           bio.write.bits.data := 0.U.asTypeOf(acc_row_t)
           bio.write.bits.mask := {
             val n = accType.getWidth / 8
-            val mask = zero_writer.io.resp.bits.mask
+            // val mask = zero_writer.io.resp.bits.mask
+            val mask = zero_writer_pixel_repeater.io.resp.bits.mask
             val expanded = VecInit(mask.flatMap(e => Seq.fill(n)(e)))
             expanded
           }
 
-          zero_writer.io.resp.ready := bio.write.ready
+          // zero_writer.io.resp.ready := bio.write.ready
+          zero_writer_pixel_repeater.io.resp.ready := bio.write.ready
         }.otherwise {
           bio.write.bits.data := DontCare
           bio.write.bits.mask := DontCare
diff --git a/src/main/scala/gemmini/VectorScalarMultiplier.scala b/src/main/scala/gemmini/VectorScalarMultiplier.scala
index 05480e09..7cb8c14f 100644
--- a/src/main/scala/gemmini/VectorScalarMultiplier.scala
+++ b/src/main/scala/gemmini/VectorScalarMultiplier.scala
@@ -9,6 +9,7 @@ class VectorScalarMultiplierReq[T <: Data, U <: Data, Tag <: Data](block_cols: I
   val in: Vec[T] = Vec(block_cols, t.cloneType)
   val scale: U = u.cloneType
   val repeats: UInt = UInt(16.W) // TODO magic number
+  val pixel_repeats: UInt = UInt(8.W) // TODO magic number
   val last: Bool = Bool()
   val tag: Tag = tag_t.cloneType
 
@@ -81,7 +82,6 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data](
     in.valid := false.B
   }
 
-
   if (num_scale_units == -1) {
     val pipe = Module(new Pipeline(
       new VectorScalarMultiplierResp(block_cols, t, tag_t),
@@ -144,8 +144,6 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data](
       tail_oh := (tail_oh << 1) | tail_oh(nEntries-1)
     }
 
-
-
     val inputs = Seq.fill(width*nEntries) { Wire(Decoupled(new DataWithIndex(t, u))) }
     for (i <- 0 until nEntries) {
       for (w <- 0 until width) {
@@ -172,7 +170,6 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data](
         arbOut.valid := false.B
       }
 
-
       val pipe = Module(new ScalePipe(t, mvin_scale_args.get))
       pipe.io.in := arbOut
       val pipe_out = pipe.io.out
@@ -187,14 +184,11 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data](
         }
       }
     }
+
     when (reset.asBool) {
       regs.foreach(_.valid := false.B)
     }
-
-
   }
-
-
 }
 
 object VectorScalarMultiplier {
diff --git a/src/main/scala/gemmini/XactTracker.scala b/src/main/scala/gemmini/XactTracker.scala
index e8581a26..84821d4e 100644
--- a/src/main/scala/gemmini/XactTracker.scala
+++ b/src/main/scala/gemmini/XactTracker.scala
@@ -15,6 +15,8 @@ class XactTrackerEntry[U <: Data](maxShift: Int, spadWidth: Int, accWidth: Int,
   val has_acc_bitwidth = Bool()
   val scale = UInt(mvin_scale_t_bits.W)
   val repeats = UInt(16.W) // TODO magic number
+  val pixel_repeats = UInt(8.W) // TODO magic number
+  val len = UInt(16.W) // TODO magic number
   val block_stride = UInt(16.W) // TODO magic number
   val spad_row_offset = UInt(log2Up(spadWidth max accWidth).W)
   val lg_len_req = UInt(log2Up(log2Up(maxReqBytes+1)+1).W)

From 022a306db6ef5c8807102aa67ad28fdbcce46f63 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Sat, 4 Dec 2021 23:05:25 -0800
Subject: [PATCH 09/11] * Add default value of pixel-repeats to
 ReservationStation.scala * Reduce a few bitwidths in ReservationStation.scala

---
 src/main/scala/gemmini/GemminiConfigs.scala     | 3 +++
 src/main/scala/gemmini/ReservationStation.scala | 8 ++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala
index be1b084f..567ef060 100644
--- a/src/main/scala/gemmini/GemminiConfigs.scala
+++ b/src/main/scala/gemmini/GemminiConfigs.scala
@@ -163,6 +163,9 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
   val load_states = 3
   val block_stride_bits = 16 min (log2Up(acc_banks * acc_bank_entries) max log2Up(sp_banks * sp_bank_entries))
 
+  val a_stride_bits = 16 min (log2Up(acc_banks * acc_bank_entries) max log2Up(sp_banks * sp_bank_entries))
+  val c_stride_bits = 16 min (log2Up(acc_banks * acc_bank_entries) max log2Up(sp_banks * sp_bank_entries))
+
   val pixel_repeats_bits = 8 min log2Up(meshColumns * tileColumns + 1)
 
   val hasIm2Col = false
diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala
index 44b992ae..7135969f 100644
--- a/src/main/scala/gemmini/ReservationStation.scala
+++ b/src/main/scala/gemmini/ReservationStation.scala
@@ -116,13 +116,13 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
   io.busy := !empty && !(solitary_preload && io.solitary_preload)
 
   // Config values set by programmer
-  val a_stride = Reg(UInt(16.W)) // TODO magic numbers
-  val c_stride = Reg(UInt(16.W)) // TODO magic numbers
+  val a_stride = Reg(UInt(a_stride_bits.W))
+  val c_stride = Reg(UInt(c_stride_bits.W))
   val a_transpose = Reg(Bool())
   val ld_block_strides = Reg(Vec(load_states, UInt(block_stride_bits.W)))
   val st_block_stride = block_rows.U
   val pooling_is_enabled = Reg(Bool())
-  val ld_pixel_repeats = Reg(Vec(load_states, UInt(8.W))) // This is the ld_pixel_repeat MINUS ONE // TODO magic numbers
+  val ld_pixel_repeats = Reg(Vec(load_states, UInt(pixel_repeats_bits.W))) // This is the ld_pixel_repeat MINUS ONE
 
   val new_entry = Wire(new Entry)
   new_entry := DontCare
@@ -378,7 +378,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
       }.elsewhen(new_entry.is_config && new_entry.q === ldq) {
         val id = new_entry.cmd.rs1(4,3) // TODO magic numbers
         val block_stride = new_entry.cmd.rs1(31, 16) // TODO magic numbers
-        val repeat_pixels = new_entry.cmd.rs1(15, 8) // TODO magic numbers
+        val repeat_pixels = maxOf(new_entry.cmd.rs1(8 + pixel_repeats_bits - 1, 8), 1.U) // TODO we use a default value of pixel repeats here, for backwards compatibility. However, we should deprecate and remove this default value eventually
         ld_block_strides(id) := block_stride
         ld_pixel_repeats(id) := repeat_pixels - 1.U
       }.elsewhen(new_entry.is_config && new_entry.q === stq) {

From 3efa8917a86040228969e7c19e06f814f7ed1587 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Mon, 6 Dec 2021 00:07:07 -0800
Subject: [PATCH 10/11] Fix tlb hit counter (#168)

Fixes bug where TLB hits were being counted incorrectly.

Prior to this PR, we were using RegNext(io.req.fire()) to match TLB requests to TLB responses. However, we made our interface to the TLB combinational months ago, so the RegNext is no longer necessary (and is actually incorrect).
---
 src/main/scala/gemmini/FrontendTLB.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/main/scala/gemmini/FrontendTLB.scala b/src/main/scala/gemmini/FrontendTLB.scala
index 269409fc..bc028ee9 100644
--- a/src/main/scala/gemmini/FrontendTLB.scala
+++ b/src/main/scala/gemmini/FrontendTLB.scala
@@ -66,12 +66,12 @@ class DecoupledTLB(entries: Int, maxSize: Int, use_firesim_simulation_counters:
   assert(!io.exp.flush_retry || !io.exp.flush_skip, "TLB: flushing with both retry and skip at same time")
 
   CounterEventIO.init(io.counter)
-  io.counter.connectEventSignal(CounterEvent.DMA_TLB_HIT_REQ, RegNext(io.req.fire()) && !tlb.io.resp.miss)
+  io.counter.connectEventSignal(CounterEvent.DMA_TLB_HIT_REQ, io.req.fire() && !tlb.io.resp.miss)
   io.counter.connectEventSignal(CounterEvent.DMA_TLB_TOTAL_REQ, io.req.fire())
   io.counter.connectEventSignal(CounterEvent.DMA_TLB_MISS_CYCLE, tlb.io.resp.miss)
 
   if (use_firesim_simulation_counters) {
-    PerfCounter(RegNext(io.req.fire()) && !tlb.io.resp.miss, "tlb_hits", "total number of tlb hits")
+    PerfCounter(io.req.fire() && !tlb.io.resp.miss, "tlb_hits", "total number of tlb hits")
     PerfCounter(io.req.fire(), "tlb_reqs", "total number of tlb reqs")
     PerfCounter(tlb.io.resp.miss, "tlb_miss_cycles", "total number of cycles where the tlb is resolving a miss")
   }

From 56e85a2075a96ca2c14f3e4108a3b3a870bdee46 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Mon, 6 Dec 2021 14:02:45 -0800
Subject: [PATCH 11/11] Rename tiled_conv_A_stride to tiled_conv and clean up
 unused conv implementations (#169)

Rename tiled_conv_A_stride to tiled_conv and clean up unused conv implementations
---
 software/gemmini-rocc-tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index 5fa954ee..21713ec6 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit 5fa954ee9cf97483cd9c765d9f4c664d1701090d
+Subproject commit 21713ec6e9dbbf2477b092e04eb8970776a5da72