From eab7ca086bfa1274ed68d17d6e2ee6c43f5fd4e0 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Tue, 26 Oct 2021 01:32:07 -0700 Subject: [PATCH 01/11] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3f81518f..b90bed69 100644 --- a/README.md +++ b/README.md @@ -520,3 +520,4 @@ If Gemmini helps you in your academic research, you are encouraged to cite our p # Acknowledgements - The Gemmini [logo](./img/full-logo.svg) was designed by Dima Nikiforov ([@CobbledSteel](https://github.com/CobbledSteel)). +- This project was, in part, funded by the U.S. Government under the DARPA RTML program (contract FA8650-20-2-7006). The views and conclusions contained in this document are those of the authors and should not be interpreted as representing the official policies, either expressed or implied, of the U.S. Government. From af73a9517f1c60870dbce977791157f8be326e7b Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Tue, 26 Oct 2021 09:23:09 -0700 Subject: [PATCH 02/11] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b90bed69..428a0dad 100644 --- a/README.md +++ b/README.md @@ -519,5 +519,5 @@ If Gemmini helps you in your academic research, you are encouraged to cite our p # Acknowledgements -- The Gemmini [logo](./img/full-logo.svg) was designed by Dima Nikiforov ([@CobbledSteel](https://github.com/CobbledSteel)). - This project was, in part, funded by the U.S. Government under the DARPA RTML program (contract FA8650-20-2-7006). The views and conclusions contained in this document are those of the authors and should not be interpreted as representing the official policies, either expressed or implied, of the U.S. Government. +- The Gemmini [logo](./img/full-logo.svg) was designed by Dima Nikiforov ([@CobbledSteel](https://github.com/CobbledSteel)). From d42745c70499acdf255a5f5929986237b812c8ad Mon Sep 17 00:00:00 2001 From: "Ruohan (Richard) Yan" Date: Thu, 28 Oct 2021 21:53:39 -0700 Subject: [PATCH 03/11] Gemmini ISA Bundles (#149) * wip load/store * created & parameterized bundles * fix config ex rs1 * optimize loopconv & loopmatmul with bundles; add bundles for preload and compute * move assignments to pipeline output Co-authored-by: Ruohan Yan --- src/main/scala/gemmini/Controller.scala | 12 +- .../scala/gemmini/ExecuteController.scala | 22 +- src/main/scala/gemmini/GemminiISA.scala | 170 ++++++++++++++ src/main/scala/gemmini/LoadController.scala | 23 +- src/main/scala/gemmini/LoopConv.scala | 211 ++++++++++++++---- src/main/scala/gemmini/LoopMatmul.scala | 99 ++++++-- src/main/scala/gemmini/StoreController.scala | 54 ++--- 7 files changed, 481 insertions(+), 110 deletions(-) diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala index cdb708f1..e63d7451 100644 --- a/src/main/scala/gemmini/Controller.scala +++ b/src/main/scala/gemmini/Controller.scala @@ -127,7 +127,12 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] // TODO replace 4,12,2 with parameters based on ROB size val (conv_cmd, loop_conv_unroller_busy) = LoopConv(raw_cmd, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization, meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries, - inputType.getWidth, accType.getWidth, dma_maxbytes) + inputType.getWidth, accType.getWidth, dma_maxbytes, + new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits), new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t), + new ConfigMvoutRs2(acc_scale_t_bits, 32), new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t), + new ConfigExRs1(acc_scale_t_bits), new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), + new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t), + new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t)) // val (compressed_cmd, compressor_busy) = InstCompressor(unrolled_cmd) // compressed_cmd.ready := false.B @@ -136,7 +141,10 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] val (loop_cmd, loop_matmul_unroller_busy) = LoopMatmul(conv_cmd, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization, meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries, - inputType.getWidth, accType.getWidth, dma_maxbytes) + inputType.getWidth, accType.getWidth, dma_maxbytes, new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t), + new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t), + new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), + new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t)) val unrolled_cmd = Queue(loop_cmd) unrolled_cmd.ready := false.B diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala index 2b790b7b..db9a894e 100644 --- a/src/main/scala/gemmini/ExecuteController.scala +++ b/src/main/scala/gemmini/ExecuteController.scala @@ -538,27 +538,29 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In when(cmd.valid(0)) { when(DoConfig && !matmul_in_progress && !pending_completed_rob_ids.map(_.valid).reduce(_ || _)) { + val config_ex_rs1 = rs1s(0).asTypeOf(new ConfigExRs1(acc_scale_t_bits)) + val config_ex_rs2 = rs2s(0).asTypeOf(new ConfigExRs2) + val config_cmd_type = rs1s(0)(1,0) // TODO magic numbers when (config_cmd_type === CONFIG_EX) { - val set_only_strides = rs1s(0)(7) // TODO magic number + val set_only_strides = config_ex_rs1.set_only_strides when (!set_only_strides) { - activation := rs1s(0)(4, 3) // TODO magic number - in_shift := rs2s(0)(31, 0) // TODO magic number + activation := config_ex_rs1.activation + in_shift := config_ex_rs2.in_shift acc_scale := rs1s(0)(xLen - 1, 32).asTypeOf(acc_scale_args.multiplicand_t) // TODO magic number - relu6_shift := rs2s(0)(47, 32) // TODO magic number - a_transpose := rs1s(0)(8) // TODO magic number - bd_transpose := rs1s(0)(9) // TODO magic number + relu6_shift := config_ex_rs2.relu6_shift + a_transpose := config_ex_rs1.a_transpose + bd_transpose := config_ex_rs1.b_transpose if (dataflow == Dataflow.BOTH) { - current_dataflow := rs1s(0)(2) // TODO magic number + current_dataflow := config_ex_rs1.dataflow } } - a_addr_stride := rs1s(0)(31, 16) // TODO magic number // TODO this needs to be kept in sync with ROB.scala - c_addr_stride := rs2s(0)(63, 48) // TODO magic number // TODO this needs to be kept in sync with ROB.scala - + a_addr_stride := config_ex_rs1.a_stride // TODO this needs to be kept in sync with ROB.scala + c_addr_stride := config_ex_rs2.c_stride // TODO this needs to be kept in sync with ROB.scala config_initialized := true.B }.otherwise { // config_cmd_type === CONFIG_IM2COL ocol := cmd.bits(0).cmd.rs2(63, 56) diff --git a/src/main/scala/gemmini/GemminiISA.scala b/src/main/scala/gemmini/GemminiISA.scala index f9d7a1ba..554bcdeb 100644 --- a/src/main/scala/gemmini/GemminiISA.scala +++ b/src/main/scala/gemmini/GemminiISA.scala @@ -1,3 +1,4 @@ + package gemmini import chisel3._ @@ -56,4 +57,173 @@ object GemminiISA { // dataflow configuration //========================================================================== val GARBAGE_ADDR = "hffffffff".U(32.W) + + val MVIN_RS2_ADDR_WIDTH = 32 + val MVIN_RS2_COLS_WIDTH = 16 + val MVIN_RS2_ROWS_WIDTH = 16 + + class MvinRs2(mvin_rows_bits: Int, mvin_cols_bits: Int, local_addr_t: LocalAddr) extends Bundle { + val _spacer2 = UInt((MVIN_RS2_ROWS_WIDTH - mvin_rows_bits).W) + val num_rows = UInt(mvin_rows_bits.W) + val _spacer1 = UInt((MVIN_RS2_COLS_WIDTH - mvin_cols_bits).W) + val num_cols = UInt(mvin_cols_bits.W) + val _spacer0 = UInt((MVIN_RS2_ADDR_WIDTH - local_addr_t.getWidth).W) + val local_addr = local_addr_t.cloneType + + override def cloneType: MvinRs2.this.type = + (new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t)).asInstanceOf[this.type] + } + + val MVOUT_RS2_ADDR_WIDTH = 32 + val MVOUT_RS2_COLS_WIDTH = 16 + val MVOUT_RS2_ROWS_WIDTH = 16 + + class MvoutRs2(mvout_rows_bits: Int, mvout_cols_bits: Int, local_addr_t: LocalAddr) extends Bundle { + val _spacer2 = UInt((MVOUT_RS2_ROWS_WIDTH - mvout_rows_bits).W) + val num_rows = UInt(mvout_rows_bits.W) + val _spacer1 = UInt((MVOUT_RS2_COLS_WIDTH - mvout_cols_bits).W) + val num_cols = UInt(mvout_cols_bits.W) + val _spacer0 = UInt((MVOUT_RS2_ADDR_WIDTH - local_addr_t.getWidth).W) + val local_addr = local_addr_t.cloneType + + override def cloneType: MvoutRs2.this.type = + (new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t)).asInstanceOf[this.type] + } + + val CONFIG_MVIN_RS1_UNUSED_WIDTH = 2 + val CONFIG_MVIN_RS1_SHRINK_WIDTH = 1 + val CONFIG_MVIN_RS1_STATE_ID_WIDTH = 2 + val CONFIG_MVIN_RS1_SPACER_WIDTH = (16 - 2 - 1 - 2) + val CONFIG_MVIN_RS1_STRIDE_WIDTH = 16 + val CONFIG_MVIN_RS1_SCALE_WIDTH = 32 + + class ConfigMvinRs1(scale_bits: Int, stride_bits: Int) extends Bundle { + val _spacer2 = UInt((CONFIG_MVIN_RS1_SCALE_WIDTH - scale_bits).W) + val scale = UInt(scale_bits.W) + val _spacer1 = UInt((CONFIG_MVIN_RS1_STRIDE_WIDTH - stride_bits).W) + val stride = UInt(stride_bits.W) + val _spacer0 = UInt(CONFIG_MVIN_RS1_SPACER_WIDTH.W) + val state_id = UInt(CONFIG_MVIN_RS1_STATE_ID_WIDTH.W) + val shrink = UInt(CONFIG_MVIN_RS1_SHRINK_WIDTH.W) + val _unused = UInt(CONFIG_MVIN_RS1_UNUSED_WIDTH.W) + + override def cloneType: ConfigMvinRs1.this.type = + (new ConfigMvinRs1(scale_bits, stride_bits)).asInstanceOf[this.type] + } + + val CONFIG_MVOUT_RS1_UNUSED_WIDTH = 2 + val CONFIG_MVOUT_RS1_ACTIVATION_WIDTH = 2 + val CONFIG_MVOUT_RS1_MAX_POOLING_STRIDE_WIDTH = 2 + val CONFIG_MVOUT_RS1_MAX_POOLING_WINDOW_SIZE_WIDTH = 2 + val CONFIG_MVOUT_RS1_UPPER_ZERO_PADDING_WIDTH = 2 + val CONFIG_MVOUT_RS1_LEFT_ZERO_PADDING_WIDTH = 2 + val CONFIG_MVOUT_RS1_SPACER_WIDTH = (24 - 2 * 6) + val CONFIG_MVOUT_RS1_POOL_OUT_DIM_WIDTH = 8 + val CONFIG_MVOUT_RS1_POOL_OUT_ROWS_WIDTH = 8 + val CONFIG_MVOUT_RS1_POOL_OUT_COLS_WIDTH = 8 + val CONFIG_MVOUT_RS1_OUT_ROWS_WIDTH = 8 + val CONFIG_MVOUT_RS1_OUT_COLS_WIDTH = 8 + + class ConfigMvoutRs1 extends Bundle { + val ocols = UInt(CONFIG_MVOUT_RS1_OUT_COLS_WIDTH.W) + val orows = UInt(CONFIG_MVOUT_RS1_OUT_ROWS_WIDTH.W) + val pocols = UInt(CONFIG_MVOUT_RS1_POOL_OUT_COLS_WIDTH.W) + val porows = UInt(CONFIG_MVOUT_RS1_POOL_OUT_ROWS_WIDTH.W) + val pool_out_dim = UInt(CONFIG_MVOUT_RS1_POOL_OUT_DIM_WIDTH.W) + val _spacer = UInt(CONFIG_MVOUT_RS1_SPACER_WIDTH.W) + val lpad = UInt(CONFIG_MVOUT_RS1_LEFT_ZERO_PADDING_WIDTH.W) + val upad = UInt(CONFIG_MVOUT_RS1_UPPER_ZERO_PADDING_WIDTH.W) + val pool_size = UInt(CONFIG_MVOUT_RS1_MAX_POOLING_WINDOW_SIZE_WIDTH.W) + val pool_stride = UInt(CONFIG_MVOUT_RS1_MAX_POOLING_STRIDE_WIDTH.W) + val activation = UInt(CONFIG_MVOUT_RS1_ACTIVATION_WIDTH.W) + val _unused = UInt(CONFIG_MVOUT_RS1_UNUSED_WIDTH.W) + + override def cloneType: ConfigMvoutRs1.this.type = (new ConfigMvoutRs1).asInstanceOf[this.type] + } + + val CONFIG_MVOUT_RS2_ACC_SCALE_WIDTH = 32 + val CONFIG_MVOUT_RS2_STRIDE_WIDTH = 32 + + class ConfigMvoutRs2(acc_scale_bits: Int, stride_bits: Int) extends Bundle { + val _spacer1 = UInt((CONFIG_MVOUT_RS2_ACC_SCALE_WIDTH - acc_scale_bits).W) + val acc_scale = UInt(acc_scale_bits.W) + val _spacer0 = UInt((CONFIG_MVOUT_RS2_STRIDE_WIDTH - stride_bits).W) + val stride = UInt(stride_bits.W) + + override def cloneType: ConfigMvoutRs2.this.type = + (new ConfigMvoutRs2(acc_scale_bits, stride_bits)).asInstanceOf[this.type] + } + + val CONFIG_EX_RS1_CMD_TYPE_WIDTH = 2 + val CONFIG_EX_RS1_DATAFLOW_WIDTH = 1 + val CONFIG_EX_RS1_ACTIVATION_WIDTH = 2 + val CONFIG_EX_RS1_SPACER0_WIDTH = (7 - 2 - 1 - 2) + val CONFIG_EX_RS1_SET_ONLY_STRIDES_WIDTH = 1 + val CONFIG_EX_RS1_A_TRANSPOSE_WIDTH = 1 + val CONFIG_EX_RS1_B_TRANSPOSE_WIDTH = 1 + val CONFIG_EX_RS1_SPACER1_WIDTH = (16 - 10) + val CONFIG_EX_RS1_A_STRIDE_WIDTH = 16 + val CONFIG_EX_RS1_ACC_SCALE_WIDTH = 32 + + class ConfigExRs1(acc_scale_bits: Int) extends Bundle { + val _spacer2 = UInt((CONFIG_EX_RS1_ACC_SCALE_WIDTH - acc_scale_bits).W) + val acc_scale = UInt(acc_scale_bits.W) + val a_stride = UInt(CONFIG_EX_RS1_A_STRIDE_WIDTH.W) + val _spacer1 = UInt(CONFIG_EX_RS1_SPACER1_WIDTH.W) + val b_transpose = UInt(CONFIG_EX_RS1_B_TRANSPOSE_WIDTH.W) + val a_transpose = UInt(CONFIG_EX_RS1_A_TRANSPOSE_WIDTH.W) + val set_only_strides = UInt(CONFIG_EX_RS1_SET_ONLY_STRIDES_WIDTH.W) + val _spacer0 = UInt(CONFIG_EX_RS1_SPACER0_WIDTH.W) + val activation = UInt(CONFIG_EX_RS1_ACTIVATION_WIDTH.W) + val dataflow = UInt(CONFIG_EX_RS1_DATAFLOW_WIDTH.W) + val cmd_type = UInt(CONFIG_EX_RS1_CMD_TYPE_WIDTH.W) + + override def cloneType: ConfigExRs1.this.type = + (new ConfigExRs1(acc_scale_bits)).asInstanceOf[this.type] + } + + val CONFIG_EX_RS2_IN_SHIFT_WIDTH = 32 + val CONFIG_EX_RS2_RELU6_SHIFT_WIDTH = 16 + val CONFIG_EX_RS2_C_STRIDE_WIDTH = 16 + + class ConfigExRs2 extends Bundle { + val c_stride = UInt(CONFIG_EX_RS2_C_STRIDE_WIDTH.W) + val relu6_shift = UInt(CONFIG_EX_RS2_RELU6_SHIFT_WIDTH.W) + val in_shift = UInt(CONFIG_EX_RS2_IN_SHIFT_WIDTH.W) + + override def cloneType: ConfigExRs2.this.type = (new ConfigExRs2).asInstanceOf[this.type] + } + + val PRELOAD_RS_ADDR_WIDTH = 32 + val PRELOAD_RS_COLS_WIDTH = 16 + val PRELOAD_RS_ROWS_WIDTH = 16 + + class PreloadRs(preload_rows_bits: Int, preload_cols_bits: Int, local_addr_t: LocalAddr) extends Bundle { + val _spacer2 = UInt((PRELOAD_RS_ROWS_WIDTH - preload_rows_bits).W) + val num_rows = UInt(preload_rows_bits.W) + val _spacer1 = UInt((PRELOAD_RS_COLS_WIDTH - preload_cols_bits).W) + val num_cols = UInt(preload_cols_bits.W) + val _spacer0 = UInt((PRELOAD_RS_ADDR_WIDTH - local_addr_t.getWidth).W) + val local_addr = local_addr_t.cloneType + + override def cloneType: PreloadRs.this.type = + (new PreloadRs(preload_rows_bits, preload_cols_bits, local_addr_t)).asInstanceOf[this.type] + } + + val COMPUTED_RS_ADDR_WIDTH = 32 + val COMPUTED_RS_COLS_WIDTH = 16 + val COMPUTED_RS_ROWS_WIDTH = 16 + + class ComputeRs(compute_rows_bits: Int, compute_cols_bits: Int, local_addr_t: LocalAddr) extends Bundle { + val _spacer2 = UInt((COMPUTED_RS_ROWS_WIDTH - compute_rows_bits).W) + val num_rows = UInt(compute_rows_bits.W) + val _spacer1 = UInt((COMPUTED_RS_COLS_WIDTH - compute_cols_bits).W) + val num_cols = UInt(compute_cols_bits.W) + val _spacer0 = UInt((COMPUTED_RS_ADDR_WIDTH - local_addr_t.getWidth).W) + val local_addr = local_addr_t.cloneType + + override def cloneType: ComputeRs.this.type = + (new ComputeRs(compute_rows_bits, compute_cols_bits, local_addr_t)).asInstanceOf[this.type] + } } + diff --git a/src/main/scala/gemmini/LoadController.scala b/src/main/scala/gemmini/LoadController.scala index ccf26fc0..1c8b0ced 100644 --- a/src/main/scala/gemmini/LoadController.scala +++ b/src/main/scala/gemmini/LoadController.scala @@ -1,3 +1,4 @@ + package gemmini import chisel3._ @@ -8,7 +9,8 @@ import freechips.rocketchip.config.Parameters // TODO we need to check for WAW errors here // TODO deal with errors when reading scratchpad responses -class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], coreMaxAddrBits: Int, local_addr_t: LocalAddr) +class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], coreMaxAddrBits: Int, + local_addr_t: LocalAddr) (implicit p: Parameters) extends Module { import config._ @@ -36,20 +38,25 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig val row_counter = RegInit(0.U(log2Ceil(block_rows).W)) val cmd = Queue(io.cmd, ld_queue_length) + val vaddr = cmd.bits.cmd.rs1 - val localaddr = cmd.bits.cmd.rs2.asTypeOf(local_addr_t) - val cols = cmd.bits.cmd.rs2(32 + mvin_cols_bits - 1, 32) // TODO magic numbers - val rows = cmd.bits.cmd.rs2(48 + mvin_rows_bits - 1, 48) // TODO magic numbers + val mvin_rs2 = cmd.bits.cmd.rs2.asTypeOf(new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t)) + val localaddr = mvin_rs2.local_addr + val cols = mvin_rs2.num_cols + val rows = mvin_rs2.num_rows + val config_stride = cmd.bits.cmd.rs2 - val config_scale = cmd.bits.cmd.rs1(32 + mvin_scale_t_bits - 1, 32) // TODO magic numbers - val config_shrink = cmd.bits.cmd.rs1(2) // TODO magic numbers - val config_block_stride = cmd.bits.cmd.rs1(31, 16) // TODO magic numbers + val config_mvin_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits)) + + val config_scale = config_mvin_rs1.scale // maybe limit width to `mvin_scale_t_bits`? + val config_shrink = config_mvin_rs1.shrink + val config_block_stride = config_mvin_rs1.stride val mstatus = cmd.bits.cmd.status val load_state_id = MuxCase(0.U, Seq((cmd.bits.cmd.inst.funct === LOAD2_CMD) -> 1.U, (cmd.bits.cmd.inst.funct === LOAD3_CMD) -> 2.U)) - val config_state_id = cmd.bits.cmd.rs1(4,3) // TODO magic numbers + val config_state_id = config_mvin_rs1.state_id val state_id = Mux(cmd.bits.cmd.inst.funct === CONFIG_CMD, config_state_id, load_state_id) val stride = strides(state_id) diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala index f39faee1..749f00fe 100644 --- a/src/main/scala/gemmini/LoopConv.scala +++ b/src/main/scala/gemmini/LoopConv.scala @@ -74,7 +74,8 @@ class LoopConvLdBiasReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth: I } class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_acc_addr: Int, acc_w: Int, - max_block_len_acc: Int, concurrent_loops: Int, latency: Int)(implicit p: Parameters) extends Module { + max_block_len_acc: Int, concurrent_loops: Int, latency: Int, + config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2)(implicit p: Parameters) extends Module { val MVIN_SCALE_IDENTITY = 0x3f800000.U // TODO get this from configs somehow val io = IO(new Bundle { val req = Flipped(Decoupled(new LoopConvLdBiasReq(coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth: Int, max_acc_addr, concurrent_loops))) @@ -131,14 +132,23 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi val config_cmd = Wire(new RoCCCommand) config_cmd := DontCare config_cmd.inst.funct := CONFIG_CMD - config_cmd.rs1 := (MVIN_SCALE_IDENTITY << 32.U) | (req.derived_params.bias_spad_stride << 16.U) | (2.U << 3) | 1.U + + val config_cmd_rs1 = Wire(config_mvin_rs1_t.cloneType) + config_cmd_rs1 := DontCare + config_cmd_rs1.scale := MVIN_SCALE_IDENTITY + config_cmd_rs1.stride := req.derived_params.bias_spad_stride + config_cmd_rs1.state_id := 2.U + config_cmd_rs1.shrink := 0.U + config_cmd_rs1._unused := 1.U + config_cmd.rs1 := config_cmd_rs1.asUInt + config_cmd.rs2 := 0.U val mvin_cmd = Wire(new RoCCCommand) mvin_cmd := DontCare mvin_cmd.inst.funct := LOAD3_CMD - mvin_cmd.rs1 := 0.U //dram_addr - mvin_cmd.rs2 := 0.U //(I << 48.U) | (J << 32.U) | spad_addr + mvin_cmd.rs1 := 0.U + mvin_cmd.rs2 := 0.U // Inputs and outputs io.req.ready := state === idle && !command_p.io.busy @@ -158,7 +168,12 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi when (command_p.io.out.bits.cmd.inst.funct === LOAD3_CMD) { val o = command_p.io.out.bits io.cmd.bits.rs1 := o.dram_addr - io.cmd.bits.rs2 := (o.I << 48.U) | (o.J << 32.U) | o.spad_addr + val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType) + mvin_cmd_rs2 := DontCare + mvin_cmd_rs2.num_rows := o.I.asUInt() + mvin_cmd_rs2.num_cols := o.J.asUInt() + mvin_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvin_cmd_rs2.local_addr) + io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt() } // Sending outputs @@ -207,7 +222,8 @@ class LoopConvLdInputReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth: } class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_addr: Int, input_w: Int, - max_block_len: Int, concurrent_loops: Int, latency: Int)(implicit p: Parameters) extends Module { + max_block_len: Int, concurrent_loops: Int, latency: Int, + config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2)(implicit p: Parameters) extends Module { val MVIN_SCALE_IDENTITY = 0x3f800000.U // TODO get this from configs somehow val io = IO(new Bundle { @@ -287,14 +303,22 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw val config_cmd = Wire(new RoCCCommand) config_cmd := DontCare config_cmd.inst.funct := CONFIG_CMD - config_cmd.rs1 := (MVIN_SCALE_IDENTITY << 32.U) | (input_spad_stride << 16.U) | (0.U << 3) | 1.U + + val config_cmd_rs1 = Wire(config_mvin_rs1_t.cloneType) + config_cmd_rs1 := DontCare + config_cmd_rs1.scale := MVIN_SCALE_IDENTITY + config_cmd_rs1.stride := input_spad_stride + config_cmd_rs1.state_id := 0.U + config_cmd_rs1.shrink := 0.U + config_cmd_rs1._unused := 1.U + config_cmd.rs1 := config_cmd_rs1.asUInt() config_cmd.rs2 := dram_stride << req.downsample val mvin_cmd = Wire(new RoCCCommand) mvin_cmd := DontCare mvin_cmd.inst.funct := LOAD_CMD - mvin_cmd.rs1 := 0.U //dram_addr - mvin_cmd.rs2 := 0.U // ((I >> req.downsample) << 48.U).asUInt() | (K << 32.U).asUInt() | spad_addr.asUInt() + mvin_cmd.rs1 := 0.U // dram_addr + mvin_cmd.rs2 := 0.U // mvin_cmd_rs2 // Inputs and outputs io.req.ready := state === idle && !command_p.io.busy @@ -314,7 +338,12 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw when (command_p.io.out.bits.cmd.inst.funct === LOAD_CMD) { val o = command_p.io.out.bits io.cmd.bits.rs1 := o.dram_addr - io.cmd.bits.rs2 := ((o.I >> req.downsample) << 48).asUInt | (o.K << 32).asUInt | o.spad_addr.asUInt + val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType) + mvin_cmd_rs2 := DontCare + mvin_cmd_rs2.num_rows := (o.I >> req.downsample).asUInt() + mvin_cmd_rs2.num_cols := o.K.asUInt() + mvin_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvin_cmd_rs2.local_addr) + io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt() } // Sending outputs @@ -366,7 +395,8 @@ class LoopConvLdWeightReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth: } class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_addr: Int, input_w: Int, - max_block_len: Int, concurrent_loops: Int, latency: Int)(implicit p: Parameters) extends Module { + max_block_len: Int, concurrent_loops: Int, latency: Int, + config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2)(implicit p: Parameters) extends Module { val MVIN_SCALE_IDENTITY = 0x3f800000.U // TODO get this from configs somehow val io = IO(new Bundle { @@ -444,14 +474,21 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit val config_cmd = Wire(new RoCCCommand) config_cmd := DontCare config_cmd.inst.funct := CONFIG_CMD - config_cmd.rs1 := (MVIN_SCALE_IDENTITY << 32.U) | (req.derived_params.weight_spad_stride << 16.U) | (1.U << 3) | 1.U + val config_cmd_rs1 = Wire(config_mvin_rs1_t.cloneType) + config_cmd_rs1 := DontCare + config_cmd_rs1.scale := MVIN_SCALE_IDENTITY + config_cmd_rs1.stride := req.derived_params.weight_spad_stride + config_cmd_rs1.state_id := 1.U + config_cmd_rs1.shrink := 0.U + config_cmd_rs1._unused := 1.U + config_cmd.rs1 := config_cmd_rs1.asUInt config_cmd.rs2 := dram_stride val mvin_cmd = Wire(new RoCCCommand) mvin_cmd := DontCare mvin_cmd.inst.funct := LOAD2_CMD - mvin_cmd.rs1 := 0.U//dram_addr - mvin_cmd.rs2 := 0.U//(K << 48.U) | (J << 32.U) | spad_addr + mvin_cmd.rs1 := 0.U // dram_addr + mvin_cmd.rs2 := 0.U // mvin_cmd_rs2 // Inputs and outputs io.req.ready := state === idle && !command_p.io.busy @@ -471,7 +508,12 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit when (command_p.io.out.bits.cmd.inst.funct === LOAD2_CMD) { val o = command_p.io.out.bits io.cmd.bits.rs1 := o.dram_addr - io.cmd.bits.rs2 := (o.K << 48) | (o.J << 32) | o.spad_addr + val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType) + mvin_cmd_rs2 := DontCare + mvin_cmd_rs2.num_rows := o.K + mvin_cmd_rs2.num_cols := o.J + mvin_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvin_cmd_rs2.local_addr) + io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt() } // Sending outputs @@ -524,7 +566,9 @@ class LoopConvExecuteReq(val large_iterator_bitwidth: Int, val small_iterator_bi } class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_addr: Int, - max_acc_addr: Int, concurrent_loops: Int, latency: Int)(implicit p: Parameters) extends Module { + max_acc_addr: Int, concurrent_loops: Int, latency: Int, + config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs, + compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs)(implicit p: Parameters) extends Module { val GARBAGE_ADDR = (~0.U(32.W)).asUInt() val io = IO(new Bundle { @@ -623,16 +667,27 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera val config_cmd = Wire(new RoCCCommand) config_cmd := DontCare config_cmd.inst.funct := CONFIG_CMD - config_cmd.rs1 := ((irows * icols) << 16) | (1.U << 7) - config_cmd.rs2 := (orows * ocols) << 48 - val pre_cmd = Wire(new RoCCCommand) + val config_cmd_rs1 = Wire(config_ex_rs1_t.cloneType) + config_cmd_rs1 := DontCare + config_cmd_rs1.a_stride := (irows * icols).asUInt() + config_cmd_rs1.set_only_strides := 1.U + config_cmd_rs1.cmd_type := 0.U + + val config_cmd_rs2 = Wire(new ConfigExRs2) + config_cmd_rs2 := DontCare + config_cmd_rs2.c_stride := (orows * ocols).asUInt() + + config_cmd.rs1 := config_cmd_rs1.asUInt() + config_cmd.rs2 := config_cmd_rs2.asUInt() + + val pre_cmd = Wire(new RoCCCommand) // preload pre_cmd := DontCare pre_cmd.inst.funct := PRELOAD_CMD pre_cmd.rs1 := 0.U//(K << 48) | (J << 32) | pre_addr pre_cmd.rs2 := 0.U//(I << 48) | (J << 32) | c_addr - val comp_cmd = Wire(new RoCCCommand()) + val comp_cmd = Wire(new RoCCCommand()) // compute.preloaded comp_cmd := DontCare comp_cmd.inst.funct := Mux(new_weights, COMPUTE_AND_FLIP_CMD, COMPUTE_AND_STAY_CMD) comp_cmd.rs1 := 0.U//(I << 48) | (K << 32) | a_addr @@ -659,12 +714,36 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera io.cmd.bits := command_p.io.out.bits.cmd when (command_p.io.out.bits.cmd.inst.funct === PRELOAD_CMD) { val o = command_p.io.out.bits - io.cmd.bits.rs1 := (o.K << 48) | (o.J << 32) | o.pre_addr - io.cmd.bits.rs2 := (o.I << 48) | (o.J << 32) | o.c_addr + val pre_cmd_rs1 = Wire(preload_rs1_t.cloneType) + pre_cmd_rs1 := DontCare + pre_cmd_rs1.num_rows := o.K.asUInt() + pre_cmd_rs1.num_cols := o.J.asUInt() + pre_cmd_rs1.local_addr := o.pre_addr.asTypeOf(pre_cmd_rs1.local_addr) + + val pre_cmd_rs2 = Wire(preload_rs2_t.cloneType) + pre_cmd_rs2 := DontCare + pre_cmd_rs2.num_rows := o.I.asUInt() + pre_cmd_rs2.num_cols := o.J.asUInt() + pre_cmd_rs2.local_addr := o.c_addr.asTypeOf(pre_cmd_rs2.local_addr) + + io.cmd.bits.rs1 := pre_cmd_rs1.asUInt() + io.cmd.bits.rs2 := pre_cmd_rs2.asUInt() }.elsewhen(command_p.io.out.bits.cmd.inst.funct =/= CONFIG_CMD) { val o = command_p.io.out.bits - io.cmd.bits.rs1 := (o.I << 48) | (o.K << 32) | o.a_addr - io.cmd.bits.rs2 := (o.I << 48) | (o.J << 32) | GARBAGE_ADDR + val comp_cmd_rs1 = Wire(compute_rs1_t.cloneType) + comp_cmd_rs1 := DontCare + comp_cmd_rs1.num_rows := o.I.asUInt() + comp_cmd_rs1.num_cols := o.K.asUInt() + comp_cmd_rs1.local_addr := o.a_addr.asTypeOf(comp_cmd_rs1.local_addr) + + val comp_cmd_rs2 = Wire(compute_rs2_t.cloneType) + comp_cmd_rs2 := DontCare + comp_cmd_rs2.num_rows := o.I.asUInt() + comp_cmd_rs2.num_cols := o.J.asUInt() + comp_cmd_rs2.local_addr := GARBAGE_ADDR.asTypeOf(comp_cmd_rs2.local_addr) + + io.cmd.bits.rs1 := comp_cmd_rs1.asUInt() + io.cmd.bits.rs2 := comp_cmd_rs2.asUInt() } // Updating "new_weights" @@ -741,7 +820,7 @@ class LoopConvStReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth: Int, val loop_id = UInt(log2Up(concurrent_loops).W) } -class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_acc_addr: Int, input_w: Int, concurrent_loops: Int, latency: Int)(implicit p: Parameters) extends Module { +class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_acc_addr: Int, input_w: Int, concurrent_loops: Int, latency: Int, config_mvout_rs2_t: ConfigMvoutRs2, mvout_rs2_t: MvoutRs2)(implicit p: Parameters) extends Module { val ACC_SCALE_NO_CHANGE = ~(0.U(32.W)) // TODO get this from ISA description somehow val io = IO(new Bundle { @@ -809,23 +888,48 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: val mvout_cmd = Wire(new RoCCCommand) mvout_cmd := DontCare mvout_cmd.inst.funct := STORE_CMD - mvout_cmd.rs1 := 0.U//dram_addr - mvout_cmd.rs2 := 0.U//(I << 48.U) | (J << 32.U) | spad_addr + mvout_cmd.rs1 := 0.U // dram_addr + mvout_cmd.rs2 := 0.U // mvout_cmd_rs2 val pre_pool_config_cmd = Wire(new RoCCCommand) pre_pool_config_cmd := DontCare pre_pool_config_cmd.inst.funct := CONFIG_CMD - pre_pool_config_cmd.rs1 := (ocols << 56) | (orows << 48) | (pocols << 40) | (porows << 32) | (pool_out_dim << 24) | - (plpad << 10) | (pupad << 8) | (pool_size << 6) | (pool_stride << 4) | - (req.activation << 2) | // TODO magic numbers - CONFIG_STORE - pre_pool_config_cmd.rs2 := (ACC_SCALE_NO_CHANGE << 32) | (out_channels * (input_w / 8).U) + val pre_pool_config_cmd_rs1 = Wire(new ConfigMvoutRs1) + pre_pool_config_cmd_rs1 := DontCare + pre_pool_config_cmd_rs1.ocols := ocols + pre_pool_config_cmd_rs1.orows := orows + pre_pool_config_cmd_rs1.pocols := pocols + pre_pool_config_cmd_rs1.porows := porows + pre_pool_config_cmd_rs1.pool_out_dim := pool_out_dim + pre_pool_config_cmd_rs1.lpad := plpad + pre_pool_config_cmd_rs1.upad := pupad + pre_pool_config_cmd_rs1.pool_size := pool_size + pre_pool_config_cmd_rs1.pool_stride := pool_stride + pre_pool_config_cmd_rs1.activation := req.activation + pre_pool_config_cmd_rs1._unused := CONFIG_STORE + pre_pool_config_cmd.rs1 := pre_pool_config_cmd_rs1.asUInt() + + val pre_pool_config_cmd_rs2 = Wire(config_mvout_rs2_t.cloneType) + pre_pool_config_cmd_rs2 := DontCare + pre_pool_config_cmd_rs2.acc_scale := ACC_SCALE_NO_CHANGE + pre_pool_config_cmd_rs2.stride := out_channels * (input_w / 8).U + pre_pool_config_cmd.rs2 := pre_pool_config_cmd_rs2.asUInt() val post_pool_config_cmd = Wire(new RoCCCommand) post_pool_config_cmd := DontCare post_pool_config_cmd.inst.funct := CONFIG_CMD - post_pool_config_cmd.rs1 := (req.activation << 2) | CONFIG_STORE // TODO magic numbers - post_pool_config_cmd.rs2 := (ACC_SCALE_NO_CHANGE << 32) | (out_channels * (input_w / 8).U) + + val post_pool_config_cmd_rs1 = Wire(new ConfigMvoutRs1) + post_pool_config_cmd_rs1 := DontCare + post_pool_config_cmd_rs1.activation := req.activation + post_pool_config_cmd_rs1._unused := CONFIG_STORE + post_pool_config_cmd.rs1 := post_pool_config_cmd_rs1.asUInt() + + val post_pool_config_cmd_rs2 = Wire(config_mvout_rs2_t.cloneType) + post_pool_config_cmd_rs2 := DontCare + post_pool_config_cmd_rs2.acc_scale := ACC_SCALE_NO_CHANGE + post_pool_config_cmd_rs2.stride := out_channels * (input_w / 8).U + post_pool_config_cmd.rs2 := post_pool_config_cmd_rs2.asUInt() val pool_cmd = Wire(new RoCCCommand) pool_cmd := DontCare @@ -859,11 +963,22 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: when (command_p.io.out.bits.cmd.inst.funct === STORE_CMD) { val o = command_p.io.out.bits when (o.is_pool) { + val pool_mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType) + pool_mvout_cmd_rs2 := DontCare + pool_mvout_cmd_rs2.num_cols := o.channels + pool_mvout_cmd_rs2.local_addr := o.pool_spad_addr.asTypeOf(pool_mvout_cmd_rs2.local_addr) + io.cmd.bits.rs1 := o.pool_dram_addr - io.cmd.bits.rs2 := (o.channels << 32.U) | o.pool_spad_addr + io.cmd.bits.rs2 := pool_mvout_cmd_rs2.asUInt() } .otherwise { + val mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType) + mvout_cmd_rs2 := DontCare + mvout_cmd_rs2.num_rows := o.I.asUInt() + mvout_cmd_rs2.num_cols := o.J.asUInt() + mvout_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvout_cmd_rs2.local_addr) + io.cmd.bits.rs1 := o.dram_addr - io.cmd.bits.rs2 := (o.I << 48) | (o.J << 32) | o.spad_addr + io.cmd.bits.rs2 := mvout_cmd_rs2.asUInt() } } @@ -1016,7 +1131,10 @@ class LoopConvState(val block_size: Int, val large_iterator_bitwidth: Int, val s } class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: Int, max_exs: Int, max_sts: Int, - max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int) + max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int, + config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2, mvout_rs2_t: MvoutRs2, + config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs, + compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs) (implicit p: Parameters) extends Module { val large_iterator_bitwidth = 16 val small_iterator_bitwidth = 16 // 8 @@ -1049,11 +1167,11 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I // Create inner modules val latency = 2 - val ld_bias = Module(new LoopConvLdBias(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_acc_addr, acc_w, max_block_len_acc, concurrent_loops, latency)) - val ld_input = Module(new LoopConvLdInput(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops, latency)) - val ld_weights = Module(new LoopConvLdWeight(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops, latency)) - val ex = Module(new LoopConvExecute(block_size, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_addr, max_acc_addr, concurrent_loops, latency)) - val st = Module(new LoopConvSt(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_acc_addr, input_w, concurrent_loops, latency)) + val ld_bias = Module(new LoopConvLdBias(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_acc_addr, acc_w, max_block_len_acc, concurrent_loops, latency, config_mvin_rs1_t, mvin_rs2_t)) + val ld_input = Module(new LoopConvLdInput(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops, latency, config_mvin_rs1_t, mvin_rs2_t)) + val ld_weights = Module(new LoopConvLdWeight(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops, latency, config_mvin_rs1_t, mvin_rs2_t)) + val ex = Module(new LoopConvExecute(block_size, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_addr, max_acc_addr, concurrent_loops, latency, config_ex_rs1_t, preload_rs1_t, preload_rs2_t, compute_rs1_t, compute_rs2_t)) + val st = Module(new LoopConvSt(block_size, coreMaxAddrBits, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, max_acc_addr, input_w, concurrent_loops, latency, config_mvout_rs2_t, mvout_rs2_t)) // Create command queue val cmd = Queue(io.in) @@ -1339,10 +1457,15 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I object LoopConv { def apply(in: DecoupledIO[RoCCCommand], ld_utilization: UInt, st_utilization: UInt, ex_utilization: UInt, block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: Int, max_exs: Int, max_sts: Int, - max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int) + max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int, + config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2, + mvout_rs2_t: MvoutRs2, config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs, + compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs) (implicit p: Parameters): Tuple2[DecoupledIO[RoCCCommand], Bool] = { val mod = Module(new LoopConv(block_size, coreMaxAddrBits, rob_size, max_lds, max_exs, max_sts, - max_addr, max_acc_addr, input_w, acc_w, dma_max_bytes)) + max_addr, max_acc_addr, input_w, acc_w, dma_max_bytes, + config_mvin_rs1_t, mvin_rs2_t, config_mvout_rs2_t, mvout_rs2_t, config_ex_rs1_t, preload_rs1_t, preload_rs2_t, + compute_rs1_t, compute_rs2_t)) mod.io.in <> in mod.io.ld_utilization := ld_utilization mod.io.st_utilization := st_utilization diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala index 2502d157..ea1c3ed6 100644 --- a/src/main/scala/gemmini/LoopMatmul.scala +++ b/src/main/scala/gemmini/LoopMatmul.scala @@ -23,7 +23,7 @@ class LoopMatmulLdAReq(val block_size: Int, val coreMaxAddrBits: Int, val iterat } class LoopMatmulLdA(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_addr: Int, input_w: Int, - max_block_len: Int, concurrent_loops: Int) + max_block_len: Int, concurrent_loops: Int, mvin_rs2_t: MvinRs2) (implicit p: Parameters) extends Module { val io = IO(new Bundle { val req = Flipped(Decoupled(new LoopMatmulLdAReq(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, concurrent_loops))) @@ -70,7 +70,13 @@ class LoopMatmulLdA(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In mvin_cmd := DontCare mvin_cmd.inst.funct := LOAD_CMD mvin_cmd.rs1 := dram_addr - mvin_cmd.rs2 := (rows << 48).asUInt() | (cols << 32).asUInt() | sp_addr + + val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType) + mvin_cmd_rs2 := DontCare + mvin_cmd_rs2.num_rows := rows.asUInt() + mvin_cmd_rs2.num_cols := cols.asUInt() + mvin_cmd_rs2.local_addr := sp_addr.asTypeOf(mvin_cmd_rs2.local_addr) + mvin_cmd.rs2 := mvin_cmd_rs2.asUInt() io.req.ready := state === idle io.i := i @@ -121,7 +127,7 @@ class LoopMatmulLdBReq(val block_size: Int, val coreMaxAddrBits: Int, val iterat } class LoopMatmulLdB(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_addr: Int, input_w: Int, - max_block_len: Int, concurrent_loops: Int) + max_block_len: Int, concurrent_loops: Int, mvin_rs2_t: MvinRs2) (implicit p: Parameters) extends Module { val io = IO(new Bundle { val req = Flipped(Decoupled(new LoopMatmulLdBReq(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, concurrent_loops))) @@ -171,7 +177,13 @@ class LoopMatmulLdB(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In mvin_cmd := DontCare mvin_cmd.inst.funct := LOAD2_CMD mvin_cmd.rs1 := dram_addr - mvin_cmd.rs2 := (rows << 48).asUInt() | (cols << 32).asUInt() | sp_addr + + val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType) + mvin_cmd_rs2 := DontCare + mvin_cmd_rs2.num_rows := rows.asUInt() + mvin_cmd_rs2.num_cols := cols.asUInt() + mvin_cmd_rs2.local_addr := sp_addr.asTypeOf(mvin_cmd_rs2.local_addr) + mvin_cmd.rs2 := mvin_cmd_rs2.asUInt() io.req.ready := state === idle io.k := k @@ -222,7 +234,7 @@ class LoopMatmulLdDReq(val block_size: Int, val coreMaxAddrBits: Int, val iterat } class LoopMatmulLdD(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_acc_addr: Int, input_w: Int, - acc_w: Int, max_block_len: Int, max_block_len_acc: Int, concurrent_loops: Int) + acc_w: Int, max_block_len: Int, max_block_len_acc: Int, concurrent_loops: Int, mvin_rs2_t: MvinRs2) (implicit p: Parameters) extends Module { val io = IO(new Bundle { val req = Flipped(Decoupled(new LoopMatmulLdDReq(block_size, coreMaxAddrBits, iterator_bitwidth, max_acc_addr, concurrent_loops))) @@ -261,7 +273,13 @@ class LoopMatmulLdD(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In mvin_cmd := DontCare mvin_cmd.inst.funct := LOAD3_CMD mvin_cmd.rs1 := dram_addr - mvin_cmd.rs2 := (rows << 48).asUInt() | (cols << 32).asUInt() | sp_addr + + val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType) + mvin_cmd_rs2 := DontCare + mvin_cmd_rs2.num_rows := rows.asUInt() + mvin_cmd_rs2.num_cols := cols.asUInt() + mvin_cmd_rs2.local_addr := sp_addr.asTypeOf(mvin_cmd_rs2.local_addr) + mvin_cmd.rs2 := mvin_cmd_rs2.asUInt() io.req.ready := state === idle io.idle := state === idle @@ -312,7 +330,9 @@ class LoopMatmulExecuteReq(val block_size: Int, val coreMaxAddrBits: Int, val it val loop_id = UInt(log2Up(concurrent_loops).W) } -class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_addr: Int, max_acc_addr: Int, concurrent_loops: Int) +class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_addr: Int, max_acc_addr: Int, concurrent_loops: Int, + preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs, + compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs) (implicit p: Parameters) extends Module { val GARBAGE_ADDR = (~0.U(32.W)).asUInt() @@ -380,14 +400,40 @@ class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth val pre_cmd = Wire(new RoCCCommand) pre_cmd := DontCare pre_cmd.inst.funct := PRELOAD_CMD - pre_cmd.rs1 := pre_addr | (b_cols << 32).asUInt() | (b_rows << 48).asUInt() - pre_cmd.rs2 := out_addr | (c_cols << 32).asUInt() | (c_rows << 48).asUInt() + + val pre_cmd_rs1 = Wire(preload_rs1_t.cloneType) + pre_cmd_rs1 := DontCare + pre_cmd_rs1.num_rows := b_rows.asUInt() + pre_cmd_rs1.num_cols := b_cols.asUInt() + pre_cmd_rs1.local_addr := pre_addr.asTypeOf(pre_cmd_rs1.local_addr) + + val pre_cmd_rs2 = Wire(preload_rs2_t.cloneType) + pre_cmd_rs2 := DontCare + pre_cmd_rs2.num_rows := c_rows.asUInt() + pre_cmd_rs2.num_cols := c_cols.asUInt() + pre_cmd_rs2.local_addr := out_addr.asTypeOf(pre_cmd_rs2.local_addr) + + pre_cmd.rs1 := pre_cmd_rs1.asUInt() + pre_cmd.rs2 := pre_cmd_rs2.asUInt() val comp_cmd = Wire(new RoCCCommand()) comp_cmd := DontCare comp_cmd.inst.funct := Mux(i === 0.U, COMPUTE_AND_FLIP_CMD, COMPUTE_AND_STAY_CMD) - comp_cmd.rs1 := a_addr | (a_cols << 32).asUInt() | (a_rows << 48).asUInt() - comp_cmd.rs2 := GARBAGE_ADDR | (block_size.U << 32).asUInt() | (block_size.U << 48).asUInt() + + val comp_cmd_rs1 = Wire(compute_rs1_t.cloneType) + comp_cmd_rs1 := DontCare + comp_cmd_rs1.num_rows := a_rows.asUInt() + comp_cmd_rs1.num_cols := a_cols.asUInt() + comp_cmd_rs1.local_addr := a_addr.asTypeOf(comp_cmd_rs1.local_addr) + + val comp_cmd_rs2 = Wire(compute_rs2_t.cloneType) + comp_cmd_rs2 := DontCare + comp_cmd_rs2.num_rows := block_size.U + comp_cmd_rs2.num_cols := block_size.U + comp_cmd_rs2.local_addr := GARBAGE_ADDR.asTypeOf(comp_cmd_rs2.local_addr) + + comp_cmd.rs1 := comp_cmd_rs1.asUInt() + comp_cmd.rs2 := comp_cmd_rs2.asUInt() io.req.ready := state === idle io.k := k @@ -448,7 +494,7 @@ class LoopMatmulStCReq(val block_size: Int, val coreMaxAddrBits: Int, val iterat val loop_id = UInt(log2Up(concurrent_loops).W) } -class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, max_block_len: Int, concurrent_loops: Int) +class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, max_block_len: Int, concurrent_loops: Int, mvout_rs2_t: MvoutRs2) (implicit p: Parameters) extends Module { val io = IO(new Bundle { val req = Flipped(Decoupled(new LoopMatmulStCReq(block_size, coreMaxAddrBits, iterator_bitwidth, max_acc_addr, concurrent_loops))) @@ -494,7 +540,13 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In mvout_cmd := DontCare mvout_cmd.inst.funct := STORE_CMD mvout_cmd.rs1 := dram_addr - mvout_cmd.rs2 := (rows << 48).asUInt() | (cols << 32).asUInt() | sp_addr + + val mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType) + mvout_cmd_rs2 := DontCare + mvout_cmd_rs2.num_rows := rows.asUInt() + mvout_cmd_rs2.num_cols := cols.asUInt() + mvout_cmd_rs2.local_addr := sp_addr.asTypeOf(mvout_cmd_rs2.local_addr) + mvout_cmd.rs2 := mvout_cmd_rs2.asUInt() io.req.ready := state === idle io.j := j @@ -606,7 +658,9 @@ class LoopMatmulState(val iterator_bitwidth: Int, val coreMaxAddrBits: Int, val } class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: Int, max_exs: Int, max_sts: Int, - max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int) + max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int, + mvin_rs2_t: MvinRs2, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs, + compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, mvout_rs2_t: MvoutRs2) (implicit p: Parameters) extends Module { val iterator_bitwidth = 16 val max_block_len = (dma_max_bytes / (block_size * input_w / 8)) max 1 @@ -635,11 +689,11 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: val loop_being_configured = loops(loop_being_configured_id) // Create inner modules - val ldA = Module(new LoopMatmulLdA(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops)) - val ldB = Module(new LoopMatmulLdB(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops)) - val ldD = Module(new LoopMatmulLdD(block_size, coreMaxAddrBits, iterator_bitwidth, max_acc_addr, input_w, acc_w, max_block_len, max_block_len_acc, concurrent_loops)) - val ex = Module(new LoopMatmulExecute(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, max_acc_addr, concurrent_loops)) - val stC = Module(new LoopMatmulStC(block_size, coreMaxAddrBits, iterator_bitwidth, max_acc_addr, input_w, acc_w, max_block_len, concurrent_loops)) + val ldA = Module(new LoopMatmulLdA(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops, mvin_rs2_t)) + val ldB = Module(new LoopMatmulLdB(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops, mvin_rs2_t)) + val ldD = Module(new LoopMatmulLdD(block_size, coreMaxAddrBits, iterator_bitwidth, max_acc_addr, input_w, acc_w, max_block_len, max_block_len_acc, concurrent_loops, mvin_rs2_t)) + val ex = Module(new LoopMatmulExecute(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, max_acc_addr, concurrent_loops, preload_rs1_t, preload_rs2_t, compute_rs1_t, compute_rs2_t)) + val stC = Module(new LoopMatmulStC(block_size, coreMaxAddrBits, iterator_bitwidth, max_acc_addr, input_w, acc_w, max_block_len, concurrent_loops, mvout_rs2_t)) // Create command queue val cmd = Queue(io.in) @@ -912,10 +966,13 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: object LoopMatmul { def apply(in: DecoupledIO[RoCCCommand], ld_utilization: UInt, st_utilization: UInt, ex_utilization: UInt, block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: Int, max_exs: Int, max_sts: Int, - max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int) + max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int, + mvin_rs2_t: MvinRs2, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs, + compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, mvout_rs2_t: MvoutRs2) (implicit p: Parameters): Tuple2[DecoupledIO[RoCCCommand], Bool] = { val mod = Module(new LoopMatmul(block_size, coreMaxAddrBits, rob_size, max_lds, max_exs, max_sts, - max_addr, max_acc_addr, input_w, acc_w, dma_max_bytes)) + max_addr, max_acc_addr, input_w, acc_w, dma_max_bytes, + mvin_rs2_t, preload_rs1_t, preload_rs2_t, compute_rs1_t, compute_rs2_t, mvout_rs2_t)) mod.io.in <> in mod.io.ld_utilization := ld_utilization mod.io.st_utilization := st_utilization diff --git a/src/main/scala/gemmini/StoreController.scala b/src/main/scala/gemmini/StoreController.scala index b0e38b42..50efcfe5 100644 --- a/src/main/scala/gemmini/StoreController.scala +++ b/src/main/scala/gemmini/StoreController.scala @@ -41,7 +41,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm val block_cols = meshColumns * tileColumns val max_blocks = (dma_maxbytes / (block_cols * inputType.getWidth / 8)) max 1 - val activation = Reg(UInt(2.W)) // TODO magic number + val activation = Reg(UInt(GemminiISA.CONFIG_MVOUT_RS1_ACTIVATION_WIDTH.W)) val acc_scale = Reg(acc_scale_args.multiplicand_t) //val row_counter = RegInit(0.U(log2Ceil(block_rows).W)) @@ -49,15 +49,15 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm val block_counter = RegInit(0.U(8.W)) // TODO magic number // Pooling variables - val pool_stride = Reg(UInt(2.W)) // When this is 0, pooling is disabled // TODO magic number - val pool_size = Reg(UInt(2.W)) // TODO magic number - val pool_out_dim = Reg(UInt(8.W)) // TODO magic number - val pool_porows = Reg(UInt(8.W)) // TODO magic number - val pool_pocols = Reg(UInt(8.W)) // TODO magic number - val pool_orows = Reg(UInt(8.W)) // TODO magic number - val pool_ocols = Reg(UInt(8.W)) // TODO magic number - val pool_upad = Reg(UInt(2.W)) // TODO magic number - val pool_lpad = Reg(UInt(2.W)) // TODO magic number + val pool_stride = Reg(UInt(CONFIG_MVOUT_RS1_MAX_POOLING_STRIDE_WIDTH.W)) // When this is 0, pooling is disabled + val pool_size = Reg(UInt(CONFIG_MVOUT_RS1_MAX_POOLING_WINDOW_SIZE_WIDTH.W)) + val pool_out_dim = Reg(UInt(CONFIG_MVOUT_RS1_POOL_OUT_DIM_WIDTH.W)) + val pool_porows = Reg(UInt(CONFIG_MVOUT_RS1_POOL_OUT_ROWS_WIDTH.W)) + val pool_pocols = Reg(UInt(CONFIG_MVOUT_RS1_POOL_OUT_COLS_WIDTH.W)) + val pool_orows = Reg(UInt(CONFIG_MVOUT_RS1_OUT_ROWS_WIDTH.W)) + val pool_ocols = Reg(UInt(CONFIG_MVOUT_RS1_OUT_COLS_WIDTH.W)) + val pool_upad = Reg(UInt(CONFIG_MVOUT_RS1_UPPER_ZERO_PADDING_WIDTH.W)) + val pool_lpad = Reg(UInt(CONFIG_MVOUT_RS1_LEFT_ZERO_PADDING_WIDTH.W)) val porow_counter = RegInit(0.U(pool_porows.getWidth.W)) val pocol_counter = RegInit(0.U(pool_pocols.getWidth.W)) @@ -78,22 +78,26 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm // Commands val cmd = Queue(io.cmd, st_queue_length) val vaddr = cmd.bits.cmd.rs1 - val localaddr = cmd.bits.cmd.rs2.asTypeOf(local_addr_t) - val cols = cmd.bits.cmd.rs2(32 + mvout_cols_bits - 1, 32) // TODO magic numbers - val rows = cmd.bits.cmd.rs2(48 + mvout_rows_bits - 1, 48) // TODO magic numbers + val mvout_rs2 = cmd.bits.cmd.rs2.asTypeOf(new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t)) + val localaddr = mvout_rs2.local_addr + val cols = mvout_rs2.num_cols + val rows = mvout_rs2.num_rows val blocks = (cols / block_cols.U) + (cols % block_cols.U =/= 0.U) - val config_stride = cmd.bits.cmd.rs2(31, 0) // TODO magic numbers - val config_activation = cmd.bits.cmd.rs1(3, 2) // TODO magic numbers - val config_acc_scale = cmd.bits.cmd.rs2(63, 32) // TODO magic numbers - val config_pool_stride = cmd.bits.cmd.rs1(5, 4) // TODO magic numbers - val config_pool_size = cmd.bits.cmd.rs1(7, 6) // TODO magic numbers - val config_pool_out_dim = cmd.bits.cmd.rs1(31, 24) // TODO magic numbers - val config_porows = cmd.bits.cmd.rs1(39, 32) // TODO magic numbers - val config_pocols = cmd.bits.cmd.rs1(47, 40) // TODO magic numbers - val config_orows = cmd.bits.cmd.rs1(55, 48) // TODO magic numbers - val config_ocols = cmd.bits.cmd.rs1(63, 56) // TODO magic numbers - val config_upad = cmd.bits.cmd.rs1(9, 8) // TODO magic numbers - val config_lpad = cmd.bits.cmd.rs1(11, 10) // TODO magic numbers + + val config_mvout_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigMvoutRs1) + val config_mvout_rs2 = cmd.bits.cmd.rs2.asTypeOf(new ConfigMvoutRs2(acc_scale_t_bits, 32)) + val config_stride = config_mvout_rs2.stride + val config_activation = config_mvout_rs1.activation + val config_acc_scale = config_mvout_rs2.acc_scale + val config_pool_stride = config_mvout_rs1.pool_stride + val config_pool_size = config_mvout_rs1.pool_size + val config_pool_out_dim = config_mvout_rs1.pool_out_dim + val config_porows = config_mvout_rs1.porows + val config_pocols = config_mvout_rs1.pocols + val config_orows = config_mvout_rs1.orows + val config_ocols = config_mvout_rs1.ocols + val config_upad = config_mvout_rs1.upad + val config_lpad = config_mvout_rs1.lpad val mstatus = cmd.bits.cmd.status From 03fa3d1928b8e8f7bdba97f15beb91c1383c8f30 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Thu, 11 Nov 2021 20:18:59 -0800 Subject: [PATCH 04/11] Merge IISWC Tutorial Changes (#158) Added new convenience scripts and config files --- .gitignore | 4 + scripts/build-midas.sh | 19 ++ scripts/build-onnx-inference.sh | 7 + scripts/build-onnx-training.sh | 7 + scripts/build-spike.sh | 12 ++ scripts/build-vcs.sh | 5 + scripts/build-verilator.sh | 5 + scripts/run-midas.sh | 43 ++++ scripts/run-spike.sh | 22 ++ scripts/run-vcs-debug.sh | 22 ++ scripts/run-vcs.sh | 23 +++ scripts/run-verilator-debug.sh | 23 +++ scripts/run-verilator.sh | 23 +++ scripts/setup-paths.sh | 36 ++++ software/gemmini-rocc-tests | 2 +- src/main/scala/gemmini/AccumulatorMem.scala | 25 ++- src/main/scala/gemmini/AccumulatorScale.scala | 41 ++-- src/main/scala/gemmini/Arithmetic.scala | 57 +----- src/main/scala/gemmini/Configs.scala | 85 ++++---- src/main/scala/gemmini/ConfigsFP.scala | 30 +-- src/main/scala/gemmini/Controller.scala | 123 +++++------- src/main/scala/gemmini/CounterFile.scala | 28 ++- src/main/scala/gemmini/CustomCPUConfigs.scala | 20 ++ src/main/scala/gemmini/CustomConfigs.scala | 60 ++++++ src/main/scala/gemmini/CustomSoCConfigs.scala | 24 +++ src/main/scala/gemmini/DMA.scala | 81 ++++++-- src/main/scala/gemmini/DSEConfigs.scala | 27 +-- .../scala/gemmini/ExecuteController.scala | 29 ++- src/main/scala/gemmini/FrontendTLB.scala | 20 +- src/main/scala/gemmini/GemminiConfigs.scala | 189 +++++++++++------- src/main/scala/gemmini/Im2Col.scala | 4 +- src/main/scala/gemmini/LoadController.scala | 11 +- src/main/scala/gemmini/LoopConv.scala | 38 ++-- .../{ROB.scala => ReservationStation.scala} | 56 ++++-- src/main/scala/gemmini/Scratchpad.scala | 39 ++-- src/main/scala/gemmini/StoreController.scala | 14 +- .../gemmini/TransposePreloadUnroller.scala | 1 + src/main/scala/gemmini/XactTracker.scala | 25 ++- 38 files changed, 891 insertions(+), 389 deletions(-) create mode 100755 scripts/build-midas.sh create mode 100755 scripts/build-onnx-inference.sh create mode 100755 scripts/build-onnx-training.sh create mode 100755 scripts/build-spike.sh create mode 100755 scripts/build-vcs.sh create mode 100755 scripts/build-verilator.sh create mode 100755 scripts/run-midas.sh create mode 100755 scripts/run-spike.sh create mode 100755 scripts/run-vcs-debug.sh create mode 100755 scripts/run-vcs.sh create mode 100755 scripts/run-verilator-debug.sh create mode 100755 scripts/run-verilator.sh create mode 100755 scripts/setup-paths.sh create mode 100644 src/main/scala/gemmini/CustomCPUConfigs.scala create mode 100644 src/main/scala/gemmini/CustomConfigs.scala create mode 100644 src/main/scala/gemmini/CustomSoCConfigs.scala rename src/main/scala/gemmini/{ROB.scala => ReservationStation.scala} (87%) diff --git a/.gitignore b/.gitignore index 376625eb..53469249 100644 --- a/.gitignore +++ b/.gitignore @@ -338,3 +338,7 @@ project/plugins/project/ # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml hs_err_pid* +# Gemmini specific +configs/ +generated-src/ + diff --git a/scripts/build-midas.sh b/scripts/build-midas.sh new file mode 100755 index 00000000..cfc2347f --- /dev/null +++ b/scripts/build-midas.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +if [ "$1" == "--help" ]; then + echo usage: $0 DRAM_CONTROLLER_MODEL + echo " " DRAM_CONTROLLER_MODEL: Either DDR3FCFS or DDR3FRFCFS or DDR3FRFCFSLLC4MB + echo " " FCFS is "first come, first serve" + echo " " FRFCFS is "first ready, first come, first serve" + exit +elif [ "$1" == "" ]; then + echo DRAM model must be provided + exit 1 +fi + +cd ../../sims/firesim/ +source sourceme-f1-manager.sh &> build.log + +cd sim/ +make verilator TARGET_CONFIG=${1}_WithDefaultFireSimBridges_WithFireSimConfigTweaks_chipyard.CustomGemminiSoCConfig + diff --git a/scripts/build-onnx-inference.sh b/scripts/build-onnx-inference.sh new file mode 100755 index 00000000..23742f5c --- /dev/null +++ b/scripts/build-onnx-inference.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +cd /root/chipyard/generators/gemmini/software/onnxruntime-riscv/ +rm -rf ./build/ +./build.sh --parallel --enable_training --config=Debug --cmake_extra_defines onnxruntime_USE_SYSTOLIC=ON onnxruntime_SYSTOLIC_INT8=ON onnxruntime_SYSTOLIC_FP32=OFF +cd ./systolic_runner/imagenet_runner/ +./build.sh --parallel --enable_training --config=Debug diff --git a/scripts/build-onnx-training.sh b/scripts/build-onnx-training.sh new file mode 100755 index 00000000..55c9bc7b --- /dev/null +++ b/scripts/build-onnx-training.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +cd /root/chipyard/generators/gemmini/software/onnxruntime-riscv/ +rm -rf ./build/ +./build.sh --parallel --enable_training --config=Debug --cmake_extra_defines onnxruntime_USE_SYSTOLIC=ON onnxruntime_SYSTOLIC_INT8=OFF onnxruntime_SYSTOLIC_FP32=ON +cd ./systolic_runner/imagenet_trainer/ +./build.sh --enable_training diff --git a/scripts/build-spike.sh b/scripts/build-spike.sh new file mode 100755 index 00000000..0b678a0b --- /dev/null +++ b/scripts/build-spike.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +export GEMMINI_ONLY_GENERATE_GEMMINI_H=1 + +cd ../../sims/verilator/ +echo Generating new gemmini_params.h file... +make verilog CONFIG=CustomGemminiSoCConfig &> build.log + +cd - +cp software/gemmini-rocc-tests/include/gemmini_params.h ../../toolchains/esp-tools/riscv-isa-sim/gemmini/gemmini_params.h +cd ../../toolchains/esp-tools/riscv-isa-sim/build +make && make install diff --git a/scripts/build-vcs.sh b/scripts/build-vcs.sh new file mode 100755 index 00000000..d18c7e5e --- /dev/null +++ b/scripts/build-vcs.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +cd ../../sims/vcs/ +make $@ CONFIG=CustomGemminiSoCConfig + diff --git a/scripts/build-verilator.sh b/scripts/build-verilator.sh new file mode 100755 index 00000000..65053fc2 --- /dev/null +++ b/scripts/build-verilator.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +cd ../../sims/verilator/ +make $@ CONFIG=CustomGemminiSoCConfig + diff --git a/scripts/run-midas.sh b/scripts/run-midas.sh new file mode 100755 index 00000000..115125e5 --- /dev/null +++ b/scripts/run-midas.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +if [ "$1" == "--help" ]; then + echo usage: $0 DRAM_CONTROLLER_MODEL binary + echo " " DRAM_CONTROLLER_MODEL: Either DDR3FCFS or DDR3FRFCFS or DDR3FRFCFSLLC4MB + echo " " FCFS is "first come, first serve" + echo " " FRFCFS is "first ready, first come, first serve" + exit +elif [ "$1" == "" ]; then + echo DRAM model must be provided + exit 1 +fi + +path="" +suffix="" + +binary="$2" + +if [ "$binary" == "" ]; then + echo You must provide a binary to run +elif [ -f "software/gemmini-rocc-tests/build/bareMetalC/${binary}-baremetal" ]; then + path="$PWD/software/gemmini-rocc-tests/build/bareMetalC/" + suffix="-baremetal" +elif [ -f "software/gemmini-rocc-tests/build/imagenet/${binary}-baremetal" ]; then + path="$PWD/software/gemmini-rocc-tests/build/imagenet/" + suffix="-baremetal" +elif [ ! -f "$binary" ]; then + echo Binary not found + exit 1 +fi + +gemminidir="$PWD" + +cd ../../sims/firesim/ +source sourceme-f1-manager.sh &> build.log + +cd sim/ + +cd generated-src/f1/FireSim-${1}_WithDefaultFireSimBridges_WithFireSimConfigTweaks_chipyard.CustomGemminiSoCConfig-BaseF1Config + +./VFireSim ${path}${binary}${suffix} \ + +vcs+initreg+0 +vcs+initmem+0 +fesvr-step-size=128 +mm_relaxFunctionalModel_0=0 +mm_openPagePolicy_0=1 +mm_backendLatency_0=2 +mm_schedulerWindowSize_0=8 +mm_transactionQueueDepth_0=8 +mm_dramTimings_tAL_0=0 +mm_dramTimings_tCAS_0=14 +mm_dramTimings_tCMD_0=1 +mm_dramTimings_tCWD_0=10 +mm_dramTimings_tCCD_0=4 +mm_dramTimings_tFAW_0=25 +mm_dramTimings_tRAS_0=33 +mm_dramTimings_tREFI_0=7800 +mm_dramTimings_tRC_0=47 +mm_dramTimings_tRCD_0=14 +mm_dramTimings_tRFC_0=160 +mm_dramTimings_tRRD_0=8 +mm_dramTimings_tRP_0=14 +mm_dramTimings_tRTP_0=8 +mm_dramTimings_tRTRS_0=2 +mm_dramTimings_tWR_0=15 +mm_dramTimings_tWTR_0=8 +mm_rowAddr_offset_0=18 +mm_rowAddr_mask_0=65535 +mm_rankAddr_offset_0=16 +mm_rankAddr_mask_0=3 +mm_bankAddr_offset_0=13 +mm_bankAddr_mask_0=7 +mm_llc_wayBits_0=3 +mm_llc_setBits_0=12 +mm_llc_blockBits_0=7 +mm_llc_activeMSHRs_0=8 +shmemportname0=0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +macaddr0=00:00:00:00:00:02 +niclog0=niclog0 +linklatency0=6405 +netbw0=100 +netburst0=8 +nic-loopback0 +tracefile=TRACEFILE +blkdev-in-mem0=128 +blkdev-log0=blkdev-log0 +autocounter-readrate=1000 +autocounter-filename=AUTOCOUNTERFILE +dramsim +max-cycles=100000000 \ + 2>/dev/null diff --git a/scripts/run-spike.sh b/scripts/run-spike.sh new file mode 100755 index 00000000..b5343e5d --- /dev/null +++ b/scripts/run-spike.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +path="" +suffix="" + +binary="$1" + +if [ "$binary" == "" ]; then + echo You must provide a binary to run +elif [ -f "software/gemmini-rocc-tests/build/bareMetalC/${binary}-baremetal" ]; then + path="software/gemmini-rocc-tests/build/bareMetalC/" + suffix="-baremetal" +elif [ -f "software/gemmini-rocc-tests/build/imagenet/${binary}-baremetal" ]; then + path="software/gemmini-rocc-tests/build/imagenet/" + suffix="-baremetal" +elif [ ! -f "$binary" ]; then + echo Binary not found + exit 1 +fi + +spike --extension=gemmini ${path}${binary}${suffix} + diff --git a/scripts/run-vcs-debug.sh b/scripts/run-vcs-debug.sh new file mode 100755 index 00000000..a0b9b9e1 --- /dev/null +++ b/scripts/run-vcs-debug.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +path="" +suffix="" + +binary="$1" + +if [ "$binary" == "" ]; then + echo You must provide a binary to run +elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/${binary}-baremetal" ]; then + path="../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/" + suffix="-baremetal" +elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/${binary}-baremetal" ]; then + path="../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/" + suffix="-baremetal" +elif [ ! -f "$binary" ]; then + echo Binary not found + exit 1 +fi + +cd ../../sims/vcs/ +./simv-chipyard-CustomGemminiSoCConfig-debug ${path}${binary}${suffix} diff --git a/scripts/run-vcs.sh b/scripts/run-vcs.sh new file mode 100755 index 00000000..ede89561 --- /dev/null +++ b/scripts/run-vcs.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +path="" +suffix="" + +binary="$1" + +if [ "$binary" == "" ]; then + echo You must provide a binary to run +elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/${binary}-baremetal" ]; then + path="../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/" + suffix="-baremetal" +elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/${binary}-baremetal" ]; then + path="../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/" + suffix="-baremetal" +elif [ ! -f "$binary" ]; then + echo Binary not found + exit 1 +fi + +cd ../../sims/vcs/ +./simv-chipyard-CustomGemminiSoCConfig ${path}${binary}${suffix} + diff --git a/scripts/run-verilator-debug.sh b/scripts/run-verilator-debug.sh new file mode 100755 index 00000000..f856429b --- /dev/null +++ b/scripts/run-verilator-debug.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +path="" +suffix="" + +binary="$1" + +if [ "$binary" == "" ]; then + echo You must provide a binary to run +elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/${binary}-baremetal" ]; then + path="../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/" + suffix="-baremetal" +elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/${binary}-baremetal" ]; then + path="../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/" + suffix="-baremetal" +elif [ ! -f "$binary" ]; then + echo Binary not found + exit 1 +fi + +cd ../../sims/verilator/ +./simulator-chipyard-CustomGemminiSoCConfig-debug -v waveform.vcd ${path}${binary}${suffix} + diff --git a/scripts/run-verilator.sh b/scripts/run-verilator.sh new file mode 100755 index 00000000..6e4ede18 --- /dev/null +++ b/scripts/run-verilator.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +path="" +suffix="" + +binary="$1" + +if [ "$binary" == "" ]; then + echo You must provide a binary to run +elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/${binary}-baremetal" ]; then + path="../../generators/gemmini/software/gemmini-rocc-tests/build/bareMetalC/" + suffix="-baremetal" +elif [ -f "../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/${binary}-baremetal" ]; then + path="../../generators/gemmini/software/gemmini-rocc-tests/build/imagenet/" + suffix="-baremetal" +elif [ ! -f "$binary" ]; then + echo Binary not found + exit 1 +fi + +cd ../../sims/verilator/ +./simulator-chipyard-CustomGemminiSoCConfig ${path}${binary}${suffix} + diff --git a/scripts/setup-paths.sh b/scripts/setup-paths.sh new file mode 100755 index 00000000..a4e5dfd0 --- /dev/null +++ b/scripts/setup-paths.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +if [ ! -d configs ]; then + mkdir configs/ +fi + +if [ ! -d generated-src ]; then + mkdir generated-src/ +fi + +if [ ! -f configs/GemminiDefaultConfigs.scala ]; then + ln -s $PWD/src/main/scala/gemmini/Configs.scala configs/GemminiDefaultConfigs.scala +fi + +if [ ! -f configs/GemminiCustomConfigs.scala ]; then + ln -s $PWD/src/main/scala/gemmini/CustomConfigs.scala configs/GemminiCustomConfigs.scala +fi + +if [ ! -f configs/CPUConfigs.scala ]; then + sed '1,1d; $d' $PWD/src/main/scala/gemmini/CustomCPUConfigs.scala > ../chipyard/src/main/scala/config/GemminiCPUConfigs.scala + ln -s $PWD/../chipyard/src/main/scala/config/GemminiCPUConfigs.scala configs/CPUConfigs.scala +fi + +if [ ! -f configs/SoCConfigs.scala ]; then + sed '1,1d; $d' $PWD/src/main/scala/gemmini/CustomSoCConfigs.scala > ../chipyard/src/main/scala/config/GemminiSoCConfigs.scala + ln -s $PWD/../chipyard/src/main/scala/config/GemminiSoCConfigs.scala configs/SoCConfigs.scala +fi + +if [ ! -f generated-src/verilator ] && [ ! -d generated-src/verilator ]; then + ln -s $PWD/../../sims/verilator/generated-src/ generated-src/verilator 2>/dev/null +fi + +if [ ! -f generated-src/vcs ] && [ ! -d generated-src/vcs ]; then + ln -s $PWD/../../sims/vcs/generated-src/ generated-src/vcs 2>/dev/null +fi + diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index fff5ae7a..3aaa2307 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit fff5ae7a1e3770f7e18b9675784185e9dd9d8d55 +Subproject commit 3aaa230733a9eba6edf4d14243d84595e017522f diff --git a/src/main/scala/gemmini/AccumulatorMem.scala b/src/main/scala/gemmini/AccumulatorMem.scala index 4d7d1e46..89a39182 100644 --- a/src/main/scala/gemmini/AccumulatorMem.scala +++ b/src/main/scala/gemmini/AccumulatorMem.scala @@ -53,8 +53,8 @@ class AccumulatorMemIO [T <: Data: Arithmetic, U <: Data](n: Int, t: Vec[Vec[T]] } class AccumulatorMem[T <: Data, U <: Data]( - n: Int, t: Vec[Vec[T]], scale_args: ScaleArguments[T, U], - acc_singleported: Boolean, num_acc_sub_banks: Int + n: Int, t: Vec[Vec[T]], scale_func: (T, U) => T, scale_t: U, + acc_singleported: Boolean, acc_sub_banks: Int ) (implicit ev: Arithmetic[T]) extends Module { // TODO Do writes in this module work with matrices of size 2? If we try to read from an address right after writing @@ -69,7 +69,7 @@ class AccumulatorMem[T <: Data, U <: Data]( import ev._ // TODO unify this with TwoPortSyncMemIO - val io = IO(new AccumulatorMemIO(n, t, scale_args.multiplicand_t)) + val io = IO(new AccumulatorMemIO(n, t, scale_t)) // For any write operation, we spend 2 cycles reading the existing address out, buffering it in a register, and then @@ -109,10 +109,10 @@ class AccumulatorMem[T <: Data, U <: Data]( reads(1).bits := io.read.req.bits.addr reads(1).ready := true.B block_read_req := !reads(1).ready - for (i <- 0 until num_acc_sub_banks) { - def isThisBank(addr: UInt) = addr(log2Ceil(num_acc_sub_banks)-1,0) === i.U - def getBankIdx(addr: UInt) = addr >> log2Ceil(num_acc_sub_banks) - val mem = SyncReadMem(n / num_acc_sub_banks, Vec(mask_len, mask_elem)) + for (i <- 0 until acc_sub_banks) { + def isThisBank(addr: UInt) = addr(log2Ceil(acc_sub_banks)-1,0) === i.U + def getBankIdx(addr: UInt): UInt = (addr >> log2Ceil(acc_sub_banks)).asUInt() + val mem = SyncReadMem(n / acc_sub_banks, Vec(mask_len, mask_elem)) val ren = WireInit(false.B) val raddr = WireInit(getBankIdx(reads(0).bits)) @@ -123,7 +123,7 @@ class AccumulatorMem[T <: Data, U <: Data]( val valid = Bool() val data = Vec(mask_len, mask_elem) val mask = Vec(mask_len, Bool()) - val addr = UInt(log2Ceil(n/num_acc_sub_banks).W) + val addr = UInt(log2Ceil(n/acc_sub_banks).W) override def cloneType: this.type = new W_Q_Entry(mask_len, mask_elem).asInstanceOf[this.type] } val w_q = Reg(Vec(nEntries, new W_Q_Entry(mask_len, mask_elem))) @@ -134,6 +134,7 @@ class AccumulatorMem[T <: Data, U <: Data]( isThisBank(io.write.bits.addr) && getBankIdx(io.write.bits.addr) === e.addr && ((io.write.bits.mask.asUInt & e.mask.asUInt) =/= 0.U) )) + when (io.read.req.valid && isThisBank(io.read.req.bits.addr) && getBankIdx(io.read.req.bits.addr) === e.addr) { reads(1).ready := false.B } @@ -149,7 +150,7 @@ class AccumulatorMem[T <: Data, U <: Data]( val wmask = Mux1H(w_q_head.asBools, w_q.map(_.mask)) val waddr = Mux1H(w_q_head.asBools, w_q.map(_.addr)) when (wen) { - w_q_head := w_q_head << 1 | w_q_head(nEntries-1) + w_q_head := (w_q_head << 1).asUInt() | w_q_head(nEntries-1) for (i <- 0 until nEntries) { when (w_q_head(i)) { w_q(i).valid := false.B @@ -159,7 +160,7 @@ class AccumulatorMem[T <: Data, U <: Data]( when (w_buf_valid && isThisBank(waddr_buf)) { assert(!((w_q_tail.asBools zip w_q.map(_.valid)).map({ case (h,v) => h && v }).reduce(_||_))) - w_q_tail := w_q_tail << 1 | w_q_tail(nEntries-1) + w_q_tail := (w_q_tail << 1).asUInt() | w_q_tail(nEntries-1) for (i <- 0 until nEntries) { when (w_q_tail(i)) { w_q(i).valid := true.B @@ -198,7 +199,7 @@ class AccumulatorMem[T <: Data, U <: Data]( } } - val q = Module(new Queue(new AccumulatorReadResp(t, scale_args.multiplicand_t, log2Ceil(t.head.head.getWidth)), 1, true, true)) + val q = Module(new Queue(new AccumulatorReadResp(t, scale_t, log2Ceil(t.head.head.getWidth)), 1, true, true)) q.io.enq.bits.data := read_rdata q.io.enq.bits.scale := RegNext(io.read.req.bits.scale) q.io.enq.bits.relu6_shift := RegNext(io.read.req.bits.relu6_shift) @@ -228,8 +229,6 @@ class AccumulatorMem[T <: Data, U <: Data]( !block_read_req ) - // io.write.current_waddr.valid := mem.io.wen - // io.write.current_waddr.bits := mem.io.waddr io.write.ready := !io.write.bits.acc || (!(io.write.bits.addr === waddr_buf && w_buf_valid) && !(io.write.bits.addr === RegNext(io.write.bits.addr) && RegNext(io.write.fire()))) diff --git a/src/main/scala/gemmini/AccumulatorScale.scala b/src/main/scala/gemmini/AccumulatorScale.scala index 2069bc66..5e4997f8 100644 --- a/src/main/scala/gemmini/AccumulatorScale.scala +++ b/src/main/scala/gemmini/AccumulatorScale.scala @@ -30,7 +30,7 @@ class AccumulatorScaleIO[T <: Data: Arithmetic, U <: Data]( shift_width, rDataType).asInstanceOf[this.type] } -class AccScaleDataWithIndex[T <: Data: Arithmetic, U <: Data](t: T, u: U, scale_args: ScaleArguments[T, U]) extends Bundle { +class AccScaleDataWithIndex[T <: Data: Arithmetic, U <: Data](t: T, u: U) extends Bundle { val shift_width = log2Ceil(t.getWidth) val scale = u.cloneType @@ -40,24 +40,23 @@ class AccScaleDataWithIndex[T <: Data: Arithmetic, U <: Data](t: T, u: U, scale_ val full_data = t.cloneType val id = UInt(2.W) // TODO hardcoded val index = UInt() - override def cloneType: this.type = new AccScaleDataWithIndex(t, u, scale_args: ScaleArguments[T, U]).asInstanceOf[this.type] + override def cloneType: this.type = new AccScaleDataWithIndex(t, u).asInstanceOf[this.type] } -class AccScalePipe[T <: Data : Arithmetic, U <: Data](t: T, rDataType: Vec[Vec[T]], scale_args: ScaleArguments[T, U])(implicit ev: Arithmetic[T]) extends Module { - val u = scale_args.multiplicand_t +class AccScalePipe[T <: Data : Arithmetic, U <: Data](t: T, rDataType: Vec[Vec[T]], scale_func: (T, U) => T, scale_t: U, latency: Int, has_nonlinear_activations: Boolean)(implicit ev: Arithmetic[T]) extends Module { + val u = scale_t val io = IO(new Bundle { - val in = Input(Valid(new AccScaleDataWithIndex(t, u, scale_args)(ev))) - val out = Output(Valid(new AccScaleDataWithIndex(t, u, scale_args)(ev))) + val in = Input(Valid(new AccScaleDataWithIndex(t, u)(ev))) + val out = Output(Valid(new AccScaleDataWithIndex(t, u)(ev))) }) import ev._ - val latency = scale_args.latency val out = WireInit(io.in) - val e_scaled = scale_args.scale_func(io.in.bits.data, io.in.bits.scale) + val e_scaled = scale_func(io.in.bits.data, io.in.bits.scale) val e_clipped = e_scaled.clippedToWidthOf(rDataType.head.head) val e_act = MuxCase(e_clipped, Seq( - (io.in.bits.act === Activation.RELU) -> e_clipped.relu, - (io.in.bits.act === Activation.RELU6) -> e_clipped.relu6(io.in.bits.relu6_shift))) + (has_nonlinear_activations.B && io.in.bits.act === Activation.RELU) -> e_clipped.relu, + (has_nonlinear_activations.B && io.in.bits.act === Activation.RELU6) -> e_clipped.relu6(io.in.bits.relu6_shift))) out.bits.data := e_act io.out := Pipe(out, latency) @@ -68,9 +67,13 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data]( fullDataType: Vec[Vec[T]], rDataType: Vec[Vec[T]], scale_t: U, shift_width: Int, read_small_data: Boolean, read_full_data: Boolean, - scale_args: ScaleArguments[T, U])(implicit ev: Arithmetic[T]) extends Module { + scale_func: (T, U) => T, + num_scale_units: Int, + latency: Int, + has_nonlinear_activations: Boolean)(implicit ev: Arithmetic[T]) extends Module { import ev._ + val io = IO(new AccumulatorScaleIO[T,U]( fullDataType, scale_t, shift_width, rDataType )(ev)) @@ -78,9 +81,6 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data]( val out = Wire(Decoupled(new AccumulatorScaleResp[T]( fullDataType, rDataType)(ev))) - val num_scale_units = scale_args.num_scale_units - val acc_scale_latency = scale_args.latency - if (num_scale_units == -1) { val in = Wire(Decoupled(new AccumulatorReadRespWithFullData(fullDataType, scale_t, shift_width)(ev))) in.valid := io.in.valid @@ -88,11 +88,10 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data]( in.bits.resp := io.in.bits in.bits.full_data := io.in.bits.data - val pipe_out = Pipeline(in, acc_scale_latency, Seq.fill(acc_scale_latency)((x: AccumulatorReadRespWithFullData[T,U]) => x) :+ { + val pipe_out = Pipeline(in, latency, Seq.fill(latency)((x: AccumulatorReadRespWithFullData[T,U]) => x) :+ { x: AccumulatorReadRespWithFullData[T,U] => val activated_rdata = VecInit(x.resp.data.map(v => VecInit(v.map { e => - // val e_scaled = e >> x.shiftls - val e_scaled = scale_args.scale_func(e, x.resp.scale) + val e_scaled = scale_func(e, x.resp.scale) val e_clipped = e_scaled.clippedToWidthOf(rDataType.head.head) val e_act = MuxCase(e_clipped, Seq( (x.resp.act === Activation.RELU) -> e_clipped.relu, @@ -148,7 +147,7 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data]( tail_oh := (tail_oh << 1) | tail_oh(nEntries-1) } - val inputs = Seq.fill(width*nEntries) { Wire(Decoupled(new AccScaleDataWithIndex(t, scale_t, scale_args)(ev))) } + val inputs = Seq.fill(width*nEntries) { Wire(Decoupled(new AccScaleDataWithIndex(t, scale_t)(ev))) } for (i <- 0 until nEntries) { for (w <- 0 until width) { @@ -168,16 +167,16 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data]( } for (i <- 0 until num_scale_units) { val arbIn = inputs.zipWithIndex.filter({ case (_, w) => w % num_scale_units == i }).map(_._1) - val arb = Module(new RRArbiter(new AccScaleDataWithIndex(t, scale_t, scale_args)(ev), arbIn.length)) + val arb = Module(new RRArbiter(new AccScaleDataWithIndex(t, scale_t)(ev), arbIn.length)) arb.io.in <> arbIn arb.io.out.ready := true.B - val arbOut = Reg(Valid(new AccScaleDataWithIndex(t, scale_t, scale_args)(ev))) + val arbOut = Reg(Valid(new AccScaleDataWithIndex(t, scale_t)(ev))) arbOut.valid := arb.io.out.valid arbOut.bits := arb.io.out.bits when (reset.asBool) { arbOut.valid := false.B } - val pipe = Module(new AccScalePipe(t, rDataType, scale_args)(ev, ev)) + val pipe = Module(new AccScalePipe(t, rDataType, scale_func, scale_t, latency, has_nonlinear_activations)(ev, ev)) pipe.io.in := arbOut val pipe_out = pipe.io.out diff --git a/src/main/scala/gemmini/Arithmetic.scala b/src/main/scala/gemmini/Arithmetic.scala index 9170b834..a6684ec7 100644 --- a/src/main/scala/gemmini/Arithmetic.scala +++ b/src/main/scala/gemmini/Arithmetic.scala @@ -7,6 +7,14 @@ import chisel3._ import chisel3.util._ import hardfloat._ +// Bundles that represent the raw bits of custom datatypes +case class Float(expWidth: Int, sigWidth: Int) extends Bundle { + val bits = UInt((expWidth + sigWidth).W) + + val bias: Int = (1 << (expWidth-1)) - 1 +} + +// The Arithmetic typeclass which implements various arithmetic operations on custom datatypes abstract class Arithmetic[T <: Data] { implicit def cast(t: T): ArithmeticOps[T] } @@ -248,30 +256,6 @@ object Arithmetic { val result = Wire(Float(self.expWidth, self.sigWidth)) result.bits := fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out) result - - /* - val raw = rawFloatFromFN(self.expWidth, self.sigWidth, self.bits) - - val shifted_raw = WireInit(raw) - - when (!raw.isZero) { - shifted_raw.sExp := raw.sExp - u.asSInt() - } - - val raw_to_rec_fn_converter = Module(new RoundRawFNToRecFN(self.expWidth, self.sigWidth, options = 0)) // TODO add correct options here so that efficiency may be improved - - raw_to_rec_fn_converter.io.invalidExc := false.B - raw_to_rec_fn_converter.io.infiniteExc := false.B - - raw_to_rec_fn_converter.io.in := shifted_raw - - raw_to_rec_fn_converter.io.roundingMode := consts.round_near_maxMag - raw_to_rec_fn_converter.io.detectTininess := consts.tininess_afterRounding - - val result = Wire(Float(self.expWidth, self.sigWidth)) - result.bits := fNFromRecFN(self.expWidth, self.sigWidth, raw_to_rec_fn_converter.io.out) - result - */ } override def >(t: Float): Bool = { @@ -357,25 +341,6 @@ object Arithmetic { val shifted_rec = muladder.io.out - /* - val six_raw = rawFloatFromIN(signedIn = false.B, in = 6.U(3.W)) - - val shifted_raw = WireInit(six_raw) - - when (!six_raw.isZero) { - shifted_raw.sExp := six_raw.sExp + shift.asSInt() - } - - val raw_to_rec_fn_converter = Module(new RoundRawFNToRecFN(self.expWidth, self.sigWidth, options = 0)) // TODO add correct options here so that efficiency may be improved - raw_to_rec_fn_converter.io.in := shifted_raw - raw_to_rec_fn_converter.io.roundingMode := consts.round_near_maxMag - raw_to_rec_fn_converter.io.detectTininess := consts.tininess_afterRounding - raw_to_rec_fn_converter.io.invalidExc := false.B - raw_to_rec_fn_converter.io.infiniteExc := false.B - - val shifted_rec = raw_to_rec_fn_converter.io.out - */ - // Now, compare self and 6*(2^shift) to calculate the activation function val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits) val self_raw = rawFloatFromFN(self.expWidth, self.sigWidth, self.bits) @@ -400,9 +365,3 @@ object Arithmetic { } } } - -case class Float(expWidth: Int, sigWidth: Int) extends Bundle { - val bits = UInt((expWidth + sigWidth).W) - - val bias: Int = (1 << (expWidth-1)) - 1 -} diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala index 60f5f7b4..2e172adf 100644 --- a/src/main/scala/gemmini/Configs.scala +++ b/src/main/scala/gemmini/Configs.scala @@ -15,44 +15,55 @@ import hardfloat._ object GemminiConfigs { val defaultConfig = GemminiArrayConfig[SInt, Float, Float]( - opcodes = OpcodeSet.custom3, + // Datatypes + inputType = SInt(8.W), + accType = SInt(32.W), + + spatialArrayOutputType = SInt(20.W), + // Spatial array size options tileRows = 1, tileColumns = 1, meshRows = 16, meshColumns = 16, + // Spatial array PE options + dataflow = Dataflow.BOTH, + + // Scratchpad and accumulator + sp_capacity = CapacityInKilobytes(256), + acc_capacity = CapacityInKilobytes(64), + + sp_banks = 4, + acc_banks = 2, + + sp_singleported = true, + acc_singleported = false, + + // DNN options + has_training_convs = true, + has_max_pool = true, + has_nonlinear_activations = true, + + // Reservation station entries + reservation_station_full_entries = 16, + reservation_station_partial_entries = 8, + + // Ld/Ex/St instruction queue lengths ld_queue_length = 8, st_queue_length = 2, ex_queue_length = 8, - rob_full_entries = 16, - rob_partial_entries = 8, + // DMA options + max_in_flight_mem_reqs = 16, - hasIm2col = false, //declare im2col block + dma_maxbytes = 64, + dma_buswidth = 128, - sp_banks = 4, - sp_singleported = true, - acc_banks = 2, - acc_singleported = false, - num_acc_sub_banks = -1, - sp_capacity = CapacityInKilobytes(256), - shifter_banks = 1, // TODO add separate parameters for left and up shifter banks - dataflow = Dataflow.BOTH, - acc_capacity = CapacityInKilobytes(64), - mem_pipeline = 4, - dma_maxbytes = 64, // TODO get this from cacheblockbytes - dma_buswidth = 128, // TODO get this from SystemBusKey - aligned_to = 1, + // TLB options tlb_size = 4, - use_tlb_register_filter = true, - max_in_flight_reqs = 16, - use_dedicated_tl_port = false, - - inputType = SInt(8.W), - outputType = SInt(20.W), - accType = SInt(32.W), + // Mvin and Accumulator scalar multiply options mvin_scale_args = Some(ScaleArguments( (t: SInt, f: Float) => { val f_rec = recFNFromFN(f.expWidth, f.sigWidth, f.bits) @@ -91,10 +102,11 @@ object GemminiConfigs { identity = "1.0", c_str = "({float y = ROUND_NEAR_EVEN((x) * (scale)); y > INT8_MAX ? INT8_MAX : (y < INT8_MIN ? INT8_MIN : (elem_t)y);})" )), + mvin_scale_acc_args = None, mvin_scale_shared = false, - acc_scale_args = ScaleArguments( + acc_scale_args = Some(ScaleArguments( (t: SInt, f: Float) => { val f_rec = recFNFromFN(f.expWidth, f.sigWidth, f.bits) @@ -128,40 +140,37 @@ object GemminiConfigs { Mux(overflow, sat, rec_fn_to_in.io.out.asTypeOf(t)) }, - 1, Float(8, 24), -1, // TODO pipelining should be 5 + 1, Float(8, 24), -1, identity = "1.0", c_str = "({float y = ROUND_NEAR_EVEN((x) * (scale)); y > INT8_MAX ? INT8_MAX : (y < INT8_MIN ? INT8_MIN : (acc_t)y);})" - ), + )), + + // SoC counters options + num_counter = 8, + // Scratchpad and Accumulator input/output options acc_read_full_width = true, acc_read_small_width = true, - pe_latency = 0, - ex_read_from_spad = true, ex_read_from_acc = true, ex_write_to_spad = true, ex_write_to_acc = true, - - hardcode_d_to_garbage_addr = false, - - mesh_output_delay = 1, - - num_counter = 8, ) val chipConfig = defaultConfig.copy(sp_capacity=CapacityInKilobytes(64), acc_capacity=CapacityInKilobytes(32), dataflow=Dataflow.WS, - acc_scale_args=defaultConfig.acc_scale_args.copy(latency=4), + acc_scale_args=Some(defaultConfig.acc_scale_args.get.copy(latency=4)), acc_singleported=true, - num_acc_sub_banks=2, + acc_sub_banks=2, ex_read_from_acc=false, ex_write_to_spad=false ) + val largeChipConfig = chipConfig.copy(sp_capacity=CapacityInKilobytes(128), acc_capacity=CapacityInKilobytes(64), meshRows=32, meshColumns=32 ) - val leanConfig = defaultConfig.copy(dataflow=Dataflow.WS, max_in_flight_reqs = 64, acc_read_full_width = false, ex_read_from_acc = false, ex_write_to_spad = false, hardcode_d_to_garbage_addr = true) + val leanConfig = defaultConfig.copy(dataflow=Dataflow.WS, max_in_flight_mem_reqs = 64, acc_read_full_width = false, ex_read_from_acc = false, ex_write_to_spad = false, hardcode_d_to_garbage_addr = true) } /** diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala index 111041fd..a54c2853 100644 --- a/src/main/scala/gemmini/ConfigsFP.scala +++ b/src/main/scala/gemmini/ConfigsFP.scala @@ -24,41 +24,39 @@ object GemminiFPConfigs { st_queue_length = 2, ex_queue_length = 8, - rob_full_entries = 16, - rob_partial_entries = 8, - - hasIm2col = false, + reservation_station_full_entries = 16, + reservation_station_partial_entries = 8, sp_banks = 4, sp_singleported = true, acc_banks = 1, acc_singleported = false, - num_acc_sub_banks = -1, + acc_sub_banks = -1, sp_capacity = CapacityInKilobytes(256), shifter_banks = 1, // TODO add separate parameters for left and up shifter banks dataflow = Dataflow.BOTH, acc_capacity = CapacityInKilobytes(64), - mem_pipeline = 1, + spad_read_delay = 1, dma_maxbytes = 64, // TODO get this from cacheblockbytes dma_buswidth = 128, // TODO get this from SystemBusKey aligned_to = 1, tlb_size = 4, use_tlb_register_filter = true, - max_in_flight_reqs = 16, + max_in_flight_mem_reqs = 16, use_dedicated_tl_port = false, inputType = Float(8, 24), - outputType = Float(8, 24), + spatialArrayOutputType = Float(8, 24), accType = Float(8, 24), mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_shared = false, - acc_scale_args = ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", + acc_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str = "((x) * (scale))" - ), + )), acc_read_full_width = true, acc_read_small_width = true, @@ -73,32 +71,36 @@ object GemminiFPConfigs { mesh_output_delay = 0, + has_training_convs = false, + has_max_pool = true, + has_nonlinear_activations = true, + num_counter = 8, ) //FP32 Single Precision Configuration - val FP32DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 24), outputType = Float(8, 24), accType = Float(8, 24), + val FP32DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 24), spatialArrayOutputType = Float(8, 24), accType = Float(8, 24), pe_latency = 2, mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), ) //FP16 Half Precision Configuration - val FP16DefaultConfig = defaultFPConfig.copy(inputType = Float(5, 11), outputType = Float(5, 11), accType = Float(8, 24), + val FP16DefaultConfig = defaultFPConfig.copy(inputType = Float(5, 11), spatialArrayOutputType = Float(5, 11), accType = Float(8, 24), pe_latency = 2, mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(5, 11), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(5, 11), -1, identity = "1.0", c_str="((x) * (scale))")), ) //Bfloat16 Brain-half Precision Configuration - val BF16DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 8), outputType = Float(8, 8), accType = Float(8, 24), + val BF16DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 8), spatialArrayOutputType = Float(8, 8), accType = Float(8, 24), pe_latency = 2, mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), ) //Bfloat16 Brain-half Precision Configuration 8x8 array - val BF16Default8Config = defaultFPConfig.copy(inputType = Float(8, 8), outputType = Float(8, 8), accType = Float(8, 24), + val BF16Default8Config = defaultFPConfig.copy(inputType = Float(8, 8), spatialArrayOutputType = Float(8, 8), accType = Float(8, 24), meshRows = 8, meshColumns = 8, pe_latency = 2, mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala index e63d7451..08481a5c 100644 --- a/src/main/scala/gemmini/Controller.scala +++ b/src/main/scala/gemmini/Controller.scala @@ -27,6 +27,9 @@ class Gemmini[T <: Data : Arithmetic, U <: Data, V <: Data](val config: GemminiA nPTWPorts = 1) { Files.write(Paths.get(config.headerFilePath), config.generateHeader().getBytes(StandardCharsets.UTF_8)) + if (System.getenv("GEMMINI_ONLY_GENERATE_GEMMINI_H") == "1") { + System.exit(1) + } val xLen = p(XLen) val spad = LazyModule(new Scratchpad(config)) @@ -59,19 +62,13 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] // TLB implicit val edge = outer.node.edges.out.head - val tlb = Module(new FrontendTLB(2, tlb_size, dma_maxbytes)) + val tlb = Module(new FrontendTLB(2, tlb_size, dma_maxbytes, use_tlb_register_filter, use_firesim_simulation_counters)) (tlb.io.clients zip outer.spad.module.io.tlb).foreach(t => t._1 <> t._2) tlb.io.exp.flush_skip := false.B tlb.io.exp.flush_retry := false.B counters.io.event_io.collect(tlb.io.counter) io.ptw.head <> tlb.io.ptw - /*io.ptw.head.req <> tlb.io.ptw.req - tlb.io.ptw.resp <> io.ptw.head.resp - tlb.io.ptw.ptbr := io.ptw.head.ptbr - tlb.io.ptw.status := outer.spad.module.io.mstatus - tlb.io.ptw.pmp := io.ptw.head.pmp - tlb.io.ptw.customCSRs := io.ptw.head.customCSRs*/ spad.module.io.flush := tlb.io.exp.flush() @@ -114,32 +111,28 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] val unrolled_cmd = LoopUnroller(raw_risc_cmd, outer.config.meshRows * outer.config.tileRows) */ - // Incoming commands and ROB - val rob = Module(new ROB(outer.config, new RoCCCommand)) - counters.io.event_io.collect(rob.io.counter) + // Incoming commands and reservation station + val reservation_station = Module(new ReservationStation(outer.config, new RoCCCommand)) + counters.io.event_io.collect(reservation_station.io.counter) val raw_cmd = Queue(io.cmd) - val max_lds = rob_partial_entries - val max_exs = rob_full_entries - val max_sts = rob_partial_entries / 2 + val max_lds = reservation_station_partial_entries + val max_exs = reservation_station_full_entries + val max_sts = reservation_station_partial_entries / 2 // TODO replace 4,12,2 with parameters based on ROB size - val (conv_cmd, loop_conv_unroller_busy) = LoopConv(raw_cmd, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization, + val (conv_cmd, loop_conv_unroller_busy) = LoopConv(raw_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization, meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries, inputType.getWidth, accType.getWidth, dma_maxbytes, new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits), new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ConfigMvoutRs2(acc_scale_t_bits, 32), new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t), new ConfigExRs1(acc_scale_t_bits), new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t), - new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t)) - - // val (compressed_cmd, compressor_busy) = InstCompressor(unrolled_cmd) - // compressed_cmd.ready := false.B - - // val (unrolled_cmd, loop_matmul_unroller_busy) = LoopMatmul(unrolled_cmd_after_conv, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization, + new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), + has_training_convs, has_max_pool) - val (loop_cmd, loop_matmul_unroller_busy) = LoopMatmul(conv_cmd, rob.io.ld_utilization, rob.io.st_utilization, rob.io.ex_utilization, + val (loop_cmd, loop_matmul_unroller_busy) = LoopMatmul(conv_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization, meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries, inputType.getWidth, accType.getWidth, dma_maxbytes, new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t), new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t), @@ -150,19 +143,9 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] unrolled_cmd.ready := false.B counters.io.event_io.connectEventSignal(CounterEvent.LOOP_MATMUL_ACTIVE_CYCLES, loop_matmul_unroller_busy) - // val cmd_decompressor = Module(new InstDecompressor(rob_entries)) - - // cmd_decompressor.io.in.valid := rob.io.issue.ex.valid - // cmd_decompressor.io.in.bits.cmd := rob.io.issue.ex.cmd - // cmd_decompressor.io.in.bits.rob_id := rob.io.issue.ex.rob_id - // rob.io.issue.ex.ready := cmd_decompressor.io.in.ready - - // val decompressed_cmd = cmd_decompressor.io.out - // Wire up controllers to ROB - rob.io.alloc.valid := false.B - // rob.io.alloc.bits := compressed_cmd.bits - rob.io.alloc.bits := unrolled_cmd.bits + reservation_station.io.alloc.valid := false.B + reservation_station.io.alloc.bits := unrolled_cmd.bits /* //------------------------------------------------------------------------- @@ -196,9 +179,9 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] tiler.io.issue.exec.ready := false.B */ - rob.io.issue.ld.ready := false.B - rob.io.issue.st.ready := false.B - rob.io.issue.ex.ready := false.B + reservation_station.io.issue.ld.ready := false.B + reservation_station.io.issue.st.ready := false.B + reservation_station.io.issue.ex.ready := false.B /* when (is_cisc_mode) { @@ -227,23 +210,23 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] } */ - load_controller.io.cmd.valid := rob.io.issue.ld.valid - rob.io.issue.ld.ready := load_controller.io.cmd.ready - load_controller.io.cmd.bits.cmd := rob.io.issue.ld.cmd - load_controller.io.cmd.bits.cmd.inst.funct := rob.io.issue.ld.cmd.inst.funct - load_controller.io.cmd.bits.rob_id.push(rob.io.issue.ld.rob_id) + load_controller.io.cmd.valid := reservation_station.io.issue.ld.valid + reservation_station.io.issue.ld.ready := load_controller.io.cmd.ready + load_controller.io.cmd.bits.cmd := reservation_station.io.issue.ld.cmd + load_controller.io.cmd.bits.cmd.inst.funct := reservation_station.io.issue.ld.cmd.inst.funct + load_controller.io.cmd.bits.rob_id.push(reservation_station.io.issue.ld.rob_id) - store_controller.io.cmd.valid := rob.io.issue.st.valid - rob.io.issue.st.ready := store_controller.io.cmd.ready - store_controller.io.cmd.bits.cmd := rob.io.issue.st.cmd - store_controller.io.cmd.bits.cmd.inst.funct := rob.io.issue.st.cmd.inst.funct - store_controller.io.cmd.bits.rob_id.push(rob.io.issue.st.rob_id) + store_controller.io.cmd.valid := reservation_station.io.issue.st.valid + reservation_station.io.issue.st.ready := store_controller.io.cmd.ready + store_controller.io.cmd.bits.cmd := reservation_station.io.issue.st.cmd + store_controller.io.cmd.bits.cmd.inst.funct := reservation_station.io.issue.st.cmd.inst.funct + store_controller.io.cmd.bits.rob_id.push(reservation_station.io.issue.st.rob_id) - ex_controller.io.cmd.valid := rob.io.issue.ex.valid - rob.io.issue.ex.ready := ex_controller.io.cmd.ready - ex_controller.io.cmd.bits.cmd := rob.io.issue.ex.cmd - ex_controller.io.cmd.bits.cmd.inst.funct := rob.io.issue.ex.cmd.inst.funct - ex_controller.io.cmd.bits.rob_id.push(rob.io.issue.ex.rob_id) + ex_controller.io.cmd.valid := reservation_station.io.issue.ex.valid + reservation_station.io.issue.ex.ready := ex_controller.io.cmd.ready + ex_controller.io.cmd.bits.cmd := reservation_station.io.issue.ex.cmd + ex_controller.io.cmd.bits.cmd.inst.funct := reservation_station.io.issue.ex.cmd.inst.funct + ex_controller.io.cmd.bits.rob_id.push(reservation_station.io.issue.ex.rob_id) // Wire up scratchpad to controllers spad.module.io.dma.read <> load_controller.io.dma @@ -284,9 +267,9 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] } // Wire up controllers to ROB - rob.io.alloc.valid := false.B + reservation_station.io.alloc.valid := false.B // rob.io.alloc.bits := compressed_cmd.bits - rob.io.alloc.bits := unrolled_cmd.bits + reservation_station.io.alloc.bits := unrolled_cmd.bits /* //========================================================================= @@ -309,28 +292,28 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] //------------------------------------------------------------------------- // risc - val rob_completed_arb = Module(new Arbiter(UInt(log2Up(rob_entries).W), 3)) + val reservation_station_completed_arb = Module(new Arbiter(UInt(log2Up(rob_entries).W), 3)) - rob_completed_arb.io.in(0).valid := ex_controller.io.completed.valid - rob_completed_arb.io.in(0).bits := ex_controller.io.completed.bits + reservation_station_completed_arb.io.in(0).valid := ex_controller.io.completed.valid + reservation_station_completed_arb.io.in(0).bits := ex_controller.io.completed.bits - rob_completed_arb.io.in(1) <> load_controller.io.completed - rob_completed_arb.io.in(2) <> store_controller.io.completed + reservation_station_completed_arb.io.in(1) <> load_controller.io.completed + reservation_station_completed_arb.io.in(2) <> store_controller.io.completed // mux with cisc frontend arbiter - rob_completed_arb.io.in(0).valid := ex_controller.io.completed.valid // && !is_cisc_mode - rob_completed_arb.io.in(1).valid := load_controller.io.completed.valid // && !is_cisc_mode - rob_completed_arb.io.in(2).valid := store_controller.io.completed.valid // && !is_cisc_mode + reservation_station_completed_arb.io.in(0).valid := ex_controller.io.completed.valid // && !is_cisc_mode + reservation_station_completed_arb.io.in(1).valid := load_controller.io.completed.valid // && !is_cisc_mode + reservation_station_completed_arb.io.in(2).valid := store_controller.io.completed.valid // && !is_cisc_mode - rob.io.completed.valid := rob_completed_arb.io.out.valid - rob.io.completed.bits := rob_completed_arb.io.out.bits - rob_completed_arb.io.out.ready := true.B + reservation_station.io.completed.valid := reservation_station_completed_arb.io.out.valid + reservation_station.io.completed.bits := reservation_station_completed_arb.io.out.bits + reservation_station_completed_arb.io.out.ready := true.B // Wire up global RoCC signals - io.busy := raw_cmd.valid || loop_conv_unroller_busy || loop_matmul_unroller_busy || rob.io.busy || spad.module.io.busy || unrolled_cmd.valid || loop_cmd.valid || conv_cmd.valid + io.busy := raw_cmd.valid || loop_conv_unroller_busy || loop_matmul_unroller_busy || reservation_station.io.busy || spad.module.io.busy || unrolled_cmd.valid || loop_cmd.valid || conv_cmd.valid io.interrupt := tlb.io.exp.interrupt - rob.io.solitary_preload := ex_controller.io.solitary_preload + reservation_station.io.solitary_preload := ex_controller.io.solitary_preload // assert(!io.interrupt, "Interrupt handlers have not been written yet") @@ -344,7 +327,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] val incr_st_ex_cycles = !load_controller.io.busy && store_controller.io.busy && ex_controller.io.busy val incr_ld_st_ex_cycles = load_controller.io.busy && store_controller.io.busy && ex_controller.io.busy - + counters.io.event_io.connectEventSignal(CounterEvent.MAIN_LD_CYCLES, incr_ld_cycles) counters.io.event_io.connectEventSignal(CounterEvent.MAIN_ST_CYCLES, incr_st_cycles) counters.io.event_io.connectEventSignal(CounterEvent.MAIN_EX_CYCLES, incr_ex_cycles) @@ -372,12 +355,10 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] */ when (is_flush) { - // val skip = compressed_cmd.bits.rs1(0) val skip = unrolled_cmd.bits.rs1(0) tlb.io.exp.flush_skip := skip tlb.io.exp.flush_retry := !skip - // compressed_cmd.ready := true.B // TODO should we wait for an acknowledgement from the TLB? unrolled_cmd.ready := true.B // TODO should we wait for an acknowledgement from the TLB? } @@ -387,9 +368,9 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] } .otherwise { - rob.io.alloc.valid := true.B + reservation_station.io.alloc.valid := true.B - when(rob.io.alloc.fire()) { + when(reservation_station.io.alloc.fire()) { // compressed_cmd.ready := true.B unrolled_cmd.ready := true.B } diff --git a/src/main/scala/gemmini/CounterFile.scala b/src/main/scala/gemmini/CounterFile.scala index 9b75acf2..9f0482f7 100644 --- a/src/main/scala/gemmini/CounterFile.scala +++ b/src/main/scala/gemmini/CounterFile.scala @@ -64,8 +64,8 @@ object CounterEvent { val IM2COL_ACTIVE_CYCLES = 39 val IM2COL_TRANSPOSER_WAIT_CYCLE = 40 - val ROB_FULL_CYCLES = 41 - val ROB_ACTIVE_CYCLES = 42 + val RESERVATION_STATION_FULL_CYCLES = 41 + val RESERVATION_STATION_ACTIVE_CYCLES = 42 val LOOP_MATMUL_ACTIVE_CYCLES = 43 val TRANSPOSE_PRELOAD_UNROLLER_ACTIVE_CYCLES = 44 @@ -76,14 +76,17 @@ object CounterEvent { object CounterExternal { val DISABLE = 0 - val ROB_LD_COUNT = 1 - val ROB_ST_COUNT = 2 - val ROB_EX_COUNT = 3 + val RESERVATION_STATION_LD_COUNT = 1 + val RESERVATION_STATION_ST_COUNT = 2 + val RESERVATION_STATION_EX_COUNT = 3 val RDMA_BYTES_REC = 4 val WDMA_BYTES_SENT = 5 - val n = 6 + val RDMA_TOTAL_LATENCY = 6 + val WDMA_TOTAL_LATENCY = 7 + + val n = 8 val EXTERNAL_WIDTH = 32 } @@ -145,6 +148,7 @@ class CounterIO(nPerfCounter: Int, counterWidth: Int) extends Bundle { val addr = Input(UInt(log2Ceil(nPerfCounter).W)) val data = Output(UInt(counterWidth.W)) val config_address = Flipped(Valid(UInt(log2Ceil(CounterEvent.n).W))) + val external = Input(Bool()) val event_io = Flipped(new CounterEventIO) } @@ -156,8 +160,9 @@ class CounterFile(nPerfCounter: Int, counterWidth: Int) extends Module { val io = IO(new CounterIO(nPerfCounter, counterWidth)) - val config_width = log2Ceil(scala.math.max(CounterEvent.n, CounterExternal.n)) + 1; + val config_width = log2Ceil(scala.math.max(CounterEvent.n, CounterExternal.n)) + 1 val counter_config = RegInit(VecInit.tabulate(nPerfCounter)(_ => 0.U(config_width.W))) + val counter_is_external = Reg(Vec(nPerfCounter, Bool())) io.event_io.external_reset := io.counter_reset withReset(reset.asBool || io.counter_reset) { @@ -170,9 +175,10 @@ class CounterFile(nPerfCounter: Int, counterWidth: Int) extends Module // local counter val take_value = (config: UInt, counter: UInt) => { // Set the width - val external = Wire(UInt(counterWidth.W)) - external := io.event_io.external_values(io.addr) - Mux(config(config_width - 1), external, counter) + val external = io.event_io.external_values(config) + val is_external = counter_is_external(io.addr) + + Mux(is_external, external, counter) } // Snapshot: In case a sequence of access instructions get interrupted (i.e. preempted by OS), it is possible // to take a snapshot when reading counter value by setting a bit in the instruction. All subsequent readings @@ -194,6 +200,7 @@ class CounterFile(nPerfCounter: Int, counterWidth: Int) extends Module // Write configuration reg when (io.config_address.valid) { counter_config(io.addr) := io.config_address.bits + counter_is_external(io.addr) := io.external counters(io.addr) := 0.U } @@ -241,6 +248,7 @@ class CounterController(nPerfCounter: Int, counterWidth: Int)(implicit p: Parame module.io.snapshot := io.in.bits.rs1(2) & io.in.fire() module.io.config_address.valid := io.in.bits.rs1(3) & io.in.fire() module.io.config_address.bits := io.in.bits.rs1(17, 12) + module.io.external := io.in.bits.rs1(31) when (io.out.fire()) { out_valid_reg := false.B diff --git a/src/main/scala/gemmini/CustomCPUConfigs.scala b/src/main/scala/gemmini/CustomCPUConfigs.scala new file mode 100644 index 00000000..01c32a44 --- /dev/null +++ b/src/main/scala/gemmini/CustomCPUConfigs.scala @@ -0,0 +1,20 @@ +/* +package chipyard + +import boom.common._ +import freechips.rocketchip.subsystem._ + +object CustomGemmminiCPUConfigs { + // Default CPU configs + type RocketBigCores = WithNBigCores + type RocketMedCores = WithNMedCores + type RocketSmallCores = WithNSmallCores + + type BoomLargeCores = WithNLargeBooms + type BoomMedCores = WithNMediumBooms + type BoomSmallCores = WithNMediumBooms + + // Specify which CPU configs you want to build here + type CustomCPU = RocketBigCores +} +*/ \ No newline at end of file diff --git a/src/main/scala/gemmini/CustomConfigs.scala b/src/main/scala/gemmini/CustomConfigs.scala new file mode 100644 index 00000000..e1ed7199 --- /dev/null +++ b/src/main/scala/gemmini/CustomConfigs.scala @@ -0,0 +1,60 @@ +package gemmini + +import chipsalliance.rocketchip.config.{Config, Parameters} +import chisel3._ +import freechips.rocketchip.diplomacy.LazyModule +import freechips.rocketchip.subsystem.SystemBusKey +import freechips.rocketchip.tile.BuildRoCC + + +object GemminiCustomConfigs { + // Default configurations + val defaultConfig = GemminiConfigs.defaultConfig + val defaultFpConfig = GemminiFPConfigs.defaultFPConfig + + // Create your own configs here + val baselineInferenceConfig = defaultConfig.copy( + has_training_convs = false, + ) + + val highPerfInferenceConfig = defaultConfig.copy( + meshRows = 32, + meshColumns = 32, + + has_training_convs = false, + + sp_capacity = CapacityInKilobytes(512), + acc_capacity = CapacityInKilobytes(128), + ) + + val trainingConfig = defaultFpConfig.copy( + inputType = Float(expWidth = 8, sigWidth = 24), + accType = Float(expWidth = 8, sigWidth = 24), + + meshRows = 8, + meshColumns = 8, + + has_training_convs = true, + has_max_pool = false, + + sp_capacity = CapacityInKilobytes(512), + acc_capacity = CapacityInKilobytes(128), + ) + + // Specify which of your custom configs you want to build here + val customConfig = baselineInferenceConfig +} + + +class GemminiCustomConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( + gemminiConfig: GemminiArrayConfig[T,U,V] = GemminiCustomConfigs.customConfig +) extends Config((site, here, up) => { + case BuildRoCC => up(BuildRoCC) ++ Seq( + (p: Parameters) => { + implicit val q = p + val gemmini = LazyModule(new Gemmini(gemminiConfig)) + gemmini + } + ) + case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) +}) diff --git a/src/main/scala/gemmini/CustomSoCConfigs.scala b/src/main/scala/gemmini/CustomSoCConfigs.scala new file mode 100644 index 00000000..aebfb520 --- /dev/null +++ b/src/main/scala/gemmini/CustomSoCConfigs.scala @@ -0,0 +1,24 @@ +/* +package chipyard + +import freechips.rocketchip.config.{Config} + +class CustomGemminiSoCConfig extends Config( + new gemmini.GemminiCustomConfig ++ + + // Set your custom L2 configs + new chipyard.config.WithL2TLBs(512) ++ + + new freechips.rocketchip.subsystem.WithInclusiveCache( + nBanks = 1, + nWays = 8, + capacityKB = 512, + outerLatencyCycles = 40 + ) ++ + + // Set the number of CPUs you want to create + new chipyard.CustomGemmminiCPUConfigs.CustomCPU(1) ++ + + new chipyard.config.AbstractConfig +) +*/ diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala index c8e61c1e..c1cb51ef 100644 --- a/src/main/scala/gemmini/DMA.scala +++ b/src/main/scala/gemmini/DMA.scala @@ -14,6 +14,9 @@ import freechips.rocketchip.rocket.constants.MemoryOpConstants import Util._ +import midas.targetutils.PerfCounter +import midas.targetutils.SynthesizePrintf + class StreamReadRequest[U <: Data](spad_rows: Int, acc_rows: Int, mvin_scale_t_bits: Int)(implicit p: Parameters) extends CoreBundle { val vaddr = UInt(coreMaxAddrBits.W) val spaddr = UInt(log2Up(spad_rows max acc_rows).W) // TODO use LocalAddr in DMA @@ -48,9 +51,9 @@ class StreamReadResponse[U <: Data](spadWidth: Int, accWidth: Int, spad_rows: In } class StreamReader[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], nXacts: Int, beatBits: Int, maxBytes: Int, spadWidth: Int, accWidth: Int, aligned_to: Int, - spad_rows: Int, acc_rows: Int, meshRows: Int, use_tlb_register_filter: Boolean) + spad_rows: Int, acc_rows: Int, meshRows: Int, use_tlb_register_filter: Boolean, use_firesim_simulation_counters: Boolean) (implicit p: Parameters) extends LazyModule { - val core = LazyModule(new StreamReaderCore(config, nXacts, beatBits, maxBytes, spadWidth, accWidth, aligned_to, spad_rows, acc_rows, meshRows, use_tlb_register_filter)) + val core = LazyModule(new StreamReaderCore(config, nXacts, beatBits, maxBytes, spadWidth, accWidth, aligned_to, spad_rows, acc_rows, meshRows, use_tlb_register_filter, use_firesim_simulation_counters)) val node = core.node lazy val module = new LazyModuleImp(this) { @@ -67,7 +70,7 @@ class StreamReader[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T val nCmds = (nXacts / meshRows) + 1 - val xactTracker = Module(new XactTracker(nXacts, maxBytes, spadWidth, accWidth, spad_rows, acc_rows, maxBytes, config.mvin_scale_t_bits, nCmds)) + val xactTracker = Module(new XactTracker(nXacts, maxBytes, spadWidth, accWidth, spad_rows, acc_rows, maxBytes, config.mvin_scale_t_bits, nCmds, use_firesim_simulation_counters)) val beatPacker = Module(new BeatMerger(beatBits, maxBytes, spadWidth, accWidth, spad_rows, acc_rows, maxBytes, aligned_to, meshRows, config.mvin_scale_t_bits, nCmds)) @@ -102,6 +105,7 @@ class StreamReader[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T io.resp.bits.last := beatPacker.io.out.bits.last io.counter.collect(core.module.io.counter) + io.counter.collect(xactTracker.io.counter) } } @@ -115,7 +119,8 @@ class StreamReadBeat (val nXacts: Int, val beatBits: Int, val maxReqBytes: Int) // TODO StreamReaderCore and StreamWriter are actually very alike. Is there some parent class they could both inherit from? class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], nXacts: Int, beatBits: Int, maxBytes: Int, spadWidth: Int, accWidth: Int, aligned_to: Int, - spad_rows: Int, acc_rows: Int, meshRows: Int, use_tlb_register_filter: Boolean) + spad_rows: Int, acc_rows: Int, meshRows: Int, use_tlb_register_filter: Boolean, + use_firesim_simulation_counters: Boolean) (implicit p: Parameters) extends LazyModule { val node = TLHelper.makeClientNode( name = "stream-reader", sourceId = IdRange(0, nXacts)) @@ -290,19 +295,32 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf state := s_req_new_block } - // Performance counter CounterEventIO.init(io.counter) io.counter.connectEventSignal(CounterEvent.RDMA_ACTIVE_CYCLE, state =/= s_idle) - val bytes_read = RegInit(0.U(CounterExternal.EXTERNAL_WIDTH.W)) - io.counter.connectExternalCounter(CounterExternal.RDMA_BYTES_REC, bytes_read) - when (io.counter.external_reset) { - bytes_read := 0.U - } .elsewhen (tl.d.fire()) { - bytes_read := bytes_read + 1.U << tl.d.bits.size - } io.counter.connectEventSignal(CounterEvent.RDMA_TLB_WAIT_CYCLES, io.tlb.resp.miss) io.counter.connectEventSignal(CounterEvent.RDMA_TL_WAIT_CYCLES, tl.a.valid && !tl.a.ready) + + // External counters + val total_bytes_read = RegInit(0.U(CounterExternal.EXTERNAL_WIDTH.W)) + when (io.counter.external_reset) { + total_bytes_read := 0.U + }.elsewhen (tl.d.fire()) { + total_bytes_read := total_bytes_read + (1.U << tl.d.bits.size) + } + + io.counter.connectExternalCounter(CounterExternal.RDMA_BYTES_REC, total_bytes_read) + + if (use_firesim_simulation_counters) { + PerfCounter(state =/= s_idle, "rdma_active_cycles", "cycles during which the read dma is active") + PerfCounter(tl.a.ready && translate_q.io.deq.valid && io.tlb.resp.miss, "rdma_tlb_wait_cycles", "cycles during which the read dma is stalling as it waits for a TLB response") + PerfCounter(tl.a.valid && !tl.a.ready, "rdma_tl_wait_cycles", "cycles during which the read dma is stalling as it waits for the TileLink port to be available") + + val cntr = Counter(500000) + when (cntr.inc()) { + printf(SynthesizePrintf("RDMA bytes rec: %d\n", total_bytes_read)) + } + } } } @@ -319,7 +337,8 @@ class StreamWriteRequest(val dataWidth: Int, val maxBytes: Int)(implicit p: Para } class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: Int, dataWidth: Int, aligned_to: Int, - inputType: T, block_cols: Int, use_tlb_register_filter: Boolean) + inputType: T, block_cols: Int, use_tlb_register_filter: Boolean, + use_firesim_simulation_counters: Boolean) (implicit p: Parameters) extends LazyModule { val node = TLHelper.makeClientNode( name = "stream-writer", sourceId = IdRange(0, nXacts)) @@ -584,14 +603,36 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: // Performance counter CounterEventIO.init(io.counter) io.counter.connectEventSignal(CounterEvent.WDMA_ACTIVE_CYCLE, state =/= s_idle) - val bytes_sent = RegInit(0.U(CounterExternal.EXTERNAL_WIDTH.W)) - io.counter.connectExternalCounter(CounterExternal.WDMA_BYTES_SENT, bytes_sent) - when (io.counter.external_reset) { - bytes_sent := 0.U - } .elsewhen (tl.d.fire()) { - bytes_sent := bytes_sent + 1.U << tl.d.bits.size - } io.counter.connectEventSignal(CounterEvent.WDMA_TLB_WAIT_CYCLES, io.tlb.resp.miss) io.counter.connectEventSignal(CounterEvent.WDMA_TL_WAIT_CYCLES, tl.a.valid && !tl.a.ready) + + // External counters + val total_bytes_sent = RegInit(0.U(CounterExternal.EXTERNAL_WIDTH.W)) + when (tl.d.fire()) { + total_bytes_sent := total_bytes_sent + (1.U << tl.d.bits.size) + } + + val total_latency = RegInit(0.U(CounterExternal.EXTERNAL_WIDTH.W)) + total_latency := total_latency + PopCount(xactBusy) + + when (io.counter.external_reset) { + total_bytes_sent := 0.U + total_latency := 0.U + } + + io.counter.connectExternalCounter(CounterExternal.WDMA_BYTES_SENT, total_bytes_sent) + io.counter.connectExternalCounter(CounterExternal.WDMA_TOTAL_LATENCY, total_latency) + + if (use_firesim_simulation_counters) { + PerfCounter(state =/= s_idle, "wdma_active_cycles", "cycles during which write read dma is active") + PerfCounter(tl.a.ready && translate_q.io.deq.valid && io.tlb.resp.miss, "wdma_tlb_wait_cycles", "cycles during which the write dma is stalling as it waits for a TLB response") + PerfCounter(tl.a.valid && !tl.a.ready, "wdma_tl_wait_cycles", "cycles during which the write dma is stalling as it waits for the TileLink port to be available") + + val cntr = Counter(500000) + when(cntr.inc()) { + printf(SynthesizePrintf("WDMA bytes sent: %d\n", total_bytes_sent)) + printf(SynthesizePrintf("WDMA total latency: %d\n", total_latency)) + } + } } } diff --git a/src/main/scala/gemmini/DSEConfigs.scala b/src/main/scala/gemmini/DSEConfigs.scala index 1c9f3b2f..37fc70f4 100644 --- a/src/main/scala/gemmini/DSEConfigs.scala +++ b/src/main/scala/gemmini/DSEConfigs.scala @@ -21,30 +21,29 @@ object DSEBaseConfig { ld_queue_length = 4, st_queue_length = 2, ex_queue_length = 8, - rob_full_entries = 8, - rob_partial_entries = 1, + reservation_station_full_entries = 8, + reservation_station_partial_entries = 1, sp_banks = 4, // TODO support one-bank designs acc_banks = 1, acc_singleported = false, - num_acc_sub_banks = -1, + acc_sub_banks = -1, sp_capacity = CapacityInKilobytes(64), sp_singleported = false, shifter_banks = 1, // TODO add separate parameters for left and up shifter banks dataflow = Dataflow.OS, acc_capacity = CapacityInKilobytes(16), - mem_pipeline = 1, + spad_read_delay = 1, dma_maxbytes = 128, // TODO get this from cacheblockbytes dma_buswidth = 128, // TODO get this from SystemBusKey aligned_to = 16, - hasIm2col = false, inputType = SInt(8.W), - outputType = SInt(19.W), + spatialArrayOutputType = SInt(19.W), accType = SInt(32.W), mvin_scale_args = None, mvin_scale_acc_args = None, mvin_scale_shared = false, - acc_scale_args = ScaleArguments( + acc_scale_args = Some(ScaleArguments( (t: SInt, u: UInt) => { // The equation we use can be found here: https://riscv.github.io/documents/riscv-v-spec/#_vector_fixed_point_rounding_mode_register_vxrm @@ -56,7 +55,7 @@ object DSEBaseConfig { val r = (point_five & (zeros | ones_digit)).asBool() (t >> u).asSInt() + Mux(r, 1.S, 0.S) - }, 0, UInt(8.W), -1), + }, 0, UInt(8.W), -1)), acc_read_full_width = true, acc_read_small_width = true, use_dedicated_tl_port = false, @@ -71,10 +70,14 @@ object DSEBaseConfig { tlb_size = 4, use_tlb_register_filter = true, - max_in_flight_reqs = 16, + max_in_flight_mem_reqs = 16, mesh_output_delay = 1, + has_training_convs = false, + has_max_pool = true, + has_nonlinear_activations = true, + num_counter = 8, ) } @@ -84,9 +87,9 @@ object DSEConfigs{ val baseConfig = base.copy(headerFileName = "gemmini_params_dse1.h") val wsOnlyConfig = baseConfig.copy(dataflow = Dataflow.WS, headerFileName = "gemmini_params_dse2.h") val bothDataflowsConfig = baseConfig.copy(dataflow = Dataflow.BOTH, headerFileName = "gemmini_params_dse3.h") - val highBitwidthConfig = baseConfig.copy(inputType = SInt(32.W), outputType = SInt(32.W), + val highBitwidthConfig = baseConfig.copy(inputType = SInt(32.W), spatialArrayOutputType = SInt(32.W), headerFileName = "gemmini_params_dse4.h") - val largerDimConfig = baseConfig.copy(meshRows = 32, meshColumns = 32, outputType = SInt(20.W), + val largerDimConfig = baseConfig.copy(meshRows = 32, meshColumns = 32, spatialArrayOutputType = SInt(20.W), headerFileName = "gemmini_params_dse5.h") val fullyCombinationalConfig = baseConfig.copy(tileRows = 16, tileColumns = 16, meshRows = 1, meshColumns = 1, headerFileName = "gemmini_params_dse6.h") @@ -97,7 +100,7 @@ object DSEConfigs{ val pnr16Config = baseConfig.copy(sp_capacity = CapacityInKilobytes(256), acc_capacity = CapacityInKilobytes(64), dataflow = Dataflow.BOTH, headerFileName = "gemmini_params_pnr16.h") val pnr32Config = baseConfig.copy(sp_capacity = CapacityInKilobytes(512), acc_capacity = CapacityInKilobytes(128), - meshRows = 32, meshColumns = 32, outputType = SInt(20.W), dataflow = Dataflow.BOTH, + meshRows = 32, meshColumns = 32, spatialArrayOutputType = SInt(20.W), dataflow = Dataflow.BOTH, headerFileName = "gemmini_params_pnr32.h") } diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala index db9a894e..9d1cf094 100644 --- a/src/main/scala/gemmini/ExecuteController.scala +++ b/src/main/scala/gemmini/ExecuteController.scala @@ -6,6 +6,7 @@ import chisel3.util._ import GemminiISA._ import Util._ import freechips.rocketchip.config.Parameters +import midas.targetutils.PerfCounter // TODO do we still need to flush when the dataflow is weight stationary? Won't the result just keep travelling through on its own? class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: Int, config: GemminiArrayConfig[T, U, V]) @@ -28,7 +29,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In val acc = new Bundle { val read_req = Vec(acc_banks, Decoupled(new AccumulatorReadReq( - acc_bank_entries, log2Up(accType.getWidth), acc_scale_args.multiplicand_t + acc_bank_entries, log2Up(accType.getWidth), acc_scale_t ))) val read_resp = Flipped(Vec(acc_banks, Decoupled(new AccumulatorScaleResp( @@ -116,9 +117,9 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In val im2col_turn = WireInit(0.U(9.W)) val in_shift = Reg(UInt(log2Up(accType.getWidth).W)) - val acc_scale = Reg(acc_scale_args.multiplicand_t) + val acc_scale = Reg(acc_scale_t) val relu6_shift = Reg(UInt(log2Up(accType.getWidth).W)) - val activation = Reg(UInt(2.W)) // TODO magic number + val activation = if (has_nonlinear_activations) Reg(UInt(2.W)) else Activation.NONE // TODO magic number val a_transpose = Reg(Bool()) val bd_transpose = Reg(Bool()) val config_initialized = RegInit(false.B) @@ -136,7 +137,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In "Too many inputs are being fed into the single transposer we have") //fix by input - val im2col_en = hasIm2col.B && weight_stride =/= 0.U + val im2col_en = config.hasIm2Col.B && weight_stride =/= 0.U // SRAM addresses of matmul operands val a_address_rs1 = rs1s(a_address_place).asTypeOf(local_addr_t) @@ -178,7 +179,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In val pending_completed_rob_ids = Reg(Vec(2, UDValid(UInt(log2Up(rob_entries).W)))) // Instantiate a queue which queues up signals which must be fed into the mesh - val mesh_cntl_signals_q = Module(new Queue(new ComputeCntlSignals, mem_pipeline+1, + val mesh_cntl_signals_q = Module(new Queue(new ComputeCntlSignals, spad_read_delay+1, pipe=true)) val cntl_ready = mesh_cntl_signals_q.io.enq.ready @@ -186,7 +187,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In val cntl = mesh_cntl_signals_q.io.deq.bits // Instantiate the actual mesh - val mesh = Module(new MeshWithDelays(inputType, outputType, accType, mesh_tag, dataflow, pe_latency, mesh_output_delay, + val mesh = Module(new MeshWithDelays(inputType, spatialArrayOutputType, accType, mesh_tag, dataflow, pe_latency, mesh_output_delay, tileRows, tileColumns, meshRows, meshColumns, shifter_banks, shifter_banks)) mesh.io.a.valid := false.B @@ -547,9 +548,11 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In val set_only_strides = config_ex_rs1.set_only_strides when (!set_only_strides) { - activation := config_ex_rs1.activation + if (has_nonlinear_activations) { + activation := config_ex_rs1.activation + } in_shift := config_ex_rs2.in_shift - acc_scale := rs1s(0)(xLen - 1, 32).asTypeOf(acc_scale_args.multiplicand_t) // TODO magic number + acc_scale := rs1s(0)(xLen - 1, 32).asTypeOf(acc_scale_t) // TODO magic number relu6_shift := config_ex_rs2.relu6_shift a_transpose := config_ex_rs1.a_transpose bd_transpose := config_ex_rs1.b_transpose @@ -1025,4 +1028,14 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In !(!cntl.b_fire || mesh.io.b.fire() || !mesh.io.b.ready) && !cntl.b_read_from_acc) io.counter.connectEventSignal(CounterEvent.SCRATCHPAD_D_WAIT_CYCLE, !(!cntl.d_fire || mesh.io.d.fire() || !mesh.io.d.ready) && !cntl.d_read_from_acc) + + if (use_firesim_simulation_counters) { + val ex_flush_cycle = control_state === flushing || control_state === flush + val ex_preload_haz_cycle = cmd.valid(0) && DoPreloads(0) && cmd.valid(1) && raw_hazard_pre + val ex_mulpre_haz_cycle = cmd.valid(0) && DoPreloads(1) && cmd.valid(1) && DoComputes(0) && cmd.valid(2) && raw_hazard_mulpre + + PerfCounter(ex_flush_cycle, "ex_flush_cycle", "cycles during which the ex controller is flushing the spatial array") + PerfCounter(ex_preload_haz_cycle, "ex_preload_haz_cycle", "cycles during which the execute controller is stalling preloads due to hazards") + PerfCounter(ex_mulpre_haz_cycle, "ex_mulpre_haz_cycle", "cycles during which the execute controller is stalling matmuls due to hazards") + } } diff --git a/src/main/scala/gemmini/FrontendTLB.scala b/src/main/scala/gemmini/FrontendTLB.scala index f5643c92..50c393b5 100644 --- a/src/main/scala/gemmini/FrontendTLB.scala +++ b/src/main/scala/gemmini/FrontendTLB.scala @@ -7,10 +7,11 @@ import freechips.rocketchip.config.Parameters import freechips.rocketchip.rocket._ import freechips.rocketchip.tile.{CoreBundle, CoreModule} import freechips.rocketchip.tilelink.TLEdgeOut -import freechips.rocketchip.util.InOrderArbiter import Util._ +import midas.targetutils.PerfCounter + class DecoupledTLBReq(val lgMaxSize: Int)(implicit p: Parameters) extends CoreBundle { val tlb_req = new TLBReq(lgMaxSize) val status = new MStatus @@ -25,7 +26,7 @@ class TLBExceptionIO extends Bundle { } // TODO can we make TLB hits only take one cycle? -class DecoupledTLB(entries: Int, maxSize: Int)(implicit edge: TLEdgeOut, p: Parameters) +class DecoupledTLB(entries: Int, maxSize: Int, use_firesim_simulation_counters: Boolean)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule { val lgMaxSize = log2Ceil(maxSize) @@ -68,6 +69,12 @@ class DecoupledTLB(entries: Int, maxSize: Int)(implicit edge: TLEdgeOut, p: Para io.counter.connectEventSignal(CounterEvent.DMA_TLB_HIT_REQ, RegNext(io.req.fire()) && !tlb.io.resp.miss) io.counter.connectEventSignal(CounterEvent.DMA_TLB_TOTAL_REQ, io.req.fire()) io.counter.connectEventSignal(CounterEvent.DMA_TLB_MISS_CYCLE, tlb.io.resp.miss) + + if (use_firesim_simulation_counters) { + PerfCounter(RegNext(io.req.fire()) && !tlb.io.resp.miss, "tlb_hits", "total number of tlb hits") + PerfCounter(io.req.fire(), "tlb_reqs", "total number of tlb reqs") + PerfCounter(tlb.io.resp.miss, "tlb_miss_cycles", "total number of cycles where the tlb is resolving a miss") + } } class FrontendTLBIO(implicit p: Parameters) extends CoreBundle { @@ -77,7 +84,7 @@ class FrontendTLBIO(implicit p: Parameters) extends CoreBundle { val resp = Flipped(new TLBResp) } -class FrontendTLB(nClients: Int, entries: Int, maxSize: Int) +class FrontendTLB(nClients: Int, entries: Int, maxSize: Int, use_tlb_register_filter: Boolean, use_firesim_simulation_counters: Boolean) (implicit edge: TLEdgeOut, p: Parameters) extends CoreModule { val io = IO(new Bundle { val clients = Flipped(Vec(nClients, new FrontendTLBIO)) @@ -88,7 +95,7 @@ class FrontendTLB(nClients: Int, entries: Int, maxSize: Int) val lgMaxSize = log2Ceil(coreDataBytes) val tlbArb = Module(new RRArbiter(new DecoupledTLBReq(lgMaxSize), nClients)) - val tlb = Module(new DecoupledTLB(entries, maxSize)) + val tlb = Module(new DecoupledTLB(entries, maxSize, use_firesim_simulation_counters)) tlb.io.req.valid := tlbArb.io.out.valid tlb.io.req.bits := tlbArb.io.out.bits tlbArb.io.out.ready := true.B @@ -123,6 +130,11 @@ class FrontendTLB(nClients: Int, entries: Int, maxSize: Int) } .otherwise { client.resp := tlb.io.resp } + + // If we're not using the TLB filter register, then we set this value to always be false + if (!use_tlb_register_filter) { + last_translated_valid := false.B + } } io.counter.collect(tlb.io.counter) diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala index 3067f00d..45c481ce 100644 --- a/src/main/scala/gemmini/GemminiConfigs.scala +++ b/src/main/scala/gemmini/GemminiConfigs.scala @@ -15,58 +15,74 @@ case class ScaleArguments[T <: Data, U <: Data](scale_func: (T, U) => T, latency identity: String="0", c_str: String="ROUNDING_RIGHT_SHIFT(x, scale)") case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( - opcodes: OpcodeSet, - tileRows: Int, - tileColumns: Int, - meshRows: Int, - meshColumns: Int, - ld_queue_length: Int, - st_queue_length: Int, - ex_queue_length: Int, - rob_full_entries: Int, - rob_partial_entries: Int, - sp_banks: Int, // TODO support one-bank designs - sp_singleported: Boolean, - sp_capacity: GemminiMemCapacity, - acc_banks: Int, - acc_singleported: Boolean, - num_acc_sub_banks: Int, - acc_capacity: GemminiMemCapacity, - shifter_banks: Int, - dataflow: Dataflow.Value, - mem_pipeline: Int, - dma_maxbytes: Int, - dma_buswidth: Int, - aligned_to: Int, // TODO we should align to inputType and accType instead inputType: T, - outputType: T, + spatialArrayOutputType: T, accType: T, - mvin_scale_args: Option[ScaleArguments[T, U]], - mvin_scale_acc_args: Option[ScaleArguments[T, U]], - mvin_scale_shared: Boolean, - acc_scale_args: ScaleArguments[T, V], - hasIm2col: Boolean, - pe_latency: Int, - acc_read_full_width: Boolean, - acc_read_small_width: Boolean, - use_dedicated_tl_port: Boolean, - // enable_a_transpose: Boolean, - // enable_b_transpose: Boolean, - tlb_size: Int, - use_tlb_register_filter: Boolean, - max_in_flight_reqs: Int, + opcodes: OpcodeSet = OpcodeSet.custom3, - ex_read_from_spad: Boolean, - ex_read_from_acc: Boolean, - ex_write_to_spad: Boolean, - ex_write_to_acc: Boolean, + dataflow: Dataflow.Value = Dataflow.BOTH, - hardcode_d_to_garbage_addr: Boolean, + tileRows: Int = 1, + tileColumns: Int = 1, + meshRows: Int = 16, + meshColumns: Int = 16, - mesh_output_delay: Int, + ld_queue_length: Int = 8, + st_queue_length: Int = 2, + ex_queue_length: Int = 8, - num_counter: Int, + reservation_station_full_entries: Int = 16, + reservation_station_partial_entries: Int = 8, + + sp_banks: Int = 4, // TODO support one-bank designs + sp_singleported: Boolean = false, + sp_capacity: GemminiMemCapacity = CapacityInKilobytes(256), + spad_read_delay: Int = 4, + + acc_banks: Int = 2, + acc_singleported: Boolean = false, + acc_sub_banks: Int = -1, + acc_capacity: GemminiMemCapacity = CapacityInKilobytes(64), + + dma_maxbytes: Int = 64, // TODO get this from cacheblockbytes + dma_buswidth: Int = 128, // TODO get this from SystemBusKey + + shifter_banks: Int = 1, // TODO add separate parameters for left and up shifter banks + + aligned_to: Int = 1, // TODO we should align to inputType and accType instead + + mvin_scale_args: Option[ScaleArguments[T, U]] = None, + mvin_scale_acc_args: Option[ScaleArguments[T, U]] = None, + mvin_scale_shared: Boolean = false, + acc_scale_args: Option[ScaleArguments[T, V]] = None, + + pe_latency: Int = 0, + + acc_read_full_width: Boolean = true, + acc_read_small_width: Boolean = true, + use_dedicated_tl_port: Boolean = true, + + tlb_size: Int = 4, + use_tlb_register_filter: Boolean = true, + max_in_flight_mem_reqs: Int = 16, + + ex_read_from_spad: Boolean = true, + ex_read_from_acc: Boolean = true, + ex_write_to_spad: Boolean = true, + ex_write_to_acc: Boolean = true, + + hardcode_d_to_garbage_addr: Boolean = false, + + mesh_output_delay: Int = 1, + + num_counter: Int = 8, + + has_training_convs: Boolean = true, + has_max_pool: Boolean = true, + has_nonlinear_activations: Boolean = true, + + use_firesim_simulation_counters: Boolean = false, headerFileName: String = "gemmini_params.h" ) { @@ -79,7 +95,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( case CapacityInKilobytes(kb) => kb * 1024 * 8 / (acc_banks * meshColumns * tileColumns * accType.getWidth) case CapacityInMatrices(ms) => ms * meshRows * tileRows / acc_banks } - require (!acc_singleported || (num_acc_sub_banks <= 4 && isPow2(num_acc_sub_banks))) + require (!acc_singleported || (acc_sub_banks <= 4 && isPow2(acc_sub_banks))) val local_addr_t = new LocalAddr(sp_banks, sp_bank_entries, acc_banks, acc_bank_entries) @@ -93,13 +109,44 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( case None => Bool() // TODO replace this with UInt(0.W) } - val acc_scale_t = acc_scale_args.multiplicand_t - val mvin_scale_t_bits = mvin_scale_t.getWidth max mvin_scale_acc_t.getWidth val mvin_scale_same = (mvin_scale_args.isEmpty && mvin_scale_acc_args.isEmpty) || mvin_scale_shared + // If the user doesn't specify an "acc_scale_args", then for now, we will still say in the header file that + // acc_scale_t is Float32. TODO: don't put an acc_scale_t in the header file at all if the user doesn't specify one + val acc_scale_t = acc_scale_args match { + case Some(args) => args.multiplicand_t + case None => Float(8, 24) + } + val acc_scale_t_bits = acc_scale_t.getWidth + val acc_scale_identity = acc_scale_args match { + case Some(args) => args.identity + case None => "0" + } + + val acc_scale_c_str = acc_scale_args match { + case Some(args) => args.c_str + case None => "(x)" + } + + val acc_scale_func = acc_scale_args match { + case Some(args) => args.scale_func + case None => (t: T, _: V) => t + } + + val acc_scale_num_units = acc_scale_args match { + case Some(args) => args.num_scale_units + case None => -1 + } + + val acc_scale_latency = acc_scale_args match { + case Some(args) => args.latency + case None => 1 + } + assert(acc_scale_latency > 0) + val mvin_cols_bits = log2Up(((dma_maxbytes / (inputType.getWidth / 8)) max (meshColumns * tileColumns)) + 1) val mvin_rows_bits = log2Up(meshRows * tileRows + 1) val mvout_cols_bits = log2Up(((dma_maxbytes / (inputType.getWidth / 8)) max (meshColumns * tileColumns)) + 1) @@ -108,6 +155,8 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( val load_states = 3 val block_stride_bits = 16 + val hasIm2Col = false + //========================================================================== // sanity check mesh size //========================================================================== @@ -123,7 +172,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( //========================================================================== // cisc-gemmini miscellaneous constants (some redundant with above) //========================================================================== - val rob_entries = rob_full_entries + rob_partial_entries + val rob_entries = reservation_station_full_entries + reservation_station_partial_entries val ROB_ENTRIES = rob_entries val LOG2_ROB_ENTRIES = log2Up(rob_entries) @@ -206,7 +255,6 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( assert(dataType.getWidth <= 32) // Above 32 bits, we need to append UL to the number, which isn't done yet dataType match { - case dt: UInt => ("0", BigInt(2).pow(dt.getWidth).-(1).toString) case dt: SInt => ("-" + BigInt(2).pow(dt.getWidth - 1).toString, BigInt(2).pow(dt.getWidth - 1).-(1).toString) case dt: Float => (dt.expWidth, dt.sigWidth) match { @@ -214,13 +262,13 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( case (11, 53) => (scala.Double.MinValue.toString, scala.Double.MaxValue.toString) case _ => (((Range(-1,-(dt.sigWidth),-1).map(-Math.pow(2, _)).foldLeft(-1.0)(_ + _)) * Math.pow(2, Math.pow(2, dt.expWidth - 1) - 1)).toString, ((Range(-1,-(dt.sigWidth),-1).map(Math.pow(2, _)).foldLeft(1.0)(_ + _)) * Math.pow(2, Math.pow(2, dt.expWidth - 1) - 1)).toString) } - case _ => throw new IllegalArgumentException(s"Data type $dataType is unknown") + case dt => ("0", BigInt(2).pow(dt.getWidth).-(1).toString) + // case _ => throw new IllegalArgumentException(s"Data type $dataType is unknown") } } def c_type(dataType: Data): String = { dataType match { - case dt: UInt => s"uint${dt.getWidth}_t" case dt: SInt => s"int${dt.getWidth}_t" case dt: Float => (dt.expWidth, dt.sigWidth) match { @@ -228,16 +276,17 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( case (11, 53) => "double" case _ => s"uint" + (Math.pow(2, Math.ceil(Math.log(dt.expWidth + dt.sigWidth)/Math.log(2.0)))).toInt.toString + s"_t" } - case _ => throw new IllegalArgumentException(s"Data type $dataType is unknown") + case dt => s"uint${dt.getWidth}_t" } } def full_c_type(dataType: Data): String = { dataType match { - case dt: UInt => "uint64_t" - case dt: SInt => "int64_t" - case dt: Float => "double" - case _ => throw new IllegalArgumentException(s"Data type $dataType is unknown") + case _: UInt => "uint64_t" + case _: SInt => "int64_t" + case _: Float => "double" + case _ => "uint64_t" + // case _ => throw new IllegalArgumentException(s"Data type $dataType is unknown") } } @@ -246,7 +295,6 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( // assert(Set(8, 16, 32, 64).contains(outputType.getWidth)) assert(Set(8, 16, 32, 64).contains(accType.getWidth)) - val header = new StringBuilder() header ++= s"#ifndef $guard\n" header ++= s"#define $guard\n\n" @@ -311,7 +359,6 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( header ++= "#define HAS_MVIN_SCALE\n" header ++= s"typedef ${c_type(mvin_scale_args.get.multiplicand_t)} scale_t;\n" header ++= s"typedef ${c_type(UInt(mvin_scale_args.get.multiplicand_t.getWidth.W))} scale_t_bits;\n\n" - } else { header ++= s"typedef int32_t scale_t;\n" header ++= s"typedef uint32_t scale_t_bits;\n\n" @@ -321,14 +368,13 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( header ++= "#define HAS_MVIN_ACC_SCALE\n" header ++= s"typedef ${c_type(mvin_scale_acc_args.get.multiplicand_t)} scale_acc_t;\n" header ++= s"typedef ${c_type(UInt(mvin_scale_acc_args.get.multiplicand_t.getWidth.W))} scale_acc_t_bits;\n\n" - } else { header ++= s"typedef int32_t scale_acc_t;\n" header ++= s"typedef uint32_t scale_acc_t_bits;\n\n" } - header ++= s"typedef ${c_type(acc_scale_args.multiplicand_t)} acc_scale_t;\n" - header ++= s"typedef ${c_type(UInt(acc_scale_args.multiplicand_t.getWidth.W))} acc_scale_t_bits;\n\n" + header ++= s"typedef ${c_type(acc_scale_t)} acc_scale_t;\n" + header ++= s"typedef ${c_type(UInt(acc_scale_t_bits.W))} acc_scale_t_bits;\n\n" header ++= s"#define row_align(blocks) __attribute__((aligned(blocks*DIM*sizeof(elem_t))))\n" header ++= s"#define row_align_acc(blocks) __attribute__((aligned(blocks*DIM*sizeof(acc_t))))\n\n" @@ -338,7 +384,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( case None => "0" } header ++= s"#define MVIN_SCALE_IDENTITY $mvin_scale_identity\n\n" - header ++= s"#define ACC_SCALE_IDENTITY ${acc_scale_args.identity}\n\n" + header ++= s"#define ACC_SCALE_IDENTITY ${acc_scale_identity}\n\n" if (inputType.isInstanceOf[Float]) { header ++= """#define ROUNDING_RIGHT_SHIFT(x, shift) \ @@ -380,7 +426,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( header ++= """#define ACC_SCALE(x, scale) \ """ - header ++= s" ${acc_scale_args.c_str}" + header ++= s" ${acc_scale_c_str}" header ++= "\n\n" if (mvin_scale_args.isDefined) { @@ -388,6 +434,10 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( s"""#define MVIN_SCALE(x, scale) \\ ${mvin_scale_args.get.c_str}""" header ++= "\n\n" + } else { + header ++= + s"""#define MVIN_SCALE(x, scale) (x)""" + header ++= "\n\n" } if (mvin_scale_acc_args.isDefined) { @@ -395,12 +445,16 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( s"""#define MVIN_SCALE_ACC(x, scale) \\ ${mvin_scale_acc_args.get.c_str}""" header ++= "\n\n" + } else { + header ++= + s"""#define MVIN_SCALE_ACC(x, scale) (x)""" + header ++= "\n\n" } - if (acc_scale_args.multiplicand_t.isInstanceOf[Float]) { + if (acc_scale_t.isInstanceOf[Float]) { header ++= "#define ACC_SCALE_T_IS_FLOAT\n" - header ++= s"#define ACC_SCALE_EXP_BITS ${acc_scale_args.multiplicand_t.asInstanceOf[Float].expWidth}\n" - header ++= s"#define ACC_SCALE_SIG_BITS ${acc_scale_args.multiplicand_t.asInstanceOf[Float].sigWidth}\n\n" + header ++= s"#define ACC_SCALE_EXP_BITS ${acc_scale_t.asInstanceOf[Float].expWidth}\n" + header ++= s"#define ACC_SCALE_SIG_BITS ${acc_scale_t.asInstanceOf[Float].sigWidth}\n\n" } if (acc_read_small_width) @@ -436,5 +490,4 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( s"$default_directory/$headerFileName" } } - } diff --git a/src/main/scala/gemmini/Im2Col.scala b/src/main/scala/gemmini/Im2Col.scala index 5088712c..2c7f8cbf 100644 --- a/src/main/scala/gemmini/Im2Col.scala +++ b/src/main/scala/gemmini/Im2Col.scala @@ -135,7 +135,7 @@ class Im2Col[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V val im2col_en_d = RegNext(im2col_en) - val sram_read_signals_q = Module(new Queue(new im2colRowSignals, mem_pipeline+1, + val sram_read_signals_q = Module(new Queue(new im2colRowSignals, spad_read_delay+1, pipe=true)) io.sram_reads.foreach { sr => @@ -444,7 +444,7 @@ class Im2Col[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V sram_read_signals_q.io.enq.bits.sram_bank := im2col_spad_bank sram_read_signals_q.io.deq.ready := true.B//sram_resp_valid - if(!hasIm2col){ //to default values + if(!config.hasIm2Col){ //to default values io.resp.valid := false.B io.req.ready := true.B io.sram_reads.foreach(_.req.valid := false.B) diff --git a/src/main/scala/gemmini/LoadController.scala b/src/main/scala/gemmini/LoadController.scala index 1c8b0ced..89f7be7c 100644 --- a/src/main/scala/gemmini/LoadController.scala +++ b/src/main/scala/gemmini/LoadController.scala @@ -6,6 +6,7 @@ import chisel3.util._ import GemminiISA._ import Util._ import freechips.rocketchip.config.Parameters +import midas.targetutils.PerfCounter // TODO we need to check for WAW errors here // TODO deal with errors when reading scratchpad responses @@ -76,7 +77,7 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig cmd.ready := false.B // Command tracker instantiation - val nCmds = (max_in_flight_reqs / block_rows) + 1 + val nCmds = (max_in_flight_mem_reqs / block_rows) + 1 val deps_t = new Bundle { val rob_id = UInt(log2Up(rob_entries).W) @@ -134,7 +135,6 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig switch (control_state) { is (waiting_for_command) { when (cmd.valid) { - // when(DoConfig && !cmd_tracker.io.cmd_completed.valid) { when(DoConfig) { stride := config_stride scale := config_scale @@ -170,4 +170,11 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig io.counter.connectEventSignal(CounterEvent.LOAD_ACTIVE_CYCLE, control_state === sending_rows) io.counter.connectEventSignal(CounterEvent.LOAD_DMA_WAIT_CYCLE, control_state === waiting_for_dma_req_ready) io.counter.connectEventSignal(CounterEvent.LOAD_SCRATCHPAD_WAIT_CYCLE, io.dma.req.valid && !io.dma.req.ready) + + if (use_firesim_simulation_counters) { + PerfCounter(io.dma.req.valid && !io.dma.req.ready, "load_dma_wait_cycle", "cycles during which load controller is waiting for DMA to be available") + } + + // Assertions + assert(!(cmd_tracker.io.alloc.fire() && cmd_tracker.io.alloc.bits.bytes_to_read === 0.U), "A single mvin instruction must load more than 0 bytes") } diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala index 749f00fe..47cd5a39 100644 --- a/src/main/scala/gemmini/LoopConv.scala +++ b/src/main/scala/gemmini/LoopConv.scala @@ -274,8 +274,9 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw req.trans_input_3120 -> (req.dram_addr +& (((ich * in_dim * in_dim +& irow*in_dim +& icol) * batches +& b) * (input_w/8).U).asUInt()) )) val spad_addr = Mux(req.trans_input_3120, - req.addr_start.zext() +& (b / block_size.S) * input_spad_stride +& ich * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample), - req.addr_start.zext() +& (ich / block_size.S) * input_spad_stride +& b * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample)) + // To prevent Verilator errors, we replace some "/ block_size.U" calls here with ">> log2Up(block_size)" + req.addr_start.zext() +& (b >> log2Up(block_size)) * input_spad_stride +& ich * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample), + req.addr_start.zext() +& (ich >> log2Up(block_size)) * input_spad_stride +& b * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample)) // Sizes val block_size_downsampled = (block_size.U << req.downsample).asUInt().zext() @@ -1134,7 +1135,8 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int, config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2, mvout_rs2_t: MvoutRs2, config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs, - compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs) + compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, + has_training_convs: Boolean, has_max_pool: Boolean) (implicit p: Parameters) extends Module { val large_iterator_bitwidth = 16 val small_iterator_bitwidth = 16 // 8 @@ -1239,9 +1241,9 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I is (LOOP_CONV_WS_CONFIG_2) { loop_being_configured.outer_bounds.kernel_dim := cmd.bits.rs1(63, 48) - loop_being_configured.outer_bounds.pool_size := cmd.bits.rs1(47, 32) - loop_being_configured.outer_bounds.pool_stride := cmd.bits.rs1(31, 16) - loop_being_configured.outer_bounds.pool_padding := cmd.bits.rs1(15, 0) + loop_being_configured.outer_bounds.pool_size := (if (!has_max_pool) 1.U else cmd.bits.rs1(47, 32)) + loop_being_configured.outer_bounds.pool_stride := (if (!has_max_pool) 1.U else cmd.bits.rs1(31, 16)) + loop_being_configured.outer_bounds.pool_padding := (if (!has_max_pool) 0.U else cmd.bits.rs1(15, 0)) loop_being_configured.inner_bounds.batches := cmd.bits.rs2(63, 48) loop_being_configured.inner_bounds.porows := cmd.bits.rs2(47, 32) @@ -1285,17 +1287,19 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I is (LOOP_CONV_WS) { loop_being_configured.no_bias := cmd.bits.rs1(0) - loop_being_configured.wrot180 := cmd.bits.rs1(1) - loop_being_configured.trans_output_1203 := cmd.bits.rs1(2) - loop_being_configured.trans_weight_1203 := cmd.bits.rs1(3) - loop_being_configured.trans_weight_0132 := cmd.bits.rs1(4) - loop_being_configured.trans_input_3120 := cmd.bits.rs1(5) - loop_being_configured.no_pool := cmd.bits.rs2(0) - loop_being_configured.downsample := cmd.bits.rs2(1) - loop_being_configured.input_dilated := cmd.bits.rs2(2) + loop_being_configured.wrot180 := has_training_convs.B && cmd.bits.rs1(1) + loop_being_configured.input_dilated := has_training_convs.B && cmd.bits.rs2(2) + loop_being_configured.trans_output_1203 := has_training_convs.B && cmd.bits.rs1(2) + loop_being_configured.trans_weight_1203 := has_training_convs.B && cmd.bits.rs1(3) + loop_being_configured.trans_weight_0132 := has_training_convs.B && cmd.bits.rs1(4) + loop_being_configured.trans_input_3120 := has_training_convs.B && cmd.bits.rs1(5) + + loop_being_configured.no_pool := !has_max_pool.B || cmd.bits.rs2(0) loop_being_configured.activation := cmd.bits.rs2(4,3) + loop_being_configured.downsample := cmd.bits.rs2(1) + loop_being_configured.configured := true.B // assert(!loop_being_configured.input_dilated || loop_being_configured.outer_bounds.stride === 1.U) @@ -1460,12 +1464,14 @@ object LoopConv { max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int, config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2, mvout_rs2_t: MvoutRs2, config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs, - compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs) + compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, has_training_convs: Boolean, has_max_pool: Boolean) (implicit p: Parameters): Tuple2[DecoupledIO[RoCCCommand], Bool] = { + val mod = Module(new LoopConv(block_size, coreMaxAddrBits, rob_size, max_lds, max_exs, max_sts, max_addr, max_acc_addr, input_w, acc_w, dma_max_bytes, config_mvin_rs1_t, mvin_rs2_t, config_mvout_rs2_t, mvout_rs2_t, config_ex_rs1_t, preload_rs1_t, preload_rs2_t, - compute_rs1_t, compute_rs2_t)) + compute_rs1_t, compute_rs2_t, has_training_convs, has_max_pool)) + mod.io.in <> in mod.io.ld_utilization := ld_utilization mod.io.st_utilization := st_utilization diff --git a/src/main/scala/gemmini/ROB.scala b/src/main/scala/gemmini/ReservationStation.scala similarity index 87% rename from src/main/scala/gemmini/ROB.scala rename to src/main/scala/gemmini/ReservationStation.scala index 4ee23f6a..929685f6 100644 --- a/src/main/scala/gemmini/ROB.scala +++ b/src/main/scala/gemmini/ReservationStation.scala @@ -8,8 +8,12 @@ import freechips.rocketchip.util.PlusArg import GemminiISA._ import Util._ +import midas.targetutils.PerfCounter +import midas.targetutils.SynthesizePrintf + + // TODO unify this class with GemminiCmdWithDeps -class ROBIssue[T <: Data](cmd_t: T, rob_entries: Int) extends Bundle { +class ReservationStationIssue[T <: Data](cmd_t: T, rob_entries: Int) extends Bundle { val valid = Output(Bool()) val ready = Input(Bool()) val cmd = Output(cmd_t.cloneType) @@ -17,11 +21,11 @@ class ROBIssue[T <: Data](cmd_t: T, rob_entries: Int) extends Bundle { def fire(dummy: Int=0) = valid && ready - override def cloneType: this.type = new ROBIssue(cmd_t, rob_entries).asInstanceOf[this.type] + override def cloneType: this.type = new ReservationStationIssue(cmd_t, rob_entries).asInstanceOf[this.type] } // TODO we don't need to store the full command in here. We should be able to release the command directly into the relevant controller and only store the associated metadata in the ROB. This would reduce the size considerably -class ROB[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], cmd_t: RoCCCommand) extends Module { +class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], cmd_t: RoCCCommand) extends Module { import config._ val block_rows = tileRows * meshRows @@ -33,9 +37,9 @@ class ROB[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConf val completed = Flipped(Valid(UInt(log2Up(rob_entries).W))) val issue = new Bundle { - val ld = new ROBIssue(cmd_t, rob_entries) - val st = new ROBIssue(cmd_t, rob_entries) - val ex = new ROBIssue(cmd_t, rob_entries) + val ld = new ReservationStationIssue(cmd_t, rob_entries) + val st = new ReservationStationIssue(cmd_t, rob_entries) + val ex = new ReservationStationIssue(cmd_t, rob_entries) } val ld_utilization = Output(UInt(log2Up(rob_entries+1).W)) @@ -97,8 +101,8 @@ class ROB[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConf // Debugging signals val allocated_at = UInt(instructions_allocated.getWidth.W) } - val full_entries = Reg(Vec(rob_full_entries, UDValid(new Entry))) - val partial_entries = Reg(Vec(rob_partial_entries, UDValid(new Entry))) + val full_entries = Reg(Vec(reservation_station_full_entries, UDValid(new Entry))) + val partial_entries = Reg(Vec(reservation_station_partial_entries, UDValid(new Entry))) val entries = full_entries ++ partial_entries @@ -122,9 +126,9 @@ class ROB[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConf val new_entry = Wire(new Entry) new_entry := DontCare - val new_full_allocs = Wire(Vec(rob_full_entries, Bool())) + val new_full_allocs = Wire(Vec(reservation_station_full_entries, Bool())) new_full_allocs.foreach(_ := false.B) - val new_partial_allocs = Wire(Vec(rob_partial_entries, Bool())) + val new_partial_allocs = Wire(Vec(reservation_station_partial_entries, Bool())) new_partial_allocs.foreach(_ := false.B) val new_entry_oh = new_full_allocs ++ new_partial_allocs val alloc_fire = io.alloc.fire() @@ -333,8 +337,8 @@ class ROB[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConf new_entry.complete_on_issue := new_entry.is_config && new_entry.q =/= exq val is_full = PopCount(Seq(dst.valid, op1.valid, op2.valid)) > 1.U - val full_alloc_id = MuxCase((rob_full_entries-1).U, full_entries.zipWithIndex.map { case (e, i) => !e.valid -> i.U }) - val partial_alloc_id = MuxCase((rob_partial_entries-1).U, partial_entries.zipWithIndex.map { case (e, i) => !e.valid -> i.U }) + val full_alloc_id = MuxCase((reservation_station_full_entries-1).U, full_entries.zipWithIndex.map { case (e, i) => !e.valid -> i.U }) + val partial_alloc_id = MuxCase((reservation_station_partial_entries-1).U, partial_entries.zipWithIndex.map { case (e, i) => !e.valid -> i.U }) when (!is_full && !partial_entries(partial_alloc_id).valid) { io.alloc.ready := true.B @@ -453,7 +457,7 @@ class ROB[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConf dontTouch(e.bits.allocated_at) } - val cntr = Counter(10000000) + val cntr = Counter(2000000) when (cntr.inc()) { printf(p"Utilization: $utilization\n") printf(p"Utilization ld q (incomplete): $utilization_ld_q_unissued\n") @@ -462,17 +466,33 @@ class ROB[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConf printf(p"Utilization ld q: $utilization_ld_q\n") printf(p"Utilization st q: $utilization_st_q\n") printf(p"Utilization ex q: $utilization_ex_q\n") + + if (use_firesim_simulation_counters) { + printf(SynthesizePrintf("Utilization: %d\n", utilization)) + printf(SynthesizePrintf("Utilization ld q (incomplete): %d\n", utilization_ld_q_unissued)) + printf(SynthesizePrintf("Utilization st q (incomplete): %d\n", utilization_st_q_unissued)) + printf(SynthesizePrintf("Utilization ex q (incomplete): %d\n", utilization_ex_q_unissued)) + printf(SynthesizePrintf("Utilization ld q: %d\n", utilization_ld_q)) + printf(SynthesizePrintf("Utilization st q: %d\n", utilization_st_q)) + printf(SynthesizePrintf("Utilization ex q: %d\n", utilization_ex_q)) + } + printf(p"Packed deps: $packed_deps\n") } + if (use_firesim_simulation_counters) { + PerfCounter(io.busy, "reservation_station_busy", "cycles where reservation station has entries") + PerfCounter(!io.alloc.ready, "reservation_station_full", "cycles where reservation station is full") + } + when (reset.asBool()) { entries.foreach(_.valid := false.B) } CounterEventIO.init(io.counter) - io.counter.connectExternalCounter(CounterExternal.ROB_LD_COUNT, utilization_ld_q) - io.counter.connectExternalCounter(CounterExternal.ROB_ST_COUNT, utilization_st_q) - io.counter.connectExternalCounter(CounterExternal.ROB_EX_COUNT, utilization_ex_q) - io.counter.connectEventSignal(CounterEvent.ROB_ACTIVE_CYCLES, io.busy) - io.counter.connectEventSignal(CounterEvent.ROB_FULL_CYCLES, !io.alloc.ready) + io.counter.connectExternalCounter(CounterExternal.RESERVATION_STATION_LD_COUNT, utilization_ld_q) + io.counter.connectExternalCounter(CounterExternal.RESERVATION_STATION_ST_COUNT, utilization_st_q) + io.counter.connectExternalCounter(CounterExternal.RESERVATION_STATION_EX_COUNT, utilization_ex_q) + io.counter.connectEventSignal(CounterEvent.RESERVATION_STATION_ACTIVE_CYCLES, io.busy) + io.counter.connectEventSignal(CounterEvent.RESERVATION_STATION_FULL_CYCLES, !io.alloc.ready) } diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala index f9b6293f..e3289b7f 100644 --- a/src/main/scala/gemmini/Scratchpad.scala +++ b/src/main/scala/gemmini/Scratchpad.scala @@ -160,10 +160,12 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, val id_node = TLIdentityNode() val xbar_node = TLXbar() - val reader = LazyModule(new StreamReader(config, max_in_flight_reqs, dataBits, maxBytes, spad_w, acc_w, aligned_to, - sp_banks * sp_bank_entries, acc_banks * acc_bank_entries, block_rows, use_tlb_register_filter)) - val writer = LazyModule(new StreamWriter(max_in_flight_reqs, dataBits, maxBytes, - if (acc_read_full_width) acc_w else spad_w, aligned_to, inputType, block_cols, use_tlb_register_filter)) + val reader = LazyModule(new StreamReader(config, max_in_flight_mem_reqs, dataBits, maxBytes, spad_w, acc_w, aligned_to, + sp_banks * sp_bank_entries, acc_banks * acc_bank_entries, block_rows, use_tlb_register_filter, + use_firesim_simulation_counters)) + val writer = LazyModule(new StreamWriter(max_in_flight_mem_reqs, dataBits, maxBytes, + if (acc_read_full_width) acc_w else spad_w, aligned_to, inputType, block_cols, use_tlb_register_filter, + use_firesim_simulation_counters)) // TODO make a cross-bar vs two separate ports a config option // id_node :=* reader.node @@ -191,7 +193,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, // Accumulator ports val acc = new Bundle { val read_req = Flipped(Vec(acc_banks, Decoupled(new AccumulatorReadReq( - acc_bank_entries, log2Up(accType.getWidth), acc_scale_args.multiplicand_t + acc_bank_entries, log2Up(accType.getWidth), acc_scale_t.asInstanceOf[V] )))) val read_resp = Vec(acc_banks, Decoupled(new AccumulatorScaleResp( Vec(meshColumns, Vec(tileColumns, inputType)), @@ -216,9 +218,9 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, // Write scale queue is necessary to maintain in-order requests to accumulator scale unit // Writes from main SPAD just flow directly between scale_q and issue_q, while writes // From acc are ordered - val write_scale_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, acc_scale_t_bits), mem_pipeline)) - val write_issue_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, acc_scale_t_bits), mem_pipeline+1, pipe=true)) - val read_issue_q = Module(new Queue(new ScratchpadMemReadRequest(local_addr_t, mvin_scale_t_bits), mem_pipeline+1, pipe=true)) // TODO can't this just be a normal queue? + val write_scale_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, acc_scale_t_bits), spad_read_delay)) + val write_issue_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, acc_scale_t_bits), spad_read_delay+1, pipe=true)) + val read_issue_q = Module(new Queue(new ScratchpadMemReadRequest(local_addr_t, mvin_scale_t_bits), spad_read_delay+1, pipe=true)) // TODO can't this just be a normal queue? write_scale_q.io.enq.valid := false.B write_scale_q.io.enq.bits := write_dispatch_q.bits @@ -409,8 +411,8 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, ex_read_resp.valid := bio.read.resp.valid && !bio.read.resp.bits.fromDMA ex_read_resp.bits := bio.read.resp.bits - val dma_read_pipe = Pipeline(dma_read_resp, mem_pipeline) - val ex_read_pipe = Pipeline(ex_read_resp, mem_pipeline) + val dma_read_pipe = Pipeline(dma_read_resp, spad_read_delay) + val ex_read_pipe = Pipeline(ex_read_resp, spad_read_delay) bio.read.resp.ready := Mux(bio.read.resp.bits.fromDMA, dma_read_resp.ready, ex_read_resp.ready) @@ -478,11 +480,14 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, val acc_scale_unit = Module(new AccumulatorScale( acc_row_t, spad_row_t, - acc_scale_args.multiplicand_t, + acc_scale_t.asInstanceOf[V], log2Up(accType.getWidth), acc_read_small_width, acc_read_full_width, - acc_scale_args + acc_scale_func, + acc_scale_num_units, + acc_scale_latency, + has_nonlinear_activations, )) acc_scale_unit.io.in.valid := false.B @@ -511,8 +516,8 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, { val banks = Seq.fill(acc_banks) { Module(new AccumulatorMem( - acc_bank_entries, acc_row_t, acc_scale_args, - acc_singleported, num_acc_sub_banks + acc_bank_entries, acc_row_t, acc_scale_func, acc_scale_t.asInstanceOf[V], + acc_singleported, acc_sub_banks )) } val bank_ios = VecInit(banks.map(_.io)) @@ -610,13 +615,13 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, !((mvin_scale_out.valid && mvin_scale_out.bits.last) || (mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.last)) val consecutive_write_block = RegInit(false.B) if (acc_singleported) { - val consecutive_write_sub_bank = RegInit(0.U((1 max log2Ceil(num_acc_sub_banks)).W)) + val consecutive_write_sub_bank = RegInit(0.U((1 max log2Ceil(acc_sub_banks)).W)) when (bio.write.fire() && bio.write.bits.acc && - (bio.write.bits.addr(log2Ceil(num_acc_sub_banks)-1,0) === consecutive_write_sub_bank)) { + (bio.write.bits.addr(log2Ceil(acc_sub_banks)-1,0) === consecutive_write_sub_bank)) { consecutive_write_block := true.B } .elsewhen (bio.write.fire() && bio.write.bits.acc) { consecutive_write_block := false.B - consecutive_write_sub_bank := bio.write.bits.addr(log2Ceil(num_acc_sub_banks)-1,0) + consecutive_write_sub_bank := bio.write.bits.addr(log2Ceil(acc_sub_banks)-1,0) } .otherwise { consecutive_write_block := false.B } diff --git a/src/main/scala/gemmini/StoreController.scala b/src/main/scala/gemmini/StoreController.scala index 50efcfe5..28de72c3 100644 --- a/src/main/scala/gemmini/StoreController.scala +++ b/src/main/scala/gemmini/StoreController.scala @@ -7,6 +7,7 @@ import chisel3.experimental._ import GemminiISA._ import Util._ import freechips.rocketchip.config.Parameters +import midas.targetutils.PerfCounter // TODO this is almost a complete copy of LoadController. We should combine them into one class // TODO deal with errors when reading scratchpad responses @@ -42,7 +43,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm val max_blocks = (dma_maxbytes / (block_cols * inputType.getWidth / 8)) max 1 val activation = Reg(UInt(GemminiISA.CONFIG_MVOUT_RS1_ACTIVATION_WIDTH.W)) - val acc_scale = Reg(acc_scale_args.multiplicand_t) + val acc_scale = Reg(acc_scale_t) //val row_counter = RegInit(0.U(log2Ceil(block_rows).W)) val row_counter = RegInit(0.U(12.W)) // TODO magic number @@ -64,7 +65,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm val wrow_counter = RegInit(0.U(pool_size.getWidth.W)) val wcol_counter = RegInit(0.U(pool_size.getWidth.W)) - val pooling_is_enabled = pool_stride =/= 0.U + val pooling_is_enabled = has_max_pool.B && pool_stride =/= 0.U val mvout_1d_enabled = pool_size =/= 0.U && !pooling_is_enabled //1-D move out enabled (no pooling) val orow = porow_counter * pool_stride +& wrow_counter - pool_upad // TODO get rid of this multiplication @@ -118,7 +119,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm val mvout_1d_rows = pool_orows * pool_ocols //for 1D mvout // Command tracker instantiation - val nCmds = (max_in_flight_reqs / block_rows) + 1 + val nCmds = (max_in_flight_mem_reqs / block_rows) + 1 val deps_t = new Bundle { val rob_id = UInt(log2Up(rob_entries).W) @@ -200,7 +201,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm activation := config_activation when (!config_acc_scale.asUInt().andR()) { - acc_scale := config_acc_scale.asTypeOf(acc_scale_args.multiplicand_t) + acc_scale := config_acc_scale.asTypeOf(acc_scale_t) } pool_size := config_pool_size @@ -265,4 +266,9 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm io.counter.connectEventSignal(CounterEvent.STORE_POOLING_CYCLE, pooling_is_enabled) io.counter.connectEventSignal(CounterEvent.STORE_DMA_WAIT_CYCLE, control_state === waiting_for_dma_req_ready) io.counter.connectEventSignal(CounterEvent.STORE_SCRATCHPAD_WAIT_CYCLE, io.dma.req.valid && !io.dma.req.ready) + + if (use_firesim_simulation_counters) { + PerfCounter(pooling_is_enabled, "pooling_cycles", "cycles during which store controller is max-pooling") + PerfCounter(io.dma.req.valid && !io.dma.req.ready, "st_dma_wait_cycle", "cycles during which store controller is stalling for the DMA to be ready") + } } diff --git a/src/main/scala/gemmini/TransposePreloadUnroller.scala b/src/main/scala/gemmini/TransposePreloadUnroller.scala index 90a3394a..0bac0e5b 100644 --- a/src/main/scala/gemmini/TransposePreloadUnroller.scala +++ b/src/main/scala/gemmini/TransposePreloadUnroller.scala @@ -5,6 +5,7 @@ import chisel3.util._ import chisel3.experimental.ChiselEnum import chipsalliance.rocketchip.config.Parameters import Util._ +import midas.targetutils.PerfCounter class TransposePreloadUnroller[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V]) (implicit p: Parameters) extends Module { diff --git a/src/main/scala/gemmini/XactTracker.scala b/src/main/scala/gemmini/XactTracker.scala index efdd7636..e8581a26 100644 --- a/src/main/scala/gemmini/XactTracker.scala +++ b/src/main/scala/gemmini/XactTracker.scala @@ -3,6 +3,7 @@ package gemmini import chisel3._ import chisel3.util._ import gemmini.Util.UDValid +import midas.targetutils.SynthesizePrintf class XactTrackerEntry[U <: Data](maxShift: Int, spadWidth: Int, accWidth: Int, spadRows: Int, accRows: Int, maxReqBytes: Int, mvin_scale_t_bits: Int, @@ -53,11 +54,14 @@ class XactTrackerPeekIO[U <: Data](val nXacts: Int, val maxShift: Int, val spadW maxMatrices: the maximum number of rows from different matrices which can be packed into one request */ class XactTracker[U <: Data](nXacts: Int, maxShift: Int, spadWidth: Int, accWidth: Int, - spadRows: Int, accRows: Int, maxReqBytes: Int, mvin_scale_t_bits: Int, nCmds: Int) extends Module { + spadRows: Int, accRows: Int, maxReqBytes: Int, mvin_scale_t_bits: Int, nCmds: Int, + use_firesim_simulation_counters: Boolean) extends Module { val io = IO(new Bundle { val alloc = Flipped(new XactTrackerAllocIO(nXacts, maxShift, spadWidth, accWidth, spadRows, accRows, maxReqBytes, mvin_scale_t_bits, nCmds)) val peek = new XactTrackerPeekIO(nXacts, maxShift, spadWidth, accWidth, spadRows, accRows, maxReqBytes, mvin_scale_t_bits, nCmds) val busy = Output(Bool()) + + val counter = new CounterEventIO() }) val entries = Reg(Vec(nXacts, UDValid(new XactTrackerEntry(maxShift, spadWidth, accWidth, spadRows, accRows, maxReqBytes, mvin_scale_t_bits, nCmds)))) @@ -83,4 +87,23 @@ class XactTracker[U <: Data](nXacts: Int, maxShift: Int, spadWidth: Int, accWidt when (reset.asBool()) { entries.foreach(_.valid := false.B) } + + // Performance counters + CounterEventIO.init(io.counter) + + val total_latency = RegInit(0.U(CounterExternal.EXTERNAL_WIDTH.W)) + when (io.counter.external_reset) { + total_latency := 0.U + }.otherwise { + total_latency := total_latency + PopCount(entries.map(_.valid)) + } + + io.counter.connectExternalCounter(CounterExternal.RDMA_TOTAL_LATENCY, total_latency) + + if (use_firesim_simulation_counters) { + val cntr = Counter(500000) + when(cntr.inc()) { + printf(SynthesizePrintf("RDMA total latency: %d\n", total_latency)) + } + } } From 73484616cd70f75d05d824efe2f6604b88459f5d Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Fri, 12 Nov 2021 14:31:23 -0800 Subject: [PATCH 05/11] Add Option to Use Two Separate TLBs for Read and Write DMAs (#135) By default, there will be just one TLB shared by both the read and write DMAs --- src/main/scala/gemmini/Controller.scala | 22 +++--- src/main/scala/gemmini/FrontendTLB.scala | 75 +++++++++++---------- src/main/scala/gemmini/GemminiConfigs.scala | 1 + 3 files changed, 54 insertions(+), 44 deletions(-) diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala index 08481a5c..f1de9486 100644 --- a/src/main/scala/gemmini/Controller.scala +++ b/src/main/scala/gemmini/Controller.scala @@ -24,7 +24,7 @@ class Gemmini[T <: Data : Arithmetic, U <: Data, V <: Data](val config: GemminiA (implicit p: Parameters) extends LazyRoCC ( opcodes = config.opcodes, - nPTWPorts = 1) { + nPTWPorts = if (config.use_shared_tlb) 1 else 2) { Files.write(Paths.get(config.headerFilePath), config.generateHeader().getBytes(StandardCharsets.UTF_8)) if (System.getenv("GEMMINI_ONLY_GENERATE_GEMMINI_H") == "1") { @@ -62,15 +62,17 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] // TLB implicit val edge = outer.node.edges.out.head - val tlb = Module(new FrontendTLB(2, tlb_size, dma_maxbytes, use_tlb_register_filter, use_firesim_simulation_counters)) + val tlb = Module(new FrontendTLB(2, tlb_size, dma_maxbytes, use_tlb_register_filter, use_firesim_simulation_counters, use_shared_tlb)) (tlb.io.clients zip outer.spad.module.io.tlb).foreach(t => t._1 <> t._2) - tlb.io.exp.flush_skip := false.B - tlb.io.exp.flush_retry := false.B - counters.io.event_io.collect(tlb.io.counter) - io.ptw.head <> tlb.io.ptw + tlb.io.exp.foreach(_.flush_skip := false.B) + tlb.io.exp.foreach(_.flush_retry := false.B) + + io.ptw <> tlb.io.ptw - spad.module.io.flush := tlb.io.exp.flush() + counters.io.event_io.collect(tlb.io.counter) + + spad.module.io.flush := tlb.io.exp.map(_.flush()).reduce(_ || _) /* //========================================================================= @@ -311,7 +313,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] // Wire up global RoCC signals io.busy := raw_cmd.valid || loop_conv_unroller_busy || loop_matmul_unroller_busy || reservation_station.io.busy || spad.module.io.busy || unrolled_cmd.valid || loop_cmd.valid || conv_cmd.valid - io.interrupt := tlb.io.exp.interrupt + io.interrupt := tlb.io.exp.map(_.interrupt).reduce(_ || _) reservation_station.io.solitary_preload := ex_controller.io.solitary_preload @@ -356,8 +358,8 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] when (is_flush) { val skip = unrolled_cmd.bits.rs1(0) - tlb.io.exp.flush_skip := skip - tlb.io.exp.flush_retry := !skip + tlb.io.exp.foreach(_.flush_skip := skip) + tlb.io.exp.foreach(_.flush_retry := !skip) unrolled_cmd.ready := true.B // TODO should we wait for an acknowledgement from the TLB? } diff --git a/src/main/scala/gemmini/FrontendTLB.scala b/src/main/scala/gemmini/FrontendTLB.scala index 50c393b5..269409fc 100644 --- a/src/main/scala/gemmini/FrontendTLB.scala +++ b/src/main/scala/gemmini/FrontendTLB.scala @@ -84,51 +84,66 @@ class FrontendTLBIO(implicit p: Parameters) extends CoreBundle { val resp = Flipped(new TLBResp) } -class FrontendTLB(nClients: Int, entries: Int, maxSize: Int, use_tlb_register_filter: Boolean, use_firesim_simulation_counters: Boolean) +class FrontendTLB(nClients: Int, entries: Int, maxSize: Int, use_tlb_register_filter: Boolean, use_firesim_simulation_counters: Boolean, use_shared_tlb: Boolean) (implicit edge: TLEdgeOut, p: Parameters) extends CoreModule { + + val num_tlbs = if (use_shared_tlb) 1 else nClients + val lgMaxSize = log2Ceil(coreDataBytes) + val io = IO(new Bundle { val clients = Flipped(Vec(nClients, new FrontendTLBIO)) - val ptw = new TLBPTWIO - val exp = new TLBExceptionIO + val ptw = Vec(num_tlbs, new TLBPTWIO) + val exp = Vec(num_tlbs, new TLBExceptionIO) val counter = new CounterEventIO() }) - val lgMaxSize = log2Ceil(coreDataBytes) - val tlbArb = Module(new RRArbiter(new DecoupledTLBReq(lgMaxSize), nClients)) - val tlb = Module(new DecoupledTLB(entries, maxSize, use_firesim_simulation_counters)) - tlb.io.req.valid := tlbArb.io.out.valid - tlb.io.req.bits := tlbArb.io.out.bits - tlbArb.io.out.ready := true.B + val tlbs = Seq.fill(num_tlbs)(Module(new DecoupledTLB(entries, maxSize, use_firesim_simulation_counters))) - io.ptw <> tlb.io.ptw - io.exp <> tlb.io.exp + io.ptw <> VecInit(tlbs.map(_.io.ptw)) + io.exp <> VecInit(tlbs.map(_.io.exp)) + + val tlbArbOpt = if (use_shared_tlb) Some(Module(new RRArbiter(new DecoupledTLBReq(lgMaxSize), nClients))) else None + + if (use_shared_tlb) { + val tlbArb = tlbArbOpt.get + val tlb = tlbs.head + tlb.io.req.valid := tlbArb.io.out.valid + tlb.io.req.bits := tlbArb.io.out.bits + tlbArb.io.out.ready := true.B + } - io.clients.zip(tlbArb.io.in).foreach { case (client, req) => + io.clients.zipWithIndex.foreach { case (client, i) => val last_translated_valid = RegInit(false.B) val last_translated_vpn = RegInit(0.U(vaddrBits.W)) val last_translated_ppn = RegInit(0.U(paddrBits.W)) - val l0_tlb_hit = last_translated_valid && ((client.req.bits.tlb_req.vaddr >> pgIdxBits) === (last_translated_vpn >> pgIdxBits)) + val l0_tlb_hit = last_translated_valid && ((client.req.bits.tlb_req.vaddr >> pgIdxBits).asUInt() === (last_translated_vpn >> pgIdxBits).asUInt()) val l0_tlb_paddr = Cat(last_translated_ppn >> pgIdxBits, client.req.bits.tlb_req.vaddr(pgIdxBits-1,0)) - when (req.fire() && !tlb.io.resp.miss) { + val tlb = if (use_shared_tlb) tlbs.head else tlbs(i) + val tlbReq = if (use_shared_tlb) tlbArbOpt.get.io.in(i).bits else tlb.io.req.bits + val tlbReqValid = if (use_shared_tlb) tlbArbOpt.get.io.in(i).valid else tlb.io.req.valid + val tlbReqFire = if (use_shared_tlb) tlbArbOpt.get.io.in(i).fire() else tlb.io.req.fire() + + tlbReqValid := RegNext(client.req.valid && !l0_tlb_hit) + tlbReq := RegNext(client.req.bits) + + when (tlbReqFire && !tlb.io.resp.miss) { last_translated_valid := true.B - last_translated_vpn := req.bits.tlb_req.vaddr + last_translated_vpn := tlbReq.tlb_req.vaddr last_translated_ppn := tlb.io.resp.paddr } - when (io.exp.flush()) { + + when (tlb.io.exp.flush()) { last_translated_valid := false.B } - req.valid := RegNext(client.req.valid && !l0_tlb_hit) - req.bits := RegNext(client.req.bits) - - when (!req.fire()) { + when (tlbReqFire) { + client.resp := tlb.io.resp + }.otherwise { client.resp := DontCare client.resp.paddr := RegNext(l0_tlb_paddr) client.resp.miss := !RegNext(l0_tlb_hit) - } .otherwise { - client.resp := tlb.io.resp } // If we're not using the TLB filter register, then we set this value to always be false @@ -137,16 +152,8 @@ class FrontendTLB(nClients: Int, entries: Int, maxSize: Int, use_tlb_register_fi } } - io.counter.collect(tlb.io.counter) + // TODO Return the sum of the TLB counters, rather than just the counters of the first TLB. This only matters if we're + // not using the shared TLB + tlbs.foreach(_.io.counter.external_reset := false.B) + io.counter.collect(tlbs.head.io.counter) } - -/*class TLBArb (nClients: Int, lgMaxSize: Int)(implicit p: Parameters) extends CoreModule { - val io = IO(new Bundle { - val in_req = Vec(nClients, Flipped(Decoupled(new TLBReq(lgMaxSize)))) - val in_resp = Vec(nClients, Flipped(Valid(new TLBResp))) - val out_req = Decoupled(new TLBReq(lgMaxSize)) - val out_resp = Valid(new TLBResp) - }) - - val priority = Reg(UInt(log2Up(nClients).W)) -}*/ diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala index 45c481ce..041bfcd0 100644 --- a/src/main/scala/gemmini/GemminiConfigs.scala +++ b/src/main/scala/gemmini/GemminiConfigs.scala @@ -73,6 +73,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( ex_write_to_acc: Boolean = true, hardcode_d_to_garbage_addr: Boolean = false, + use_shared_tlb: Boolean = true, mesh_output_delay: Int = 1, From b9ff1540c9043b7600be37cded897f4bb0a81897 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Tue, 30 Nov 2021 21:32:38 -0800 Subject: [PATCH 06/11] Chip optimizations (#83) * Support single-porting accumulator through the use of accumulator "sub-banks" * Support clock-gating Gemmini modules * Support sharing SPAD/ACC between Int8 and FP gemminis * Reduce bitwidths of loop unroller multipliers and adders * Fix error where small portion of scratchpad was unusable when double-buffering in the loop unrollers When single-porting the accumulator banks, input-dilated convs will sometimes fail because they keep writing to the same accumulator banks. A different write pattern will have to be found eventually for those cases, but that's outside the scope of this PR. --- src/main/scala/gemmini/AccumulatorMem.scala | 253 ++++++++++++------ src/main/scala/gemmini/Configs.scala | 83 +++++- src/main/scala/gemmini/ConfigsFP.scala | 5 +- src/main/scala/gemmini/Controller.scala | 37 ++- src/main/scala/gemmini/DSEConfigs.scala | 5 +- src/main/scala/gemmini/GemminiConfigs.scala | 12 +- src/main/scala/gemmini/GemminiISA.scala | 2 + src/main/scala/gemmini/LocalAddr.scala | 10 + src/main/scala/gemmini/LoopConv.scala | 27 +- src/main/scala/gemmini/LoopMatmul.scala | 25 +- src/main/scala/gemmini/Scratchpad.scala | 89 ++++-- src/main/scala/gemmini/SharedExtMem.scala | 80 ++++++ .../gemmini/VectorScalarMultiplier.scala | 4 +- 13 files changed, 488 insertions(+), 144 deletions(-) create mode 100644 src/main/scala/gemmini/SharedExtMem.scala diff --git a/src/main/scala/gemmini/AccumulatorMem.scala b/src/main/scala/gemmini/AccumulatorMem.scala index 89a39182..8f3fbaf5 100644 --- a/src/main/scala/gemmini/AccumulatorMem.scala +++ b/src/main/scala/gemmini/AccumulatorMem.scala @@ -44,17 +44,59 @@ class AccumulatorWriteReq[T <: Data: Arithmetic](n: Int, t: Vec[Vec[T]]) extends override def cloneType: this.type = new AccumulatorWriteReq(n, t).asInstanceOf[this.type] } -class AccumulatorMemIO [T <: Data: Arithmetic, U <: Data](n: Int, t: Vec[Vec[T]], scale_t: U) extends Bundle { + +class AccumulatorMemIO [T <: Data: Arithmetic, U <: Data](n: Int, t: Vec[Vec[T]], scale_t: U, + acc_sub_banks: Int, use_shared_ext_mem: Boolean +) extends Bundle { val read = Flipped(new AccumulatorReadIO(n, log2Ceil(t.head.head.getWidth), t, scale_t)) - // val write = Flipped(new AccumulatorWriteIO(n, t)) val write = Flipped(Decoupled(new AccumulatorWriteReq(n, t))) - override def cloneType: this.type = new AccumulatorMemIO(n, t, scale_t).asInstanceOf[this.type] + val ext_mem = if (use_shared_ext_mem) Some(Vec(acc_sub_banks, new ExtMemIO)) else None + + val adder = new Bundle { + val valid = Output(Bool()) + val op1 = Output(t.cloneType) + val op2 = Output(t.cloneType) + val sum = Input(t.cloneType) + } + + override def cloneType: this.type = new AccumulatorMemIO(n, t, scale_t, acc_sub_banks, use_shared_ext_mem).asInstanceOf[this.type] +} + +class AccPipe[T <: Data : Arithmetic](latency: Int, t: T)(implicit ev: Arithmetic[T]) extends Module { + val io = IO(new Bundle { + val op1 = Input(t.cloneType) + val op2 = Input(t.cloneType) + val sum = Output(t.cloneType) + }) + import ev._ + io.sum := ShiftRegister(io.op1 + io.op2, latency) +} + +class AccPipeShared[T <: Data : Arithmetic](latency: Int, t: Vec[Vec[T]], banks: Int) extends Module { + val io = IO(new Bundle { + val in_sel = Input(Vec(banks, Bool())) + val ina = Input(Vec(banks, t.cloneType)) + val inb = Input(Vec(banks, t.cloneType)) + val out = Output(t.cloneType) + }) + val ina = Mux1H(io.in_sel, io.ina) + val inb = Mux1H(io.in_sel, io.inb) + io.out := VecInit((ina zip inb).map { case (rv, wv) => + VecInit((rv zip wv).map { case (re, we) => + val m = Module(new AccPipe(latency, t.head.head.cloneType)) + m.io.op1 := re + m.io.op2 := we + m.io.sum + }) + }) } class AccumulatorMem[T <: Data, U <: Data]( - n: Int, t: Vec[Vec[T]], scale_func: (T, U) => T, scale_t: U, - acc_singleported: Boolean, acc_sub_banks: Int + n: Int, t: Vec[Vec[T]], scale_func: (T, U) => T, scale_t: U, + acc_singleported: Boolean, acc_sub_banks: Int, + use_shared_ext_mem: Boolean, + acc_latency: Int, acc_type: T ) (implicit ev: Arithmetic[T]) extends Module { // TODO Do writes in this module work with matrices of size 2? If we try to read from an address right after writing @@ -69,54 +111,91 @@ class AccumulatorMem[T <: Data, U <: Data]( import ev._ // TODO unify this with TwoPortSyncMemIO - val io = IO(new AccumulatorMemIO(n, t, scale_t)) - - - // For any write operation, we spend 2 cycles reading the existing address out, buffering it in a register, and then - // accumulating on top of it (if necessary) - val wdata_buf = ShiftRegister(io.write.bits.data, 2) - val waddr_buf = ShiftRegister(io.write.bits.addr, 2) - val acc_buf = ShiftRegister(io.write.bits.acc, 2) - val mask_buf = ShiftRegister(io.write.bits.mask, 2) - val w_buf_valid = ShiftRegister(io.write.fire(), 2) - val acc_rdata = Wire(t) - acc_rdata := DontCare - val read_rdata = Wire(t) - read_rdata := DontCare + val io = IO(new AccumulatorMemIO(n, t, scale_t, acc_sub_banks, use_shared_ext_mem)) + + require (acc_latency >= 2) + + val pipelined_writes = Reg(Vec(acc_latency, Valid(new AccumulatorWriteReq(n, t)))) + val oldest_pipelined_write = pipelined_writes(acc_latency-1) + pipelined_writes(0).valid := io.write.fire() + pipelined_writes(0).bits := io.write.bits + for (i <- 1 until acc_latency) { + pipelined_writes(i) := pipelined_writes(i-1) + } + + val rdata_for_adder = Wire(t) + rdata_for_adder := DontCare + val rdata_for_read_resp = Wire(t) + rdata_for_read_resp := DontCare + + val adder_sum = io.adder.sum + io.adder.valid := pipelined_writes(0).valid && pipelined_writes(0).bits.acc + io.adder.op1 := rdata_for_adder + io.adder.op2 := pipelined_writes(0).bits.data + val block_read_req = WireInit(false.B) - val w_sum = VecInit((RegNext(acc_rdata) zip wdata_buf).map { case (rv, wv) => - VecInit((rv zip wv).map(t => t._1 + t._2)) - }) + val block_write_req = WireInit(false.B) + + val mask_len = t.getWidth / 8 + val mask_elem = UInt((t.getWidth / mask_len).W) if (!acc_singleported) { - val mem = TwoPortSyncMem(n, t, t.getWidth / 8) // TODO We assume byte-alignment here. Use aligned_to instead - mem.io.waddr := waddr_buf - mem.io.wen := w_buf_valid - mem.io.wdata := Mux(acc_buf, w_sum, wdata_buf) - mem.io.mask := mask_buf - acc_rdata := mem.io.rdata - read_rdata := mem.io.rdata + require(!use_shared_ext_mem) + val mem = TwoPortSyncMem(n, t, mask_len) // TODO We assume byte-alignment here. Use aligned_to instead + mem.io.waddr := oldest_pipelined_write.bits.addr + mem.io.wen := oldest_pipelined_write.valid + mem.io.wdata := Mux(oldest_pipelined_write.bits.acc, adder_sum, oldest_pipelined_write.bits.data) + mem.io.mask := oldest_pipelined_write.bits.mask + rdata_for_adder := mem.io.rdata + rdata_for_read_resp := mem.io.rdata mem.io.raddr := Mux(io.write.fire() && io.write.bits.acc, io.write.bits.addr, io.read.req.bits.addr) mem.io.ren := io.read.req.fire() || (io.write.fire() && io.write.bits.acc) } else { - val mask_len = t.getWidth / 8 - val mask_elem = UInt((t.getWidth / mask_len).W) - val reads = Wire(Vec(2, Decoupled(UInt()))) - reads(0).valid := io.write.valid && io.write.bits.acc - reads(0).bits := io.write.bits.addr - reads(0).ready := true.B - reads(1).valid := io.read.req.valid - reads(1).bits := io.read.req.bits.addr - reads(1).ready := true.B - block_read_req := !reads(1).ready + val rmw_req = Wire(Decoupled(UInt())) + rmw_req.valid := io.write.valid && io.write.bits.acc + rmw_req.bits := io.write.bits.addr + rmw_req.ready := true.B + + block_write_req := !rmw_req.ready + + val only_read_req = Wire(Decoupled(UInt())) + only_read_req.valid := io.read.req.valid + only_read_req.bits := io.read.req.bits.addr + only_read_req.ready := true.B + + block_read_req := !only_read_req.ready + for (i <- 0 until acc_sub_banks) { def isThisBank(addr: UInt) = addr(log2Ceil(acc_sub_banks)-1,0) === i.U - def getBankIdx(addr: UInt): UInt = (addr >> log2Ceil(acc_sub_banks)).asUInt() - val mem = SyncReadMem(n / acc_sub_banks, Vec(mask_len, mask_elem)) + def getBankIdx(addr: UInt) = addr >> log2Ceil(acc_sub_banks) + val (read, write) = if (use_shared_ext_mem) { + def read(addr: UInt, ren: Bool): Data = { + io.ext_mem.get(i).read_en := ren + io.ext_mem.get(i).read_addr := addr + io.ext_mem.get(i).read_data + } + io.ext_mem.get(i).write_en := false.B + io.ext_mem.get(i).write_addr := DontCare + io.ext_mem.get(i).write_data := DontCare + io.ext_mem.get(i).write_mask := DontCare + def write(addr: UInt, wdata: Vec[UInt], wmask: Vec[Bool]) = { + io.ext_mem.get(i).write_en := true.B + io.ext_mem.get(i).write_addr := addr + io.ext_mem.get(i).write_data := wdata.asUInt + io.ext_mem.get(i).write_mask := wmask.asUInt + } + (read _, write _) + } else { + val mem = SyncReadMem(n / acc_sub_banks, Vec(mask_len, mask_elem)) + def read(addr: UInt, ren: Bool): Data = mem.read(addr, ren) + def write(addr: UInt, wdata: Vec[UInt], wmask: Vec[Bool]) = mem.write(addr, wdata, wmask) + (read _, write _) + } val ren = WireInit(false.B) - val raddr = WireInit(getBankIdx(reads(0).bits)) + val raddr = WireInit(getBankIdx(rmw_req.bits)) val nEntries = 3 + // Writes coming 2 cycles after read leads to bad bank behavior // Add another buffer here class W_Q_Entry[T <: Data](mask_len: Int, mask_elem: T) extends Bundle { @@ -126,25 +205,32 @@ class AccumulatorMem[T <: Data, U <: Data]( val addr = UInt(log2Ceil(n/acc_sub_banks).W) override def cloneType: this.type = new W_Q_Entry(mask_len, mask_elem).asInstanceOf[this.type] } + val w_q = Reg(Vec(nEntries, new W_Q_Entry(mask_len, mask_elem))) for (e <- w_q) { when (e.valid) { assert(!( - io.write.valid && io.write.bits.acc && + io.write.fire() && io.write.bits.acc && isThisBank(io.write.bits.addr) && getBankIdx(io.write.bits.addr) === e.addr && ((io.write.bits.mask.asUInt & e.mask.asUInt) =/= 0.U) - )) + ), "you cannot accumulate to an AccumulatorMem address until previous writes to that address have completed") + + when (io.write.bits.acc && isThisBank(io.write.bits.addr) && getBankIdx(io.write.bits.addr) === e.addr) { + rmw_req.ready := false.B + } - when (io.read.req.valid && isThisBank(io.read.req.bits.addr) && getBankIdx(io.read.req.bits.addr) === e.addr) { - reads(1).ready := false.B + when (isThisBank(io.read.req.bits.addr) && getBankIdx(io.read.req.bits.addr) === e.addr) { + only_read_req.ready := false.B } } } + val w_q_head = RegInit(1.U(nEntries.W)) val w_q_tail = RegInit(1.U(nEntries.W)) - when (reset.asBool) { - w_q.foreach(_.valid := false.B) - } + + val w_q_full = (w_q_tail.asBools zip w_q.map(_.valid)).map({ case (h,v) => h && v }).reduce(_||_) + val w_q_empty = !(w_q_head.asBools zip w_q.map(_.valid)).map({ case (h,v) => h && v }).reduce(_||_) + val wen = WireInit(false.B) val wdata = Mux1H(w_q_head.asBools, w_q.map(_.data)) val wmask = Mux1H(w_q_head.asBools, w_q.map(_.mask)) @@ -158,49 +244,61 @@ class AccumulatorMem[T <: Data, U <: Data]( } } - when (w_buf_valid && isThisBank(waddr_buf)) { - assert(!((w_q_tail.asBools zip w_q.map(_.valid)).map({ case (h,v) => h && v }).reduce(_||_))) + val w_q_push = oldest_pipelined_write.valid && isThisBank(oldest_pipelined_write.bits.addr) + + when (w_q_push) { + assert(!w_q_full || wen, "we ran out of acc-sub-bank write q entries") + w_q_tail := (w_q_tail << 1).asUInt() | w_q_tail(nEntries-1) for (i <- 0 until nEntries) { when (w_q_tail(i)) { w_q(i).valid := true.B - w_q(i).data := Mux(acc_buf, w_sum, wdata_buf).asTypeOf(Vec(mask_len, mask_elem)) - w_q(i).mask := mask_buf - w_q(i).addr := getBankIdx(waddr_buf) + w_q(i).data := Mux(oldest_pipelined_write.bits.acc, adder_sum, oldest_pipelined_write.bits.data).asTypeOf(Vec(mask_len, mask_elem)) + w_q(i).mask := oldest_pipelined_write.bits.mask + w_q(i).addr := getBankIdx(oldest_pipelined_write.bits.addr) } } - } - val bank_rdata = mem.read(raddr, ren && !wen).asTypeOf(t) - when (RegNext(ren && reads(0).valid && isThisBank(reads(0).bits))) { - acc_rdata := bank_rdata + + val bank_rdata = read(raddr, ren && !wen).asTypeOf(t) + when (RegNext(ren && rmw_req.valid && isThisBank(rmw_req.bits))) { + rdata_for_adder := bank_rdata } .elsewhen (RegNext(ren)) { - read_rdata := bank_rdata + rdata_for_read_resp := bank_rdata } + when (wen) { - mem.write(waddr, wdata, wmask) + write(waddr, wdata, wmask) } + // Three requestors, 1 slot - // Priority is incoming reads for RMW > writes from RMW > incoming reads - when (reads(0).valid && isThisBank(reads(0).bits)) { + // Priority is (in descending order): + // 1. incoming reads for RMW + // 2. writes from RMW + // 3. incoming reads + when (rmw_req.fire() && isThisBank(rmw_req.bits)) { ren := true.B - when (isThisBank(reads(1).bits)) { - reads(1).ready := false.B + when (isThisBank(only_read_req.bits)) { + only_read_req.ready := false.B } - } .elsewhen ((w_q_head.asBools zip w_q.map(_.valid)).map({ case (h,v) => h && v }).reduce(_||_)) { + } .elsewhen (!w_q_empty) { wen := true.B - when (isThisBank(reads(1).bits)) { - reads(1).ready := false.B + when (isThisBank(only_read_req.bits)) { + only_read_req.ready := false.B } } .otherwise { - ren := isThisBank(reads(1).bits) - raddr := getBankIdx(reads(1).bits) + ren := isThisBank(only_read_req.bits) && only_read_req.fire() + raddr := getBankIdx(only_read_req.bits) + } + + when (reset.asBool) { + w_q.foreach(_.valid := false.B) } } } val q = Module(new Queue(new AccumulatorReadResp(t, scale_t, log2Ceil(t.head.head.getWidth)), 1, true, true)) - q.io.enq.bits.data := read_rdata + q.io.enq.bits.data := rdata_for_read_resp q.io.enq.bits.scale := RegNext(io.read.req.bits.scale) q.io.enq.bits.relu6_shift := RegNext(io.read.req.bits.relu6_shift) q.io.enq.bits.act := RegNext(io.read.req.bits.act) @@ -222,17 +320,18 @@ class AccumulatorMem[T <: Data, U <: Data]( val q_will_be_empty = (q.io.count +& q.io.enq.fire()) - q.io.deq.fire() === 0.U io.read.req.ready := q_will_be_empty && ( // Make sure we aren't accumulating, which would take over both ports - !(io.write.fire() && io.write.bits.acc) && - // Make sure we aren't reading something that is still being written - !(RegNext(io.write.fire()) && RegNext(io.write.bits.addr) === io.read.req.bits.addr) && - !(w_buf_valid && waddr_buf === io.read.req.bits.addr) && + !(io.write.valid && io.write.bits.acc) && + !pipelined_writes.map(r => r.valid && r.bits.addr === io.read.req.bits.addr).reduce(_||_) && !block_read_req ) - io.write.ready := !io.write.bits.acc || (!(io.write.bits.addr === waddr_buf && w_buf_valid) && - !(io.write.bits.addr === RegNext(io.write.bits.addr) && RegNext(io.write.fire()))) + io.write.ready := !block_write_req && + !pipelined_writes.map(r => r.valid && r.bits.addr === io.write.bits.addr && io.write.bits.acc).reduce(_||_) + + when (reset.asBool()) { + pipelined_writes.foreach(_.valid := false.B) + } // assert(!(io.read.req.valid && io.write.en && io.write.acc), "reading and accumulating simultaneously is not supported") assert(!(io.read.req.fire() && io.write.fire() && io.read.req.bits.addr === io.write.bits.addr), "reading from and writing to same address is not supported") - assert(!(io.read.req.fire() && w_buf_valid && waddr_buf === io.read.req.bits.addr), "reading from an address immediately after writing to it is not supported") } diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala index 2e172adf..7ceffcfe 100644 --- a/src/main/scala/gemmini/Configs.scala +++ b/src/main/scala/gemmini/Configs.scala @@ -5,7 +5,12 @@ import chisel3._ import freechips.rocketchip.config.{Config, Parameters} import freechips.rocketchip.diplomacy.LazyModule import freechips.rocketchip.subsystem._ -import freechips.rocketchip.tile.{BuildRoCC, OpcodeSet} +import freechips.rocketchip.tile.{BuildRoCC, OpcodeSet, XLen} +import freechips.rocketchip.rocket._ +import freechips.rocketchip.tile._ +import freechips.rocketchip.system._ +import freechips.rocketchip.diplomacy._ + import gemmini.Arithmetic.SIntArithmetic import hardfloat._ @@ -162,8 +167,10 @@ object GemminiConfigs { acc_scale_args=Some(defaultConfig.acc_scale_args.get.copy(latency=4)), acc_singleported=true, acc_sub_banks=2, + mesh_output_delay = 2, ex_read_from_acc=false, - ex_write_to_spad=false + ex_write_to_spad=false, + hardcode_d_to_garbage_addr = true ) val largeChipConfig = chipConfig.copy(sp_capacity=CapacityInKilobytes(128), acc_capacity=CapacityInKilobytes(64), @@ -190,3 +197,75 @@ class DefaultGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( ) case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) + +// This Gemmini config has both an Int and an FP Gemmini side-by-side, sharing +// the same scratchpad. +class DualGemminiConfig extends Config((site, here, up) => { + case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) + case BuildRoCC => { + var int_gemmini: Gemmini[_,_,_] = null + var fp_gemmini: Gemmini[_,_,_] = null + val int_fn = (p: Parameters) => { + implicit val q = p + int_gemmini = LazyModule(new Gemmini(GemminiConfigs.chipConfig.copy( + opcodes = OpcodeSet.custom3, + use_shared_ext_mem = true, + clock_gate = true + ))) + int_gemmini + } + val fp_fn = (p: Parameters) => { + implicit val q = p + fp_gemmini = LazyModule(new Gemmini(GemminiFPConfigs.BF16DefaultConfig.copy( + opcodes = OpcodeSet.custom2, + sp_capacity=CapacityInKilobytes(64), acc_capacity=CapacityInKilobytes(32), + meshColumns = 8, meshRows = 8, + acc_singleported = true, acc_banks = 2, acc_sub_banks = 2, + use_shared_ext_mem = true, + ex_read_from_acc=false, + ex_write_to_spad=false, + hardcode_d_to_garbage_addr = true, + headerFileName = "gemmini_params_bf16.h", + acc_latency = 3, + dataflow = Dataflow.WS, + mesh_output_delay = 3, + clock_gate = true + ))) + InModuleBody { + require(int_gemmini.config.sp_banks == fp_gemmini.config.sp_banks) + require(int_gemmini.config.acc_banks == fp_gemmini.config.acc_banks) + require(int_gemmini.config.acc_sub_banks == fp_gemmini.config.acc_sub_banks) + require(int_gemmini.config.sp_singleported && fp_gemmini.config.sp_singleported) + require(int_gemmini.config.acc_singleported && fp_gemmini.config.acc_singleported) + + require(int_gemmini.config.sp_bank_entries == fp_gemmini.config.sp_bank_entries) + require(int_gemmini.spad.module.spad_mems(0).mask_len == fp_gemmini.spad.module.spad_mems(0).mask_len) + require(int_gemmini.spad.module.spad_mems(0).mask_elem.getWidth == fp_gemmini.spad.module.spad_mems(0).mask_elem.getWidth) + + println(int_gemmini.config.acc_bank_entries, fp_gemmini.config.acc_bank_entries) + println(int_gemmini.spad.module.acc_mems(0).mask_len, fp_gemmini.spad.module.acc_mems(0).mask_len) + println(int_gemmini.spad.module.acc_mems(0).mask_elem.getWidth, fp_gemmini.spad.module.acc_mems(0).mask_elem.getWidth) + + require(int_gemmini.config.acc_bank_entries == fp_gemmini.config.acc_bank_entries / 2) + require(int_gemmini.config.acc_sub_banks == fp_gemmini.config.acc_sub_banks) + require(int_gemmini.spad.module.acc_mems(0).mask_len == fp_gemmini.spad.module.acc_mems(0).mask_len * 2) + require(int_gemmini.spad.module.acc_mems(0).mask_elem.getWidth == fp_gemmini.spad.module.acc_mems(0).mask_elem.getWidth) + + val spad_mask_len = int_gemmini.spad.module.spad_mems(0).mask_len + val spad_data_len = int_gemmini.spad.module.spad_mems(0).mask_elem.getWidth + val acc_mask_len = int_gemmini.spad.module.acc_mems(0).mask_len + val acc_data_len = int_gemmini.spad.module.acc_mems(0).mask_elem.getWidth + + val shared_mem = Module(new SharedExtMem( + int_gemmini.config.sp_banks, int_gemmini.config.acc_banks, int_gemmini.config.acc_sub_banks, + int_gemmini.config.sp_bank_entries, spad_mask_len, spad_data_len, + int_gemmini.config.acc_bank_entries / int_gemmini.config.acc_sub_banks, acc_mask_len, acc_data_len + )) + shared_mem.io.in(0) <> int_gemmini.module.ext_mem_io.get + shared_mem.io.in(1) <> fp_gemmini.module.ext_mem_io.get + } + fp_gemmini + } + up(BuildRoCC) ++ Seq(int_fn, fp_fn) + } +}) diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala index a54c2853..91a4dbd2 100644 --- a/src/main/scala/gemmini/ConfigsFP.scala +++ b/src/main/scala/gemmini/ConfigsFP.scala @@ -30,6 +30,7 @@ object GemminiFPConfigs { sp_banks = 4, sp_singleported = true, acc_banks = 1, + acc_latency = 2, acc_singleported = false, acc_sub_banks = -1, sp_capacity = CapacityInKilobytes(256), @@ -45,7 +46,7 @@ object GemminiFPConfigs { use_tlb_register_filter = true, max_in_flight_mem_reqs = 16, use_dedicated_tl_port = false, - + use_shared_ext_mem = false, inputType = Float(8, 24), spatialArrayOutputType = Float(8, 24), accType = Float(8, 24), @@ -84,7 +85,7 @@ object GemminiFPConfigs { mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), ) - + //FP16 Half Precision Configuration val FP16DefaultConfig = defaultFPConfig.copy(inputType = Float(5, 11), spatialArrayOutputType = Float(5, 11), accType = Float(8, 24), pe_latency = 2, diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala index f1de9486..3e74af93 100644 --- a/src/main/scala/gemmini/Controller.scala +++ b/src/main/scala/gemmini/Controller.scala @@ -9,6 +9,7 @@ import chisel3.util._ import freechips.rocketchip.config._ import freechips.rocketchip.diplomacy._ import freechips.rocketchip.tile._ +import freechips.rocketchip.util.ClockGate import freechips.rocketchip.tilelink.TLIdentityNode import GemminiISA._ import Util._ @@ -49,6 +50,9 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] import outer.config._ import outer.spad + val ext_mem_io = if (use_shared_ext_mem) Some(IO(new ExtSpadMemIO(sp_banks, acc_banks, acc_sub_banks))) else None + ext_mem_io.foreach(_ <> outer.spad.module.io.ext_mem.get) + val tagWidth = 32 // Counters @@ -74,6 +78,10 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] spad.module.io.flush := tlb.io.exp.map(_.flush()).reduce(_ || _) + val clock_en_reg = RegInit(true.B) + val gated_clock = if (clock_gate) ClockGate(clock, clock_en_reg, "gemmini_clock_gate") else clock + outer.spad.module.clock := gated_clock + /* //========================================================================= // Frontends: Incoming commands and ROB @@ -113,10 +121,12 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] val unrolled_cmd = LoopUnroller(raw_risc_cmd, outer.config.meshRows * outer.config.tileRows) */ - // Incoming commands and reservation station - val reservation_station = Module(new ReservationStation(outer.config, new RoCCCommand)) + val reservation_station = withClock (gated_clock) { Module(new ReservationStation(outer.config, new RoCCCommand)) } counters.io.event_io.collect(reservation_station.io.counter) + when (io.cmd.valid && io.cmd.bits.inst.funct === CLKGATE_EN && !io.busy) { + clock_en_reg := io.cmd.bits.rs1(0) + } val raw_cmd = Queue(io.cmd) val max_lds = reservation_station_partial_entries @@ -124,7 +134,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] val max_sts = reservation_station_partial_entries / 2 // TODO replace 4,12,2 with parameters based on ROB size - val (conv_cmd, loop_conv_unroller_busy) = LoopConv(raw_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization, + val (conv_cmd, loop_conv_unroller_busy) = withClock (gated_clock) { LoopConv(raw_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization, meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries, inputType.getWidth, accType.getWidth, dma_maxbytes, new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits), new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t), @@ -132,14 +142,14 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] new ConfigExRs1(acc_scale_t_bits), new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), - has_training_convs, has_max_pool) + has_training_convs, has_max_pool) } - val (loop_cmd, loop_matmul_unroller_busy) = LoopMatmul(conv_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization, + val (loop_cmd, loop_matmul_unroller_busy) = withClock (gated_clock) { LoopMatmul(conv_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization, meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries, inputType.getWidth, accType.getWidth, dma_maxbytes, new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t), new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), - new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t)) + new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t)) } val unrolled_cmd = Queue(loop_cmd) unrolled_cmd.ready := false.B @@ -167,9 +177,9 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] //========================================================================= // Controllers //========================================================================= - val load_controller = Module(new LoadController(outer.config, coreMaxAddrBits, local_addr_t)) - val store_controller = Module(new StoreController(outer.config, coreMaxAddrBits, local_addr_t)) - val ex_controller = Module(new ExecuteController(xLen, tagWidth, outer.config)) + val load_controller = withClock (gated_clock) { Module(new LoadController(outer.config, coreMaxAddrBits, local_addr_t)) } + val store_controller = withClock (gated_clock) { Module(new StoreController(outer.config, coreMaxAddrBits, local_addr_t)) } + val ex_controller = withClock (gated_clock) { Module(new ExecuteController(xLen, tagWidth, outer.config)) } counters.io.event_io.collect(load_controller.io.counter) counters.io.event_io.collect(store_controller.io.counter) @@ -240,7 +250,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] ex_controller.io.acc.write <> spad.module.io.acc.write // Im2Col unit - val im2col = Module(new Im2Col(outer.config)) + val im2col = withClock (gated_clock) { Module(new Im2Col(outer.config)) } // Wire up Im2col counters.io.event_io.collect(im2col.io.counter) @@ -313,6 +323,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] // Wire up global RoCC signals io.busy := raw_cmd.valid || loop_conv_unroller_busy || loop_matmul_unroller_busy || reservation_station.io.busy || spad.module.io.busy || unrolled_cmd.valid || loop_cmd.valid || conv_cmd.valid + io.interrupt := tlb.io.exp.map(_.interrupt).reduce(_ || _) reservation_station.io.solitary_preload := ex_controller.io.solitary_preload @@ -349,6 +360,8 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] val is_flush = risc_funct === FLUSH_CMD val is_counter_op = risc_funct === COUNTER_OP + val is_clock_gate_en = risc_funct === CLKGATE_EN + /* val is_load = (funct === LOAD_CMD) || (funct === CONFIG_CMD && config_cmd_type === CONFIG_LOAD) val is_store = (funct === STORE_CMD) || (funct === CONFIG_CMD && config_cmd_type === CONFIG_STORE) @@ -369,6 +382,10 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] counters.io.in <> unrolled_cmd } + .elsewhen (is_clock_gate_en) { + unrolled_cmd.ready := true.B + } + .otherwise { reservation_station.io.alloc.valid := true.B diff --git a/src/main/scala/gemmini/DSEConfigs.scala b/src/main/scala/gemmini/DSEConfigs.scala index 37fc70f4..0d4681b5 100644 --- a/src/main/scala/gemmini/DSEConfigs.scala +++ b/src/main/scala/gemmini/DSEConfigs.scala @@ -27,7 +27,7 @@ object DSEBaseConfig { sp_banks = 4, // TODO support one-bank designs acc_banks = 1, acc_singleported = false, - acc_sub_banks = -1, + acc_latency = 2, sp_capacity = CapacityInKilobytes(64), sp_singleported = false, shifter_banks = 1, // TODO add separate parameters for left and up shifter banks @@ -59,6 +59,7 @@ object DSEBaseConfig { acc_read_full_width = true, acc_read_small_width = true, use_dedicated_tl_port = false, + use_shared_ext_mem = true, pe_latency = 0, ex_read_from_spad = true, @@ -79,6 +80,8 @@ object DSEBaseConfig { has_nonlinear_activations = true, num_counter = 8, + + clock_gate = false, ) } diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala index 041bfcd0..beb46c71 100644 --- a/src/main/scala/gemmini/GemminiConfigs.scala +++ b/src/main/scala/gemmini/GemminiConfigs.scala @@ -15,12 +15,12 @@ case class ScaleArguments[T <: Data, U <: Data](scale_func: (T, U) => T, latency identity: String="0", c_str: String="ROUNDING_RIGHT_SHIFT(x, scale)") case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( + opcodes: OpcodeSet = OpcodeSet.custom3, + inputType: T, spatialArrayOutputType: T, accType: T, - opcodes: OpcodeSet = OpcodeSet.custom3, - dataflow: Dataflow.Value = Dataflow.BOTH, tileRows: Int = 1, @@ -44,6 +44,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( acc_singleported: Boolean = false, acc_sub_banks: Int = -1, acc_capacity: GemminiMemCapacity = CapacityInKilobytes(64), + acc_latency: Int = 2, dma_maxbytes: Int = 64, // TODO get this from cacheblockbytes dma_buswidth: Int = 128, // TODO get this from SystemBusKey @@ -85,6 +86,9 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( use_firesim_simulation_counters: Boolean = false, + use_shared_ext_mem: Boolean = false, + clock_gate: Boolean = false, + headerFileName: String = "gemmini_params.h" ) { val sp_width = meshColumns * tileColumns * inputType.getWidth @@ -261,7 +265,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( (dt.expWidth, dt.sigWidth) match { case (8, 24) => (scala.Float.MinValue.toString, scala.Float.MaxValue.toString) case (11, 53) => (scala.Double.MinValue.toString, scala.Double.MaxValue.toString) - case _ => (((Range(-1,-(dt.sigWidth),-1).map(-Math.pow(2, _)).foldLeft(-1.0)(_ + _)) * Math.pow(2, Math.pow(2, dt.expWidth - 1) - 1)).toString, ((Range(-1,-(dt.sigWidth),-1).map(Math.pow(2, _)).foldLeft(1.0)(_ + _)) * Math.pow(2, Math.pow(2, dt.expWidth - 1) - 1)).toString) + case (e, s) => (((Range(-1,-(s),-1).map(-Math.pow(2, _)).foldLeft(-1.0)(_ + _)) * Math.pow(2, Math.pow(2, e - 1) - 1)).toString, ((Range(-1,-(s),-1).map(Math.pow(2, _)).foldLeft(1.0)(_ + _)) * Math.pow(2, Math.pow(2, e - 1) - 1)).toString) } case dt => ("0", BigInt(2).pow(dt.getWidth).-(1).toString) // case _ => throw new IllegalArgumentException(s"Data type $dataType is unknown") @@ -275,7 +279,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( (dt.expWidth, dt.sigWidth) match { case (8, 24) => "float" case (11, 53) => "double" - case _ => s"uint" + (Math.pow(2, Math.ceil(Math.log(dt.expWidth + dt.sigWidth)/Math.log(2.0)))).toInt.toString + s"_t" + case (e, s) => s"uint" + (Math.pow(2, Math.ceil(Math.log(e + s)/Math.log(2.0)))).toInt.toString + s"_t" } case dt => s"uint${dt.getWidth}_t" } diff --git a/src/main/scala/gemmini/GemminiISA.scala b/src/main/scala/gemmini/GemminiISA.scala index 554bcdeb..c85b6816 100644 --- a/src/main/scala/gemmini/GemminiISA.scala +++ b/src/main/scala/gemmini/GemminiISA.scala @@ -32,6 +32,8 @@ object GemminiISA { val LOOP_CONV_WS_CONFIG_5 = 20.U // *weights | *output val LOOP_CONV_WS_CONFIG_6 = 21.U // *bias, *input + val CLKGATE_EN = 22.U + // rs1[2:0] values val CONFIG_EX = 0.U val CONFIG_LOAD = 1.U diff --git a/src/main/scala/gemmini/LocalAddr.scala b/src/main/scala/gemmini/LocalAddr.scala index b003fd7b..cce6bcae 100644 --- a/src/main/scala/gemmini/LocalAddr.scala +++ b/src/main/scala/gemmini/LocalAddr.scala @@ -81,3 +81,13 @@ class LocalAddr(sp_banks: Int, sp_bank_entries: Int, acc_banks: Int, acc_bank_en override def cloneType: LocalAddr.this.type = new LocalAddr(sp_banks, sp_bank_entries, acc_banks, acc_bank_entries).asInstanceOf[this.type] } + +object LocalAddr { + def cast_to_local_addr[T <: Data](local_addr_t: LocalAddr, t: T): LocalAddr = { + // This convenience function is basically the same as calling "asTypeOf(local_addr_t)". However, this convenience + // function will also cast unnecessary garbage bits to 0, which may help reduce multiplier/adder bitwidths + val result = WireInit(t.asTypeOf(local_addr_t)) + if (result.garbage_bit.getWidth > 0) result.garbage := 0.U + result + } +} diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala index 47cd5a39..1f27f3ff 100644 --- a/src/main/scala/gemmini/LoopConv.scala +++ b/src/main/scala/gemmini/LoopConv.scala @@ -6,6 +6,7 @@ import chisel3.experimental._ import freechips.rocketchip.tile.RoCCCommand import freechips.rocketchip.config.Parameters import GemminiISA._ +import LocalAddr.cast_to_local_addr import Util._ class LoopConvOuterBounds(val large_iterator_bitwidth: Int, val small_iterator_bitwidth: Int, val tiny_iterator_bitwidth: Int) extends Bundle { @@ -172,7 +173,7 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi mvin_cmd_rs2 := DontCare mvin_cmd_rs2.num_rows := o.I.asUInt() mvin_cmd_rs2.num_cols := o.J.asUInt() - mvin_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvin_cmd_rs2.local_addr) + mvin_cmd_rs2.local_addr := cast_to_local_addr(mvin_cmd_rs2.local_addr, o.spad_addr) io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt() } @@ -343,7 +344,7 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw mvin_cmd_rs2 := DontCare mvin_cmd_rs2.num_rows := (o.I >> req.downsample).asUInt() mvin_cmd_rs2.num_cols := o.K.asUInt() - mvin_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvin_cmd_rs2.local_addr) + mvin_cmd_rs2.local_addr := cast_to_local_addr(mvin_cmd_rs2.local_addr, o.spad_addr) io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt() } @@ -388,7 +389,7 @@ class LoopConvLdWeightReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth: val outer_bounds = new LoopConvOuterBounds(large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth) val inner_bounds = new LoopConvInnerBounds(large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth) val derived_params = new LoopConvDerivedParams(large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth) - val addr_end = UInt(log2Up(max_addr).W) + val addr_end = UInt(log2Up(max_addr+1).W) val dram_addr = UInt(coreMaxAddrBits.W) val trans_weight_1203 = Bool() val trans_weight_0132 = Bool() @@ -513,7 +514,7 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit mvin_cmd_rs2 := DontCare mvin_cmd_rs2.num_rows := o.K mvin_cmd_rs2.num_cols := o.J - mvin_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvin_cmd_rs2.local_addr) + mvin_cmd_rs2.local_addr := cast_to_local_addr(mvin_cmd_rs2.local_addr, o.spad_addr) io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt() } @@ -556,7 +557,7 @@ class LoopConvExecuteReq(val large_iterator_bitwidth: Int, val small_iterator_bi val inner_bounds = new LoopConvInnerBounds(large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth) val derived_params = new LoopConvDerivedParams(large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth) val a_addr_start = UInt(log2Up(max_addr).W) - val b_addr_end = UInt(log2Up(max_addr).W) + val b_addr_end = UInt(log2Up(max_addr+1).W) val c_addr_start = UInt(log2Up(max_acc_addr).W) val wrot180 = Bool() val downsample = Bool() @@ -719,13 +720,13 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera pre_cmd_rs1 := DontCare pre_cmd_rs1.num_rows := o.K.asUInt() pre_cmd_rs1.num_cols := o.J.asUInt() - pre_cmd_rs1.local_addr := o.pre_addr.asTypeOf(pre_cmd_rs1.local_addr) + pre_cmd_rs1.local_addr := cast_to_local_addr(pre_cmd_rs1.local_addr, o.pre_addr) val pre_cmd_rs2 = Wire(preload_rs2_t.cloneType) pre_cmd_rs2 := DontCare pre_cmd_rs2.num_rows := o.I.asUInt() pre_cmd_rs2.num_cols := o.J.asUInt() - pre_cmd_rs2.local_addr := o.c_addr.asTypeOf(pre_cmd_rs2.local_addr) + pre_cmd_rs2.local_addr := cast_to_local_addr(pre_cmd_rs2.local_addr, o.c_addr) io.cmd.bits.rs1 := pre_cmd_rs1.asUInt() io.cmd.bits.rs2 := pre_cmd_rs2.asUInt() @@ -735,13 +736,13 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera comp_cmd_rs1 := DontCare comp_cmd_rs1.num_rows := o.I.asUInt() comp_cmd_rs1.num_cols := o.K.asUInt() - comp_cmd_rs1.local_addr := o.a_addr.asTypeOf(comp_cmd_rs1.local_addr) + comp_cmd_rs1.local_addr := cast_to_local_addr(comp_cmd_rs1.local_addr, o.a_addr) val comp_cmd_rs2 = Wire(compute_rs2_t.cloneType) comp_cmd_rs2 := DontCare comp_cmd_rs2.num_rows := o.I.asUInt() comp_cmd_rs2.num_cols := o.J.asUInt() - comp_cmd_rs2.local_addr := GARBAGE_ADDR.asTypeOf(comp_cmd_rs2.local_addr) + comp_cmd_rs2.local_addr := cast_to_local_addr(comp_cmd_rs2.local_addr, GARBAGE_ADDR) io.cmd.bits.rs1 := comp_cmd_rs1.asUInt() io.cmd.bits.rs2 := comp_cmd_rs2.asUInt() @@ -967,7 +968,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: val pool_mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType) pool_mvout_cmd_rs2 := DontCare pool_mvout_cmd_rs2.num_cols := o.channels - pool_mvout_cmd_rs2.local_addr := o.pool_spad_addr.asTypeOf(pool_mvout_cmd_rs2.local_addr) + pool_mvout_cmd_rs2.local_addr := cast_to_local_addr(pool_mvout_cmd_rs2.local_addr, o.pool_spad_addr) io.cmd.bits.rs1 := o.pool_dram_addr io.cmd.bits.rs2 := pool_mvout_cmd_rs2.asUInt() @@ -976,7 +977,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: mvout_cmd_rs2 := DontCare mvout_cmd_rs2.num_rows := o.I.asUInt() mvout_cmd_rs2.num_cols := o.J.asUInt() - mvout_cmd_rs2.local_addr := o.spad_addr.asTypeOf(mvout_cmd_rs2.local_addr) + mvout_cmd_rs2.local_addr := cast_to_local_addr(mvout_cmd_rs2.local_addr, o.spad_addr) io.cmd.bits.rs1 := o.dram_addr io.cmd.bits.rs2 := mvout_cmd_rs2.asUInt() @@ -1067,7 +1068,7 @@ class LoopConvState(val block_size: Int, val large_iterator_bitwidth: Int, val s def all_completed(dummy: Int=0): Bool = ld_bias_completed && ld_input_completed && ld_weights_completed && ex_completed && st_completed val a_addr_start = UInt(log2Up(max_addr).W) - val b_addr_end = UInt(log2Up(max_addr).W) + val b_addr_end = UInt(log2Up(max_addr+1).W) def derived_params(dummy: Int=0): LoopConvDerivedParams = { import outer_bounds.{stride, kernel_dilation} @@ -1453,7 +1454,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I loops.zipWithIndex.foreach { case (l, i) => l.reset() l.a_addr_start := (i * (max_addr / concurrent_loops)).U - l.b_addr_end := ((i+1) * (max_addr / concurrent_loops) - block_size).U + l.b_addr_end := ((i+1) * (max_addr / concurrent_loops)).U } } } diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala index ea1c3ed6..791b43d5 100644 --- a/src/main/scala/gemmini/LoopMatmul.scala +++ b/src/main/scala/gemmini/LoopMatmul.scala @@ -6,6 +6,7 @@ import chisel3.experimental._ import freechips.rocketchip.tile.RoCCCommand import freechips.rocketchip.config.Parameters import GemminiISA._ +import LocalAddr.cast_to_local_addr import Util._ // LdA @@ -75,7 +76,7 @@ class LoopMatmulLdA(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In mvin_cmd_rs2 := DontCare mvin_cmd_rs2.num_rows := rows.asUInt() mvin_cmd_rs2.num_cols := cols.asUInt() - mvin_cmd_rs2.local_addr := sp_addr.asTypeOf(mvin_cmd_rs2.local_addr) + mvin_cmd_rs2.local_addr := cast_to_local_addr(mvin_cmd_rs2.local_addr, sp_addr) mvin_cmd.rs2 := mvin_cmd_rs2.asUInt() io.req.ready := state === idle @@ -122,7 +123,7 @@ class LoopMatmulLdBReq(val block_size: Int, val coreMaxAddrBits: Int, val iterat val dram_addr = UInt(coreMaxAddrBits.W) val dram_stride = UInt(coreMaxAddrBits.W) val transpose = Bool() - val addr_end = UInt(log2Up(max_addr).W) + val addr_end = UInt(log2Up(max_addr+1).W) val loop_id = UInt(log2Up(concurrent_loops).W) } @@ -182,7 +183,7 @@ class LoopMatmulLdB(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In mvin_cmd_rs2 := DontCare mvin_cmd_rs2.num_rows := rows.asUInt() mvin_cmd_rs2.num_cols := cols.asUInt() - mvin_cmd_rs2.local_addr := sp_addr.asTypeOf(mvin_cmd_rs2.local_addr) + mvin_cmd_rs2.local_addr := cast_to_local_addr(mvin_cmd_rs2.local_addr, sp_addr) mvin_cmd.rs2 := mvin_cmd_rs2.asUInt() io.req.ready := state === idle @@ -278,7 +279,7 @@ class LoopMatmulLdD(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In mvin_cmd_rs2 := DontCare mvin_cmd_rs2.num_rows := rows.asUInt() mvin_cmd_rs2.num_cols := cols.asUInt() - mvin_cmd_rs2.local_addr := sp_addr.asTypeOf(mvin_cmd_rs2.local_addr) + mvin_cmd_rs2.local_addr := cast_to_local_addr(mvin_cmd_rs2.local_addr, sp_addr) mvin_cmd.rs2 := mvin_cmd_rs2.asUInt() io.req.ready := state === idle @@ -325,7 +326,7 @@ class LoopMatmulExecuteReq(val block_size: Int, val coreMaxAddrBits: Int, val it val b_tranpose = Bool() val accumulate = Bool() val a_addr_start = UInt(log2Up(max_addr).W) - val b_addr_end = UInt(log2Up(max_addr).W) + val b_addr_end = UInt(log2Up(max_addr+1).W) val c_addr_start = UInt(log2Up(max_acc_addr).W) val loop_id = UInt(log2Up(concurrent_loops).W) } @@ -405,13 +406,13 @@ class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth pre_cmd_rs1 := DontCare pre_cmd_rs1.num_rows := b_rows.asUInt() pre_cmd_rs1.num_cols := b_cols.asUInt() - pre_cmd_rs1.local_addr := pre_addr.asTypeOf(pre_cmd_rs1.local_addr) + pre_cmd_rs1.local_addr := cast_to_local_addr(pre_cmd_rs1.local_addr, pre_addr) val pre_cmd_rs2 = Wire(preload_rs2_t.cloneType) pre_cmd_rs2 := DontCare pre_cmd_rs2.num_rows := c_rows.asUInt() pre_cmd_rs2.num_cols := c_cols.asUInt() - pre_cmd_rs2.local_addr := out_addr.asTypeOf(pre_cmd_rs2.local_addr) + pre_cmd_rs2.local_addr := cast_to_local_addr(pre_cmd_rs2.local_addr, out_addr) pre_cmd.rs1 := pre_cmd_rs1.asUInt() pre_cmd.rs2 := pre_cmd_rs2.asUInt() @@ -424,13 +425,13 @@ class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth comp_cmd_rs1 := DontCare comp_cmd_rs1.num_rows := a_rows.asUInt() comp_cmd_rs1.num_cols := a_cols.asUInt() - comp_cmd_rs1.local_addr := a_addr.asTypeOf(comp_cmd_rs1.local_addr) + comp_cmd_rs1.local_addr := cast_to_local_addr(comp_cmd_rs1.local_addr, a_addr) val comp_cmd_rs2 = Wire(compute_rs2_t.cloneType) comp_cmd_rs2 := DontCare comp_cmd_rs2.num_rows := block_size.U comp_cmd_rs2.num_cols := block_size.U - comp_cmd_rs2.local_addr := GARBAGE_ADDR.asTypeOf(comp_cmd_rs2.local_addr) + comp_cmd_rs2.local_addr := cast_to_local_addr(comp_cmd_rs2.local_addr, GARBAGE_ADDR) comp_cmd.rs1 := comp_cmd_rs1.asUInt() comp_cmd.rs2 := comp_cmd_rs2.asUInt() @@ -545,7 +546,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In mvout_cmd_rs2 := DontCare mvout_cmd_rs2.num_rows := rows.asUInt() mvout_cmd_rs2.num_cols := cols.asUInt() - mvout_cmd_rs2.local_addr := sp_addr.asTypeOf(mvout_cmd_rs2.local_addr) + mvout_cmd_rs2.local_addr := cast_to_local_addr(mvout_cmd_rs2.local_addr, sp_addr) mvout_cmd.rs2 := mvout_cmd_rs2.asUInt() io.req.ready := state === idle @@ -636,7 +637,7 @@ class LoopMatmulState(val iterator_bitwidth: Int, val coreMaxAddrBits: Int, val def all_completed(dummy: Int=0): Bool = lda_completed && ldb_completed && ldd_completed && ex_completed && st_completed val a_addr_start = UInt(log2Up(max_addr).W) - val b_addr_end = UInt(log2Up(max_addr).W) + val b_addr_end = UInt(log2Up(max_addr+1).W) def reset(): Unit = { configured := false.B @@ -958,7 +959,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: loops.zipWithIndex.foreach { case (l, i) => l.reset() l.a_addr_start := (i * (max_addr / concurrent_loops)).U - l.b_addr_end := ((i+1) * (max_addr / concurrent_loops) - block_size).U + l.b_addr_end := ((i+1) * (max_addr / concurrent_loops)).U } } } diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala index e3289b7f..0d76758d 100644 --- a/src/main/scala/gemmini/Scratchpad.scala +++ b/src/main/scala/gemmini/Scratchpad.scala @@ -6,12 +6,11 @@ import freechips.rocketchip.config.Parameters import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp} import freechips.rocketchip.rocket._ import freechips.rocketchip.tile._ -import freechips.rocketchip.tilelink.{TLIdentityNode, TLXbar} +import freechips.rocketchip.tilelink.{TLIdentityNode, TLXbar, TLBuffer} import Util._ -class ScratchpadMemReadRequest[U <: Data](local_addr_t: LocalAddr, scale_t_bits: Int) - (implicit p: Parameters) extends CoreBundle { +class ScratchpadMemReadRequest[U <: Data](local_addr_t: LocalAddr, scale_t_bits: Int)(implicit p: Parameters) extends CoreBundle { val vaddr = UInt(coreMaxAddrBits.W) val laddr = local_addr_t.cloneType @@ -57,15 +56,13 @@ class ScratchpadMemReadResponse extends Bundle { val cmd_id = UInt(8.W) // TODO don't use a magic number here } -class ScratchpadReadMemIO[U <: Data](local_addr_t: LocalAddr, scale_t_bits: Int) - (implicit p: Parameters) extends CoreBundle { +class ScratchpadReadMemIO[U <: Data](local_addr_t: LocalAddr, scale_t_bits: Int)(implicit p: Parameters) extends CoreBundle { val req = Decoupled(new ScratchpadMemReadRequest(local_addr_t, scale_t_bits)) val resp = Flipped(Valid(new ScratchpadMemReadResponse)) override def cloneType: this.type = new ScratchpadReadMemIO(local_addr_t, scale_t_bits).asInstanceOf[this.type] } -// class ScratchpadWriteMemIO(val nBanks: Int, val nRows: Int, val acc_rows: Int) class ScratchpadWriteMemIO(local_addr_t: LocalAddr, scale_t_bits: Int) (implicit p: Parameters) extends CoreBundle { val req = Decoupled(new ScratchpadMemWriteRequest(local_addr_t, scale_t_bits)) @@ -96,7 +93,7 @@ class ScratchpadWriteIO(val n: Int, val w: Int, val mask_len: Int) extends Bundl val data = Output(UInt(w.W)) } -class ScratchpadBank(n: Int, w: Int, aligned_to: Int, single_ported: Boolean) extends Module { +class ScratchpadBank(n: Int, w: Int, aligned_to: Int, single_ported: Boolean, use_shared_ext_mem: Boolean) extends Module { // This is essentially a pipelined SRAM with the ability to stall pipeline stages require(w % aligned_to == 0 || w < aligned_to) @@ -106,27 +103,50 @@ class ScratchpadBank(n: Int, w: Int, aligned_to: Int, single_ported: Boolean) ex val io = IO(new Bundle { val read = Flipped(new ScratchpadReadIO(n, w)) val write = Flipped(new ScratchpadWriteIO(n, w, mask_len)) + val ext_mem = if (use_shared_ext_mem) Some(new ExtMemIO) else None }) - val mem = SyncReadMem(n, Vec(mask_len, mask_elem)) + val (read, write) = if (use_shared_ext_mem) { + def read(addr: UInt, ren: Bool): Data = { + io.ext_mem.get.read_en := ren + io.ext_mem.get.read_addr := addr + io.ext_mem.get.read_data + } + io.ext_mem.get.write_en := false.B + io.ext_mem.get.write_addr := DontCare + io.ext_mem.get.write_data := DontCare + io.ext_mem.get.write_mask := DontCare + def write(addr: UInt, wdata: Vec[UInt], wmask: Vec[Bool]) = { + io.ext_mem.get.write_en := true.B + io.ext_mem.get.write_addr := addr + io.ext_mem.get.write_data := wdata.asUInt + io.ext_mem.get.write_mask := wmask.asUInt + } + (read _, write _) + } else { + val mem = SyncReadMem(n, Vec(mask_len, mask_elem)) + def read(addr: UInt, ren: Bool): Data = mem.read(addr, ren) + def write(addr: UInt, wdata: Vec[UInt], wmask: Vec[Bool]) = mem.write(addr, wdata, wmask) + (read _, write _) + } // When the scratchpad is single-ported, the writes take precedence val singleport_busy_with_write = single_ported.B && io.write.en when (io.write.en) { if (aligned_to >= w) - mem.write(io.write.addr, io.write.data.asTypeOf(Vec(mask_len, mask_elem))) + write(io.write.addr, io.write.data.asTypeOf(Vec(mask_len, mask_elem)), VecInit((~(0.U(mask_len.W))).asBools)) else - mem.write(io.write.addr, io.write.data.asTypeOf(Vec(mask_len, mask_elem)), io.write.mask) + write(io.write.addr, io.write.data.asTypeOf(Vec(mask_len, mask_elem)), io.write.mask) } val raddr = io.read.req.bits.addr val ren = io.read.req.fire() val rdata = if (single_ported) { assert(!(ren && io.write.en)) - mem.read(raddr, ren && !io.write.en).asUInt() + read(raddr, ren && !io.write.en).asUInt() } else { - mem.read(raddr, ren).asUInt() + read(raddr, ren).asUInt() } val fromDMA = io.read.req.bits.fromDMA @@ -143,6 +163,7 @@ class ScratchpadBank(n: Int, w: Int, aligned_to: Int, single_ported: Boolean) ex io.read.resp <> q.io.deq } + class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V]) (implicit p: Parameters, ev: Arithmetic[T]) extends LazyModule { @@ -171,9 +192,9 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, // id_node :=* reader.node // id_node :=* writer.node - xbar_node := reader.node // TODO - xbar_node := writer.node - id_node := xbar_node + xbar_node := TLBuffer() := reader.node // TODO + xbar_node := TLBuffer() := writer.node + id_node := TLBuffer() := xbar_node lazy val module = new LazyModuleImp(this) with HasCoreParameters { @@ -204,6 +225,12 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, )))) } + val ext_mem = if (use_shared_ext_mem) { + Some(new ExtSpadMemIO(sp_banks, acc_banks, acc_sub_banks)) + } else { + None + } + // TLB ports val tlb = Vec(2, new FrontendTLBIO) @@ -368,12 +395,19 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, io.busy := writer.module.io.busy || reader.module.io.busy || write_issue_q.io.deq.valid || write_scale_q.io.deq.valid || write_dispatch_q.valid - { - val banks = Seq.fill(sp_banks) { Module(new ScratchpadBank(sp_bank_entries, spad_w, aligned_to, config.sp_singleported)) } + val spad_mems = { + val banks = Seq.fill(sp_banks) { Module(new ScratchpadBank( + sp_bank_entries, spad_w, + aligned_to, config.sp_singleported, + use_shared_ext_mem + )) } val bank_ios = VecInit(banks.map(_.io)) - // Reading from the SRAM banks bank_ios.zipWithIndex.foreach { case (bio, i) => + if (use_shared_ext_mem) { + io.ext_mem.get.spad(i) <> bio.ext_mem.get + } + val ex_read_req = io.srams.read(i).req val exread = ex_read_req.valid @@ -414,7 +448,6 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, val dma_read_pipe = Pipeline(dma_read_resp, spad_read_delay) val ex_read_pipe = Pipeline(ex_read_resp, spad_read_delay) - bio.read.resp.ready := Mux(bio.read.resp.bits.fromDMA, dma_read_resp.ready, ex_read_resp.ready) dma_read_pipe.ready := writer.module.io.req.ready && @@ -472,6 +505,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, bio.write.mask := DontCare } } + banks } val acc_row_t = Vec(meshColumns, Vec(tileColumns, accType)) @@ -513,11 +547,14 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, } } - { + val acc_adders = Module(new AccPipeShared(acc_latency-1, acc_row_t, acc_banks)) + val acc_mems = { val banks = Seq.fill(acc_banks) { Module(new AccumulatorMem( acc_bank_entries, acc_row_t, acc_scale_func, acc_scale_t.asInstanceOf[V], - acc_singleported, acc_sub_banks + acc_singleported, acc_sub_banks, + use_shared_ext_mem, + acc_latency, accType, )) } val bank_ios = VecInit(banks.map(_.io)) @@ -526,6 +563,15 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, // Reading from the Accumulator banks bank_ios.zipWithIndex.foreach { case (bio, i) => + if (use_shared_ext_mem) { + io.ext_mem.get.acc(i) <> bio.ext_mem.get + } + + acc_adders.io.in_sel(i) := bio.adder.valid + acc_adders.io.ina(i) := bio.adder.op1 + acc_adders.io.inb(i) := bio.adder.op2 + bio.adder.sum := acc_adders.io.out + val ex_read_req = io.acc.read_req(i) val exread = ex_read_req.valid @@ -677,6 +723,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, bio.write.bits.mask := DontCare } } + banks } // Counter connection diff --git a/src/main/scala/gemmini/SharedExtMem.scala b/src/main/scala/gemmini/SharedExtMem.scala new file mode 100644 index 00000000..9d0e1802 --- /dev/null +++ b/src/main/scala/gemmini/SharedExtMem.scala @@ -0,0 +1,80 @@ +package gemmini + +import chisel3._ +import chisel3.util._ + +import Util._ + + +class ExtMemIO extends Bundle { + val read_en = Output(Bool()) + val read_addr = Output(UInt()) + val read_data = Input(UInt()) + + val write_en = Output(Bool()) + val write_addr = Output(UInt()) + val write_data = Output(UInt()) + val write_mask = Output(UInt()) +} + +class ExtSpadMemIO(sp_banks: Int, acc_banks: Int, acc_sub_banks: Int) extends Bundle { + val spad = Vec(sp_banks, new ExtMemIO) + val acc = Vec(acc_banks, Vec(acc_sub_banks, new ExtMemIO)) + override def cloneType: this.type = new ExtSpadMemIO(sp_banks, acc_banks, acc_sub_banks).asInstanceOf[this.type] +} + + +class SharedSyncReadMem(nSharers: Int, depth: Int, mask_len: Int, data_len: Int) extends Module { + val io = IO(new Bundle { + val in = Vec(nSharers, Flipped(new ExtMemIO())) + }) + val mem = SyncReadMem(depth, Vec(mask_len, UInt(data_len.W))) + val wens = io.in.map(_.write_en) + val wen = wens.reduce(_||_) + val waddr = Mux1H(wens, io.in.map(_.write_addr)) + val wmask = Mux1H(wens, io.in.map(_.write_mask)) + val wdata = Mux1H(wens, io.in.map(_.write_data)) + assert(PopCount(wens) <= 1.U) + val rens = io.in.map(_.read_en) + assert(PopCount(rens) <= 1.U) + val ren = rens.reduce(_||_) + val raddr = Mux1H(rens, io.in.map(_.read_addr)) + val rdata = mem.read(raddr, ren && !wen) + io.in.foreach(_.read_data := rdata.asUInt) + when (wen) { + mem.write(waddr, wdata.asTypeOf(Vec(mask_len, UInt(data_len.W))), wmask.asTypeOf(Vec(mask_len, Bool()))) + } + +} + +class SharedExtMem( + sp_banks: Int, acc_banks: Int, acc_sub_banks: Int, + sp_depth: Int, sp_mask_len: Int, sp_data_len: Int, + acc_depth: Int, acc_mask_len: Int, acc_data_len: Int +) extends Module { + val nSharers = 2 + val io = IO(new Bundle { + val in = Vec(nSharers, Flipped(new ExtSpadMemIO(sp_banks, acc_banks, acc_sub_banks))) + }) + for (i <- 0 until sp_banks) { + val spad_mem = Module(new SharedSyncReadMem(nSharers, sp_depth, sp_mask_len, sp_data_len)) + for (w <- 0 until nSharers) { + spad_mem.io.in(w) <> io.in(w).spad(i) + } + } + for (i <- 0 until acc_banks) { + for (s <- 0 until acc_sub_banks) { + val acc_mem = Module(new SharedSyncReadMem(nSharers, acc_depth, acc_mask_len, acc_data_len)) + + acc_mem.io.in(0) <> io.in(0).acc(i)(s) + // The FP gemmini expects a taller, skinnier accumulator mem + acc_mem.io.in(1) <> io.in(1).acc(i)(s) + acc_mem.io.in(1).read_addr := io.in(1).acc(i)(s).read_addr >> 1 + io.in(1).acc(i)(s).read_data := acc_mem.io.in(1).read_data.asTypeOf(Vec(2, UInt((acc_data_len * acc_mask_len / 2).W)))(RegNext(io.in(1).acc(i)(s).read_addr(0))) + + acc_mem.io.in(1).write_addr := io.in(1).acc(i)(s).write_addr >> 1 + acc_mem.io.in(1).write_data := Cat(io.in(1).acc(i)(s).write_data, io.in(1).acc(i)(s).write_data) + acc_mem.io.in(1).write_mask := Mux(io.in(1).acc(i)(s).write_addr(0), io.in(1).acc(i)(s).write_mask << (acc_mask_len / 2), io.in(1).acc(i)(s).write_mask) + } + } +} diff --git a/src/main/scala/gemmini/VectorScalarMultiplier.scala b/src/main/scala/gemmini/VectorScalarMultiplier.scala index d1cefcb3..05480e09 100644 --- a/src/main/scala/gemmini/VectorScalarMultiplier.scala +++ b/src/main/scala/gemmini/VectorScalarMultiplier.scala @@ -120,7 +120,7 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data]( head_oh := (head_oh << 1) | head_oh(nEntries-1) } in_fire := (in.valid && - (!Mux1H(tail_oh.asBools, regs.map(_.valid)) || (tail_oh === head_oh && io.resp.fire())) + (!Mux1H(tail_oh.asBools, regs.map(_.valid))) ) when (in_fire) { for (i <- 0 until nEntries) { @@ -193,7 +193,7 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data]( } - + } From f0419e7f308884570a61973ea7b483ef5a306bcf Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Thu, 2 Dec 2021 16:39:08 -0800 Subject: [PATCH 07/11] Add option to pick between mac chains and mac trees (#167) Also, make it easier to pipeline reduction trees by pipelining tiles rather than PEs --- src/main/scala/gemmini/Configs.scala | 4 ++- src/main/scala/gemmini/ConfigsFP.scala | 10 +++--- src/main/scala/gemmini/DSEConfigs.scala | 3 +- .../scala/gemmini/ExecuteController.scala | 6 ++-- src/main/scala/gemmini/GemminiConfigs.scala | 7 ++-- src/main/scala/gemmini/Mesh.scala | 33 ++++++++++++------- src/main/scala/gemmini/MeshWithDelays.scala | 29 ++++++++-------- src/main/scala/gemmini/PE.scala | 20 +++++------ src/main/scala/gemmini/Tile.scala | 21 +++++++++--- src/main/scala/gemmini/Util.scala | 16 +++++++++ 10 files changed, 96 insertions(+), 53 deletions(-) diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala index 7ceffcfe..a4094d88 100644 --- a/src/main/scala/gemmini/Configs.scala +++ b/src/main/scala/gemmini/Configs.scala @@ -174,6 +174,7 @@ object GemminiConfigs { ) val largeChipConfig = chipConfig.copy(sp_capacity=CapacityInKilobytes(128), acc_capacity=CapacityInKilobytes(64), + tileRows=1, tileColumns=1, meshRows=32, meshColumns=32 ) @@ -219,6 +220,7 @@ class DualGemminiConfig extends Config((site, here, up) => { fp_gemmini = LazyModule(new Gemmini(GemminiFPConfigs.BF16DefaultConfig.copy( opcodes = OpcodeSet.custom2, sp_capacity=CapacityInKilobytes(64), acc_capacity=CapacityInKilobytes(32), + tileColumns = 1, tileRows = 1, meshColumns = 8, meshRows = 8, acc_singleported = true, acc_banks = 2, acc_sub_banks = 2, use_shared_ext_mem = true, @@ -226,7 +228,7 @@ class DualGemminiConfig extends Config((site, here, up) => { ex_write_to_spad=false, hardcode_d_to_garbage_addr = true, headerFileName = "gemmini_params_bf16.h", - acc_latency = 3, + acc_latency = 3, dataflow = Dataflow.WS, mesh_output_delay = 3, clock_gate = true diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala index 91a4dbd2..35ecf821 100644 --- a/src/main/scala/gemmini/ConfigsFP.scala +++ b/src/main/scala/gemmini/ConfigsFP.scala @@ -61,7 +61,7 @@ object GemminiFPConfigs { acc_read_full_width = true, acc_read_small_width = true, - pe_latency = 1, + tile_latency = 1, ex_read_from_spad = true, ex_read_from_acc = true, @@ -81,21 +81,21 @@ object GemminiFPConfigs { //FP32 Single Precision Configuration val FP32DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 24), spatialArrayOutputType = Float(8, 24), accType = Float(8, 24), - pe_latency = 2, + tile_latency = 2, mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), ) //FP16 Half Precision Configuration val FP16DefaultConfig = defaultFPConfig.copy(inputType = Float(5, 11), spatialArrayOutputType = Float(5, 11), accType = Float(8, 24), - pe_latency = 2, + tile_latency = 2, mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(5, 11), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(5, 11), -1, identity = "1.0", c_str="((x) * (scale))")), ) //Bfloat16 Brain-half Precision Configuration val BF16DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 8), spatialArrayOutputType = Float(8, 8), accType = Float(8, 24), - pe_latency = 2, + tile_latency = 2, mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), ) @@ -103,7 +103,7 @@ object GemminiFPConfigs { //Bfloat16 Brain-half Precision Configuration 8x8 array val BF16Default8Config = defaultFPConfig.copy(inputType = Float(8, 8), spatialArrayOutputType = Float(8, 8), accType = Float(8, 24), meshRows = 8, meshColumns = 8, - pe_latency = 2, + tile_latency = 2, mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), ) diff --git a/src/main/scala/gemmini/DSEConfigs.scala b/src/main/scala/gemmini/DSEConfigs.scala index 0d4681b5..f00297e3 100644 --- a/src/main/scala/gemmini/DSEConfigs.scala +++ b/src/main/scala/gemmini/DSEConfigs.scala @@ -59,8 +59,9 @@ object DSEBaseConfig { acc_read_full_width = true, acc_read_small_width = true, use_dedicated_tl_port = false, + use_shared_ext_mem = true, - pe_latency = 0, + tile_latency = 0, ex_read_from_spad = true, ex_read_from_acc = true, diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala index 9d1cf094..6891c09b 100644 --- a/src/main/scala/gemmini/ExecuteController.scala +++ b/src/main/scala/gemmini/ExecuteController.scala @@ -187,7 +187,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In val cntl = mesh_cntl_signals_q.io.deq.bits // Instantiate the actual mesh - val mesh = Module(new MeshWithDelays(inputType, spatialArrayOutputType, accType, mesh_tag, dataflow, pe_latency, mesh_output_delay, + val mesh = Module(new MeshWithDelays(inputType, spatialArrayOutputType, accType, mesh_tag, dataflow, tree_reduction, tile_latency, mesh_output_delay, tileRows, tileColumns, meshRows, meshColumns, shifter_banks, shifter_banks)) mesh.io.a.valid := false.B @@ -891,12 +891,12 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In when (cntl_valid && cntl.perform_single_preload) { mesh.io.a.bits := Mux(a_should_be_fed_into_transposer, dataA.asUInt, 0.U).asTypeOf(Vec(meshRows, Vec(tileRows, inputType))) - mesh.io.b.bits := Mux(b_should_be_fed_into_transposer, dataB.asUInt, 0.U).asTypeOf(Vec(meshRows, Vec(tileRows, inputType))) + mesh.io.b.bits := Mux(b_should_be_fed_into_transposer, dataB.asUInt, 0.U).asTypeOf(Vec(meshColumns, Vec(tileColumns, inputType))) } when (cntl_valid && cntl.perform_single_mul) { mesh.io.a.bits := Mux(a_should_be_fed_into_transposer, 0.U, dataA.asUInt).asTypeOf(Vec(meshRows, Vec(tileRows, inputType))) - mesh.io.b.bits := Mux(b_should_be_fed_into_transposer, 0.U, dataB.asUInt).asTypeOf(Vec(meshRows, Vec(tileRows, inputType))) + mesh.io.b.bits := Mux(b_should_be_fed_into_transposer, 0.U, dataB.asUInt).asTypeOf(Vec(meshColumns, Vec(tileColumns, inputType))) mesh.io.req.bits.tag.addr.make_this_garbage() } diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala index beb46c71..45b6a778 100644 --- a/src/main/scala/gemmini/GemminiConfigs.scala +++ b/src/main/scala/gemmini/GemminiConfigs.scala @@ -58,8 +58,6 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( mvin_scale_shared: Boolean = false, acc_scale_args: Option[ScaleArguments[T, V]] = None, - pe_latency: Int = 0, - acc_read_full_width: Boolean = true, acc_read_small_width: Boolean = true, use_dedicated_tl_port: Boolean = true, @@ -76,8 +74,11 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( hardcode_d_to_garbage_addr: Boolean = false, use_shared_tlb: Boolean = true, + tile_latency: Int = 0, mesh_output_delay: Int = 1, + use_tree_reduction_if_possible: Boolean = true, + num_counter: Int = 8, has_training_convs: Boolean = true, @@ -162,6 +163,8 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( val hasIm2Col = false + val tree_reduction = use_tree_reduction_if_possible && dataflow == Dataflow.WS && tileRows > 1 + //========================================================================== // sanity check mesh size //========================================================================== diff --git a/src/main/scala/gemmini/Mesh.scala b/src/main/scala/gemmini/Mesh.scala index 5bb924c5..cd056658 100644 --- a/src/main/scala/gemmini/Mesh.scala +++ b/src/main/scala/gemmini/Mesh.scala @@ -15,7 +15,7 @@ import chisel3.experimental._ * @param meshColumns */ class Mesh[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T, - df: Dataflow.Value, pe_latency: Int, + df: Dataflow.Value, tree_reduction: Boolean, tile_latency: Int, max_simultaneous_matmuls: Int, output_delay: Int, val tileRows: Int, val tileColumns: Int, val meshRows: Int, val meshColumns: Int) extends Module { @@ -34,43 +34,54 @@ class Mesh[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T, val out_id = Output(Vec(meshColumns, Vec(tileColumns, UInt(log2Up(max_simultaneous_matmuls).W)))) val out_last = Output(Vec(meshColumns, Vec(tileColumns, Bool()))) }) + // mesh(r)(c) => Tile at row r, column c - val mesh: Seq[Seq[Tile[T]]] = Seq.fill(meshRows, meshColumns)(Module(new Tile(inputType, outputType, accType, df, pe_latency, max_simultaneous_matmuls, tileRows, tileColumns))) + val mesh: Seq[Seq[Tile[T]]] = Seq.fill(meshRows, meshColumns)(Module(new Tile(inputType, outputType, accType, df, tree_reduction, max_simultaneous_matmuls, tileRows, tileColumns))) val meshT = mesh.transpose + + def pipe[T <: Data](valid: Bool, t: T, latency: Int): T = { + // The default "Pipe" function apparently resets the valid signals to false.B. We would like to avoid using global + // signals in the Mesh, so over here, we make it clear that the reset signal will never be asserted + chisel3.withReset(false.B) { Pipe(valid, t, latency).bits } + } + // Chain tile_a_out -> tile_a_in (pipeline a across each row) // TODO clock-gate A signals with in_garbage for (r <- 0 until meshRows) { mesh(r).foldLeft(io.in_a(r)) { case (in_a, tile) => - tile.io.in_a := RegNext(in_a) + tile.io.in_a := ShiftRegister(in_a, tile_latency+1) tile.io.out_a } } + // Chain tile_out_b -> tile_b_in (pipeline b across each column) for (c <- 0 until meshColumns) { meshT(c).foldLeft((io.in_b(c), io.in_valid(c))) { case ((in_b, valid), tile) => - tile.io.in_b := RegEnable(in_b, valid.head) + tile.io.in_b := pipe(valid.head, in_b, tile_latency+1) (tile.io.out_b, tile.io.out_valid) } } + // Chain tile_out -> tile_propag (pipeline output across each column) for (c <- 0 until meshColumns) { meshT(c).foldLeft((io.in_d(c), io.in_valid(c))) { case ((in_propag, valid), tile) => - tile.io.in_d := RegEnable(in_propag, valid.head) + tile.io.in_d := pipe(valid.head, in_propag, tile_latency+1) (tile.io.out_c, tile.io.out_valid) } } + // Chain control signals (pipeline across each column) assert(!(mesh.map(_.map(_.io.bad_dataflow).reduce(_||_)).reduce(_||_))) for (c <- 0 until meshColumns) { meshT(c).foldLeft((io.in_control(c), io.in_valid(c))) { case ((in_ctrl, valid), tile) => (tile.io.in_control, in_ctrl, valid).zipped.foreach { case (tile_ctrl, ctrl, v) => - tile_ctrl.shift := RegEnable(ctrl.shift, v) - tile_ctrl.dataflow := RegEnable(ctrl.dataflow, v) - tile_ctrl.propagate := RegEnable(ctrl.propagate, v) + tile_ctrl.shift := pipe(v, ctrl.shift, tile_latency+1) + tile_ctrl.dataflow := pipe(v, ctrl.dataflow, tile_latency+1) + tile_ctrl.propagate := pipe(v, ctrl.propagate, tile_latency+1) } (tile.io.out_control, tile.io.out_valid) } @@ -80,7 +91,7 @@ class Mesh[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T, for (c <- 0 until meshColumns) { meshT(c).foldLeft(io.in_valid(c)) { case (in_v, tile) => - tile.io.in_valid := RegNext(in_v) + tile.io.in_valid := ShiftRegister(in_v, tile_latency+1) tile.io.out_valid } } @@ -89,7 +100,7 @@ class Mesh[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T, for (c <- 0 until meshColumns) { meshT(c).foldLeft(io.in_id(c)) { case (in_id, tile) => - tile.io.in_id := RegNext(in_id) + tile.io.in_id := ShiftRegister(in_id, tile_latency+1) tile.io.out_id } } @@ -98,7 +109,7 @@ class Mesh[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T, for (c <- 0 until meshColumns) { meshT(c).foldLeft(io.in_last(c)) { case (in_last, tile) => - tile.io.in_last := RegNext(in_last) + tile.io.in_last := ShiftRegister(in_last, tile_latency+1) tile.io.out_last } } diff --git a/src/main/scala/gemmini/MeshWithDelays.scala b/src/main/scala/gemmini/MeshWithDelays.scala index acab135d..db40debf 100644 --- a/src/main/scala/gemmini/MeshWithDelays.scala +++ b/src/main/scala/gemmini/MeshWithDelays.scala @@ -33,7 +33,7 @@ class MeshWithDelaysResp[T <: Data: Arithmetic, TagT <: TagQueueTag with Data](o class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data] (inputType: T, val outputType: T, accType: T, - tagType: U, df: Dataflow.Value, pe_latency: Int, output_delay: Int, + tagType: U, df: Dataflow.Value, tree_reduction: Boolean, tile_latency: Int, output_delay: Int, tileRows: Int, tileColumns: Int, meshRows: Int, meshColumns: Int, leftBanks: Int, upBanks: Int, outBanks: Int = 1, n_simultaneous_matmuls: Int = -1) extends Module { @@ -47,12 +47,13 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data] assert(meshRows*tileRows == meshColumns*tileColumns) val block_size = meshRows*tileRows + val latency_per_pe = (tile_latency + 1).toFloat / (tileRows min tileColumns) val max_simultaneous_matmuls = if (n_simultaneous_matmuls == -1) { - 5 * (pe_latency + 1) + (5 * latency_per_pe).ceil.toInt } else { n_simultaneous_matmuls } - assert(max_simultaneous_matmuls >= 5 * (pe_latency + 1)) + assert(max_simultaneous_matmuls >= 5 * latency_per_pe) val tagqlen = max_simultaneous_matmuls+1 @@ -70,7 +71,6 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data] def shifted[T <: Data](x: Vec[Vec[T]], banks: Int, reverse: Boolean = false) = { assert(x.size % banks == 0, "cannot bank without clean divisors") - assert(pe_latency == 0 || (tileRows == 1 && tileColumns == 1), "If tiles are larger than 1x1, then PEs must have 0 latency") val banked_len = x.size / banks val banked_x = x.grouped(banked_len).toSeq @@ -79,13 +79,13 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data] (banked_x zip indexes).flatMap { case (bx, i) => val bxVec = VecInit(bx) - val sram_shift = i * banked_len * (pe_latency+1) + val sram_shift = i * banked_len * (tile_latency+1) val SRAMShifted = Shifter(bxVec, sram_shift, true.B, true) val indexes = if (reverse) SRAMShifted.indices.reverse else SRAMShifted.indices val RegShifted = (SRAMShifted zip indexes).map { case (srs, j) => - ShiftRegister(srs, j*(pe_latency+1)) + ShiftRegister(srs, j*(tile_latency+1)) } RegShifted @@ -166,25 +166,25 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data] val transposer_out = VecInit(transposer.io.outCol.bits.grouped(tileRows).map(t => VecInit(t)).toSeq) // Wire up mesh's IO to this module's IO - val mesh = Module(new Mesh(inputType, outputType, accType, df, pe_latency, max_simultaneous_matmuls, output_delay, tileRows, tileColumns, meshRows, meshColumns)) + val mesh = Module(new Mesh(inputType, outputType, accType, df, tree_reduction, tile_latency, max_simultaneous_matmuls, output_delay, tileRows, tileColumns, meshRows, meshColumns)) // TODO wire only to *_buf here, instead of io.*.bits - val a_shifter_in = WireInit(Mux(a_is_from_transposer, transposer_out, a_buf)) - val b_shifter_in = WireInit(Mux(b_is_from_transposer, transposer_out, b_buf)) + val a_shifter_in = WireInit(Mux(a_is_from_transposer, transposer_out.asTypeOf(A_TYPE), a_buf)) + val b_shifter_in = WireInit(Mux(b_is_from_transposer, transposer_out.asTypeOf(B_TYPE), b_buf)) val d_shifter_in = WireInit(Mux(d_is_from_transposer, - VecInit(transposer_out.flatten.reverse.grouped(tileRows).map(VecInit(_)).toSeq), d_buf)) + VecInit(transposer_out.flatten.reverse.grouped(tileRows).map(VecInit(_)).toSeq).asTypeOf(D_TYPE), d_buf)) mesh.io.in_a := shifted(a_shifter_in, leftBanks) mesh.io.in_b := shifted(b_shifter_in, upBanks) mesh.io.in_d := shifted(d_shifter_in, upBanks) mesh.io.in_control.zipWithIndex.foreach { case (ss, i) => - ss.foreach(_.dataflow := ShiftRegister(req.bits.pe_control.dataflow, i * (pe_latency + 1))) - ss.foreach(_.propagate := ShiftRegister(in_prop, i * (pe_latency + 1))) + ss.foreach(_.dataflow := ShiftRegister(req.bits.pe_control.dataflow, i * (tile_latency + 1))) + ss.foreach(_.propagate := ShiftRegister(in_prop, i * (tile_latency + 1))) } val result_shift = RegNext(req.bits.pe_control.shift) // TODO will this arrive at the right time if memory isn't pipelined? mesh.io.in_control.zipWithIndex.foreach { case (ctrl, i) => - ctrl.foreach(_.shift := ShiftRegister(result_shift, i * (pe_latency + 1))) + ctrl.foreach(_.shift := ShiftRegister(result_shift, i * (tile_latency + 1))) } val not_paused_vec = VecInit(Seq.fill(meshColumns)(VecInit(Seq.fill(tileColumns)(!pause)))) @@ -198,8 +198,7 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data] // We want to output C when we're output-stationary, but B when we're weight-stationary // TODO these would actually overlap when we switch from output-stationary to weight-stationary - val out_pe_control = shifted(mesh.io.out_control, outBanks, reverse = true)(0)(0) - io.resp.bits.data := shifted(Mux(out_pe_control.dataflow === Dataflow.OS.id.U, mesh.io.out_c, mesh.io.out_b), outBanks, true) + io.resp.bits.data := shifted(Mux(mesh.io.out_control(0)(0).dataflow === Dataflow.OS.id.U, mesh.io.out_c, mesh.io.out_b), outBanks, true) io.resp.valid := shifted(mesh.io.out_valid, outBanks, reverse = true)(0)(0) diff --git a/src/main/scala/gemmini/PE.scala b/src/main/scala/gemmini/PE.scala index 79944b72..e10318a3 100644 --- a/src/main/scala/gemmini/PE.scala +++ b/src/main/scala/gemmini/PE.scala @@ -17,7 +17,7 @@ class PEControl[T <: Data : Arithmetic](accType: T) extends Bundle { * A PE implementing a MAC operation. Configured as fully combinational when integrated into a Mesh. * @param width Data width of operands */ -class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, latency: Int, max_simultaneous_matmuls: Int) +class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, max_simultaneous_matmuls: Int) (implicit ev: Arithmetic[T]) extends Module { // Debugging variables import ev._ @@ -46,17 +46,17 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, val cType = if (df == Dataflow.WS) inputType else accType - val a = ShiftRegister(io.in_a, latency) - val b = ShiftRegister(io.in_b, latency) - val d = ShiftRegister(io.in_d, latency) + val a = io.in_a + val b = io.in_b + val d = io.in_d val c1 = Reg(cType) val c2 = Reg(cType) - val dataflow = ShiftRegister(io.in_control.dataflow, latency) - val prop = ShiftRegister(io.in_control.propagate, latency) - val shift = ShiftRegister(io.in_control.shift, latency) - val id = ShiftRegister(io.in_id, latency) - val last = ShiftRegister(io.in_last, latency) - val valid = ShiftRegister(io.in_valid, latency) // TODO should we clockgate the rest of the ShiftRegisters based on the values in this ShiftRegisters + val dataflow = io.in_control.dataflow + val prop = io.in_control.propagate + val shift = io.in_control.shift + val id = io.in_id + val last = io.in_last + val valid = io.in_valid io.out_a := a io.out_control.dataflow := dataflow diff --git a/src/main/scala/gemmini/Tile.scala b/src/main/scala/gemmini/Tile.scala index 59807893..9c2a418c 100644 --- a/src/main/scala/gemmini/Tile.scala +++ b/src/main/scala/gemmini/Tile.scala @@ -4,6 +4,7 @@ package gemmini import chisel3._ import chisel3.util._ +import Util._ /** * A Tile is a purely combinational 2D array of passThrough PEs. @@ -12,7 +13,7 @@ import chisel3.util._ * @param rows Number of PEs on each row * @param columns Number of PEs on each column */ -class Tile[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T, df: Dataflow.Value, pe_latency: Int, max_simultaneous_matmuls: Int, val rows: Int, val columns: Int) extends Module { +class Tile[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, tree_reduction: Boolean, max_simultaneous_matmuls: Int, val rows: Int, val columns: Int)(implicit ev: Arithmetic[T]) extends Module { val io = IO(new Bundle { val in_a = Input(Vec(rows, inputType)) val in_b = Input(Vec(columns, outputType)) // This is the output of the tile next to it @@ -32,11 +33,13 @@ class Tile[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T, df: val in_valid = Input(Vec(columns, Bool())) val out_valid = Output(Vec(columns, Bool())) - + val bad_dataflow = Output(Bool()) }) - val tile = Seq.fill(rows, columns)(Module(new PE(inputType, outputType, accType, df, pe_latency, max_simultaneous_matmuls))) + import ev._ + + val tile = Seq.fill(rows, columns)(Module(new PE(inputType, outputType, accType, df, max_simultaneous_matmuls))) val tileT = tile.transpose // TODO: abstract hori/vert broadcast, all these connections look the same @@ -53,7 +56,7 @@ class Tile[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T, df: for (c <- 0 until columns) { tileT(c).foldLeft(io.in_b(c)) { case (in_b, pe) => - pe.io.in_b := in_b + pe.io.in_b := (if (tree_reduction) in_b.zero else in_b) pe.io.out_b } } @@ -106,11 +109,19 @@ class Tile[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T, df: // Drive the Tile's bottom IO for (c <- 0 until columns) { io.out_c(c) := tile(rows-1)(c).io.out_c - io.out_b(c) := tile(rows-1)(c).io.out_b io.out_control(c) := tile(rows-1)(c).io.out_control io.out_id(c) := tile(rows-1)(c).io.out_id io.out_last(c) := tile(rows-1)(c).io.out_last io.out_valid(c) := tile(rows-1)(c).io.out_valid + + io.out_b(c) := { + if (tree_reduction) { + val prods = tileT(c).map(_.io.out_b) + accumulateTree(prods :+ io.in_b(c)) + } else { + tile(rows - 1)(c).io.out_b + } + } } io.bad_dataflow := tile.map(_.map(_.io.bad_dataflow).reduce(_||_)).reduce(_||_) diff --git a/src/main/scala/gemmini/Util.scala b/src/main/scala/gemmini/Util.scala index 511cfee2..907c4ad2 100644 --- a/src/main/scala/gemmini/Util.scala +++ b/src/main/scala/gemmini/Util.scala @@ -109,6 +109,22 @@ object Util { Mux(u1 < u2, u1, u2) } + def accumulateTree[T <: Data](xs: Seq[T])(implicit ev: Arithmetic[T]): T = { + import ev._ + + assert(xs.nonEmpty, "can't accumulate 0 elements") + + if (xs.length == 1) { + xs.head + } else { + val upperRowLen = 1 << log2Ceil(xs.length) + val upperRow = xs.padTo(upperRowLen, xs.head.zero) + val pairs = upperRow.grouped(2) + val lowerRow = pairs.map { case Seq(a, b) => a + b } + accumulateTree(lowerRow.toSeq) + } + } + // An undirectioned Valid bundle class UDValid[T <: Data](t: T) extends Bundle { val valid = Bool() From 6f45fcc027442087ffbe8aaacd151e171b60b0c0 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Sat, 4 Dec 2021 22:04:52 -0800 Subject: [PATCH 08/11] Optimize conv layers with few input channels (#92) We add a new experimental "pixel_repeats" feature to optimize conv layers with few input channels (like the first layer of most CNNs). --- SPIKE.hash | 2 +- software/gemmini-rocc-tests | 2 +- src/main/scala/gemmini/Controller.scala | 4 +- src/main/scala/gemmini/DMA.scala | 7 + src/main/scala/gemmini/GemminiConfigs.scala | 10 +- src/main/scala/gemmini/GemminiISA.scala | 15 +- src/main/scala/gemmini/LoadController.scala | 15 +- src/main/scala/gemmini/LocalAddr.scala | 15 ++ src/main/scala/gemmini/LoopConv.scala | 31 ++++- src/main/scala/gemmini/PixelRepeater.scala | 95 +++++++++++++ .../scala/gemmini/ReservationStation.scala | 17 ++- src/main/scala/gemmini/Scratchpad.scala | 130 +++++++++++++----- .../gemmini/VectorScalarMultiplier.scala | 10 +- src/main/scala/gemmini/XactTracker.scala | 2 + 14 files changed, 294 insertions(+), 61 deletions(-) create mode 100644 src/main/scala/gemmini/PixelRepeater.scala diff --git a/SPIKE.hash b/SPIKE.hash index ce15e697..a96811da 100644 --- a/SPIKE.hash +++ b/SPIKE.hash @@ -1 +1 @@ -34741e07bc6b56f1762ce579537948d58e28cd5a +02e2d983cc8e2c385ebe920302c427b9167bd76e diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index 3aaa2307..5fa954ee 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit 3aaa230733a9eba6edf4d14243d84595e017522f +Subproject commit 5fa954ee9cf97483cd9c765d9f4c664d1701090d diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala index 3e74af93..74f23b4c 100644 --- a/src/main/scala/gemmini/Controller.scala +++ b/src/main/scala/gemmini/Controller.scala @@ -137,12 +137,12 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] val (conv_cmd, loop_conv_unroller_busy) = withClock (gated_clock) { LoopConv(raw_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization, meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries, inputType.getWidth, accType.getWidth, dma_maxbytes, - new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits), new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t), + new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits, pixel_repeats_bits), new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ConfigMvoutRs2(acc_scale_t_bits, 32), new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t), new ConfigExRs1(acc_scale_t_bits), new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), - has_training_convs, has_max_pool) } + has_training_convs, has_max_pool, has_first_layer_optimizations) } val (loop_cmd, loop_matmul_unroller_busy) = withClock (gated_clock) { LoopMatmul(conv_cmd, reservation_station.io.ld_utilization, reservation_station.io.st_utilization, reservation_station.io.ex_utilization, meshRows*tileRows, coreMaxAddrBits, rob_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries, diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala index c1cb51ef..5952be5b 100644 --- a/src/main/scala/gemmini/DMA.scala +++ b/src/main/scala/gemmini/DMA.scala @@ -27,6 +27,7 @@ class StreamReadRequest[U <: Data](spad_rows: Int, acc_rows: Int, mvin_scale_t_b val status = new MStatus val len = UInt(16.W) // TODO magic number val repeats = UInt(16.W) // TODO magic number + val pixel_repeats = UInt(8.W) // TODO magic number val block_stride = UInt(16.W) // TODO magic number val cmd_id = UInt(8.W) // TODO magic number @@ -43,6 +44,8 @@ class StreamReadResponse[U <: Data](spadWidth: Int, accWidth: Int, spad_rows: In val has_acc_bitwidth = Bool() val scale = UInt(mvin_scale_t_bits.W) val repeats = UInt(16.W) // TODO magic number + val pixel_repeats = UInt(16.W) // TODO magic number + val len = UInt(16.W) // TODO magic number val last = Bool() val bytes_read = UInt(8.W) // TODO magic number val cmd_id = UInt(8.W) // TODO magic number @@ -100,6 +103,8 @@ class StreamReader[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T io.resp.bits.has_acc_bitwidth := beatPacker.io.out.bits.has_acc_bitwidth io.resp.bits.scale := RegEnable(xactTracker.io.peek.entry.scale, beatPacker.io.req.fire()) io.resp.bits.repeats := RegEnable(xactTracker.io.peek.entry.repeats, beatPacker.io.req.fire()) + io.resp.bits.pixel_repeats := RegEnable(xactTracker.io.peek.entry.pixel_repeats, beatPacker.io.req.fire()) + io.resp.bits.len := RegEnable(xactTracker.io.peek.entry.len, beatPacker.io.req.fire()) io.resp.bits.cmd_id := RegEnable(xactTracker.io.peek.entry.cmd_id, beatPacker.io.req.fire()) io.resp.bits.bytes_read := RegEnable(xactTracker.io.peek.entry.bytes_to_read, beatPacker.io.req.fire()) io.resp.bits.last := beatPacker.io.out.bits.last @@ -250,6 +255,8 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf io.reserve.entry.has_acc_bitwidth := req.has_acc_bitwidth io.reserve.entry.scale := req.scale io.reserve.entry.repeats := req.repeats + io.reserve.entry.pixel_repeats := req.pixel_repeats + io.reserve.entry.len := req.len io.reserve.entry.block_stride := req.block_stride io.reserve.entry.lg_len_req := DontCare // TODO just remove this from the IO completely io.reserve.entry.bytes_to_read := read_bytes_read diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala index 45b6a778..be1b084f 100644 --- a/src/main/scala/gemmini/GemminiConfigs.scala +++ b/src/main/scala/gemmini/GemminiConfigs.scala @@ -85,6 +85,8 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( has_max_pool: Boolean = true, has_nonlinear_activations: Boolean = true, + has_first_layer_optimizations: Boolean = true, + use_firesim_simulation_counters: Boolean = false, use_shared_ext_mem: Boolean = false, @@ -159,7 +161,9 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( val mvout_rows_bits = log2Up(meshRows * tileRows + 1) val load_states = 3 - val block_stride_bits = 16 + val block_stride_bits = 16 min (log2Up(acc_banks * acc_bank_entries) max log2Up(sp_banks * sp_bank_entries)) + + val pixel_repeats_bits = 8 min log2Up(meshColumns * tileColumns + 1) val hasIm2Col = false @@ -471,6 +475,10 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( header ++= s"#define ACC_READ_FULL_WIDTH\n" header ++= s"\n" + if (has_first_layer_optimizations) { + header ++= "#define HAS_FIRST_LAYER_OPTIMIZATIONS\n\n" + } + header ++= s"#endif // $guard\n" header.toString() } diff --git a/src/main/scala/gemmini/GemminiISA.scala b/src/main/scala/gemmini/GemminiISA.scala index c85b6816..0b28316d 100644 --- a/src/main/scala/gemmini/GemminiISA.scala +++ b/src/main/scala/gemmini/GemminiISA.scala @@ -24,7 +24,7 @@ object GemminiISA { val LOAD3_CMD = 14.U // TODO add orows and ocols to this as well - val LOOP_CONV_WS = 15.U // no_bias, wrot180, trans_output_1203, trans_weight_1203, trans_input_3120 | no_pool, downsample, input_dilated, act + val LOOP_CONV_WS = 15.U // no_bias, wrot180, trans_output_1203, trans_weight_1203, trans_input_3120, max_pixels_per_row | no_pool, downsample, input_dilated, act val LOOP_CONV_WS_CONFIG_1 = 16.U // batch_size, in_dim, in_channels, out_channels | out_dim, pool_out_dim, stride, padding val LOOP_CONV_WS_CONFIG_2 = 17.U // kernel_dim, pool_size, pool_stride, pool_padding | batches, porows, pocols, pochs val LOOP_CONV_WS_CONFIG_3 = 18.U // krows, kcols, kchs, lpad | rpad, upad, dpad, plpad @@ -95,22 +95,25 @@ object GemminiISA { val CONFIG_MVIN_RS1_UNUSED_WIDTH = 2 val CONFIG_MVIN_RS1_SHRINK_WIDTH = 1 val CONFIG_MVIN_RS1_STATE_ID_WIDTH = 2 - val CONFIG_MVIN_RS1_SPACER_WIDTH = (16 - 2 - 1 - 2) + val CONFIG_MVIN_RS1_SPACER_WIDTH = 8 - 2 - 1 - 2 + val CONFIG_MVIN_RS1_PIXEL_REPEAT_WIDTH = 8 val CONFIG_MVIN_RS1_STRIDE_WIDTH = 16 val CONFIG_MVIN_RS1_SCALE_WIDTH = 32 - class ConfigMvinRs1(scale_bits: Int, stride_bits: Int) extends Bundle { - val _spacer2 = UInt((CONFIG_MVIN_RS1_SCALE_WIDTH - scale_bits).W) + class ConfigMvinRs1(scale_bits: Int, stride_bits: Int, pixel_repeat_bits: Int) extends Bundle { + val _spacer3 = UInt((CONFIG_MVIN_RS1_SCALE_WIDTH - scale_bits).W) val scale = UInt(scale_bits.W) - val _spacer1 = UInt((CONFIG_MVIN_RS1_STRIDE_WIDTH - stride_bits).W) + val _spacer2 = UInt((CONFIG_MVIN_RS1_STRIDE_WIDTH - stride_bits).W) val stride = UInt(stride_bits.W) + val _spacer1 = UInt((CONFIG_MVIN_RS1_PIXEL_REPEAT_WIDTH - pixel_repeat_bits).W) + val pixel_repeats = UInt(pixel_repeat_bits.W) val _spacer0 = UInt(CONFIG_MVIN_RS1_SPACER_WIDTH.W) val state_id = UInt(CONFIG_MVIN_RS1_STATE_ID_WIDTH.W) val shrink = UInt(CONFIG_MVIN_RS1_SHRINK_WIDTH.W) val _unused = UInt(CONFIG_MVIN_RS1_UNUSED_WIDTH.W) override def cloneType: ConfigMvinRs1.this.type = - (new ConfigMvinRs1(scale_bits, stride_bits)).asInstanceOf[this.type] + (new ConfigMvinRs1(scale_bits, stride_bits, pixel_repeat_bits)).asInstanceOf[this.type] } val CONFIG_MVOUT_RS1_UNUSED_WIDTH = 2 diff --git a/src/main/scala/gemmini/LoadController.scala b/src/main/scala/gemmini/LoadController.scala index 89f7be7c..49d7b409 100644 --- a/src/main/scala/gemmini/LoadController.scala +++ b/src/main/scala/gemmini/LoadController.scala @@ -34,6 +34,7 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig val scales = Reg(Vec(load_states, UInt(mvin_scale_t_bits.W))) val shrinks = Reg(Vec(load_states, Bool())) // Shrink inputs to accumulator val block_strides = Reg(Vec(load_states, UInt(block_stride_bits.W))) // Spad stride during block move-ins + val pixel_repeats = Reg(Vec(load_states, UInt(pixel_repeats_bits.W))) val block_rows = meshRows * tileRows val block_cols = meshColumns * tileColumns val row_counter = RegInit(0.U(log2Ceil(block_rows).W)) @@ -47,11 +48,13 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig val rows = mvin_rs2.num_rows val config_stride = cmd.bits.cmd.rs2 - val config_mvin_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits)) - val config_scale = config_mvin_rs1.scale // maybe limit width to `mvin_scale_t_bits`? + val config_mvin_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits, pixel_repeats_bits)) + + val config_scale = config_mvin_rs1.scale val config_shrink = config_mvin_rs1.shrink val config_block_stride = config_mvin_rs1.stride + val config_pixel_repeats = config_mvin_rs1.pixel_repeats val mstatus = cmd.bits.cmd.status @@ -64,6 +67,7 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig val scale = scales(state_id) val shrink = shrinks(state_id) val block_stride = block_strides(state_id) + val pixel_repeat = pixel_repeats(state_id) val all_zeros = vaddr === 0.U @@ -104,6 +108,7 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig io.dma.req.bits.has_acc_bitwidth := localaddr_plus_row_counter.is_acc_addr && !shrink io.dma.req.bits.all_zeros := all_zeros io.dma.req.bits.status := mstatus + io.dma.req.bits.pixel_repeats := pixel_repeat // Command tracker IO cmd_tracker.io.alloc.valid := control_state === waiting_for_command && cmd.valid && DoLoad @@ -140,6 +145,7 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig scale := config_scale shrink := config_shrink block_stride := config_block_stride + pixel_repeat := Mux(config_pixel_repeats === 0.U, 1.U, config_pixel_repeats) // TODO this default value was just added to maintain backwards compatibility. we should deprecate and remove it later cmd.ready := true.B } @@ -165,6 +171,10 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig } } + // Optimizations based on config parameters + if (!has_first_layer_optimizations) + pixel_repeats.foreach(_ := 1.U) + // Performance counter CounterEventIO.init(io.counter) io.counter.connectEventSignal(CounterEvent.LOAD_ACTIVE_CYCLE, control_state === sending_rows) @@ -177,4 +187,5 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig // Assertions assert(!(cmd_tracker.io.alloc.fire() && cmd_tracker.io.alloc.bits.bytes_to_read === 0.U), "A single mvin instruction must load more than 0 bytes") + assert(has_first_layer_optimizations.B || !(cmd.valid && DoConfig && config_pixel_repeats > 1.U), "If first-layer optimizations are not enabled, then pixel-repeats cannot be greater than 1") } diff --git a/src/main/scala/gemmini/LocalAddr.scala b/src/main/scala/gemmini/LocalAddr.scala index cce6bcae..ac5a1f4a 100644 --- a/src/main/scala/gemmini/LocalAddr.scala +++ b/src/main/scala/gemmini/LocalAddr.scala @@ -16,6 +16,8 @@ class LocalAddr(sp_banks: Int, sp_bank_entries: Int, acc_banks: Int, acc_bank_en private val accBankBits = log2Up(acc_banks) val accBankRowBits = log2Up(acc_bank_entries) + val spRows = sp_banks * sp_bank_entries + val is_acc_addr = Bool() val accumulate = Bool() val read_full_acc_row = Bool() @@ -71,6 +73,19 @@ class LocalAddr(sp_banks: Int, sp_bank_entries: Int, acc_banks: Int, acc_bank_en (result, overflow) } + // This function can only be used with non-accumulator addresses. Returns both new address and underflow + def floorSub(other: UInt, floor: UInt): (LocalAddr, Bool) = { + require(isPow2(sp_bank_entries)) // TODO remove this requirement + require(isPow2(acc_bank_entries)) // TODO remove this requirement + + val underflow = data < (floor +& other) + + val result = WireInit(this) + result.data := Mux(underflow, floor, data - other) + + (result, underflow) + } + def make_this_garbage(dummy: Int = 0): Unit = { is_acc_addr := true.B accumulate := true.B diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala index 1f27f3ff..d2775a9c 100644 --- a/src/main/scala/gemmini/LoopConv.scala +++ b/src/main/scala/gemmini/LoopConv.scala @@ -138,6 +138,7 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi config_cmd_rs1 := DontCare config_cmd_rs1.scale := MVIN_SCALE_IDENTITY config_cmd_rs1.stride := req.derived_params.bias_spad_stride + config_cmd_rs1.pixel_repeats := 1.U config_cmd_rs1.state_id := 2.U config_cmd_rs1.shrink := 0.U config_cmd_rs1._unused := 1.U @@ -217,6 +218,7 @@ class LoopConvLdInputReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth: val addr_start = UInt(log2Up(max_acc_addr).W) val dram_addr = UInt(coreMaxAddrBits.W) val downsample = Bool() + val max_pixels_per_row = UInt(small_iterator_bitwidth.W) val input_dilated = Bool() val trans_input_3120 = Bool() val loop_id = UInt(log2Up(concurrent_loops).W) @@ -310,10 +312,12 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw config_cmd_rs1 := DontCare config_cmd_rs1.scale := MVIN_SCALE_IDENTITY config_cmd_rs1.stride := input_spad_stride + config_cmd_rs1.pixel_repeats := req.max_pixels_per_row config_cmd_rs1.state_id := 0.U config_cmd_rs1.shrink := 0.U config_cmd_rs1._unused := 1.U config_cmd.rs1 := config_cmd_rs1.asUInt() + config_cmd.rs2 := dram_stride << req.downsample val mvin_cmd = Wire(new RoCCCommand) @@ -476,14 +480,17 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit val config_cmd = Wire(new RoCCCommand) config_cmd := DontCare config_cmd.inst.funct := CONFIG_CMD + val config_cmd_rs1 = Wire(config_mvin_rs1_t.cloneType) config_cmd_rs1 := DontCare config_cmd_rs1.scale := MVIN_SCALE_IDENTITY config_cmd_rs1.stride := req.derived_params.weight_spad_stride + config_cmd_rs1.pixel_repeats := 1.U config_cmd_rs1.state_id := 1.U config_cmd_rs1.shrink := 0.U config_cmd_rs1._unused := 1.U config_cmd.rs1 := config_cmd_rs1.asUInt + config_cmd.rs2 := dram_stride val mvin_cmd = Wire(new RoCCCommand) @@ -561,6 +568,7 @@ class LoopConvExecuteReq(val large_iterator_bitwidth: Int, val small_iterator_bi val c_addr_start = UInt(log2Up(max_acc_addr).W) val wrot180 = Bool() val downsample = Bool() + val max_pixels_per_row = UInt(small_iterator_bitwidth.W) val input_dilated = Bool() val trans_weight_0132 = Bool() val trans_input_3120 = Bool() @@ -623,6 +631,8 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera val skip_iteration = state >= pre && req.input_dilated && (((krow * kernel_dilation +& orow -& upad)(0) & req.input_dilated).asBool() || ((kcol * kernel_dilation +& ocol -& lpad)(0) & req.input_dilated).asBool()) + val pixels = Mux(kcols - kcol > req.max_pixels_per_row, req.max_pixels_per_row, kcols - kcol) + val irow = undilated(orow * stride +& krow * kernel_dilation) val icol = undilated(ocol * stride +& kcol * kernel_dilation) @@ -630,7 +640,7 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera Mux(batches - b > block_size.U, block_size.U, batches - b), undilated(Mux(ocols - ocol > (block_size.U << req.input_dilated).asUInt(), (block_size.U << req.input_dilated).asUInt(), ocols - ocol))) val J = Mux(ochs - och > block_size.U, block_size.U, ochs - och) - val K = Mux(kchs - kch > block_size.U, block_size.U, kchs - kch) + val K = pixels * Mux(kchs - kch > block_size.U, block_size.U, kchs - kch) // Addresses val a_addr = Mux(req.trans_input_3120, @@ -768,7 +778,7 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera val next_b = floorAdd(b, b_it, batches, next_orow === 0.U && next_ocol === 0.U) val next_kch = floorAdd(kch, block_size.U, kchs, next_b === 0.U && next_orow === 0.U && next_ocol === 0.U) - val next_kcol = floorAdd(kcol, 1.U, kcols, + val next_kcol = floorAdd(kcol, req.max_pixels_per_row, kcols, next_kch === 0.U && next_b === 0.U && next_orow === 0.U && next_ocol === 0.U) val next_krow = floorAdd(krow, 1.U, krows, next_kcol === 0.U && next_kch === 0.U && next_b === 0.U && next_orow === 0.U && next_ocol === 0.U) @@ -1049,6 +1059,8 @@ class LoopConvState(val block_size: Int, val large_iterator_bitwidth: Int, val s val trans_weight_0132 = Bool() val trans_input_3120 = Bool() + val max_pixels_per_row = UInt(small_iterator_bitwidth.W) + val configured = Bool() val running = Bool() @@ -1137,7 +1149,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2, mvout_rs2_t: MvoutRs2, config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs, compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, - has_training_convs: Boolean, has_max_pool: Boolean) + has_training_convs: Boolean, has_max_pool: Boolean, has_first_layer_optimizations: Boolean) (implicit p: Parameters) extends Module { val large_iterator_bitwidth = 16 val small_iterator_bitwidth = 16 // 8 @@ -1289,6 +1301,12 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I is (LOOP_CONV_WS) { loop_being_configured.no_bias := cmd.bits.rs1(0) + // TODO we added a default value for max_pixels_per_row just to maintain backwards compatibility. we should deprecate and remove it later + val config_max_pixels_per_row = cmd.bits.rs1(15, 8) + loop_being_configured.max_pixels_per_row := Mux( + !has_first_layer_optimizations.B || config_max_pixels_per_row === 0.U, + 1.U, config_max_pixels_per_row) + loop_being_configured.wrot180 := has_training_convs.B && cmd.bits.rs1(1) loop_being_configured.input_dilated := has_training_convs.B && cmd.bits.rs2(2) loop_being_configured.trans_output_1203 := has_training_convs.B && cmd.bits.rs1(2) @@ -1344,6 +1362,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I ld_input.io.req.bits.addr_start := loop_requesting_ld_input.a_addr_start ld_input.io.req.bits.dram_addr := loop_requesting_ld_input.input_dram_addr ld_input.io.req.bits.downsample := loop_requesting_ld_input.downsample + ld_input.io.req.bits.max_pixels_per_row := loop_requesting_ld_input.max_pixels_per_row ld_input.io.req.bits.input_dilated := loop_requesting_ld_input.input_dilated ld_input.io.req.bits.trans_input_3120 := loop_requesting_ld_input.trans_input_3120 ld_input.io.req.bits.loop_id := loop_requesting_ld_input_id @@ -1383,6 +1402,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I ex.io.req.bits.c_addr_start := ex_c_addr_start ex.io.req.bits.wrot180 := loop_requesting_ex.wrot180 ex.io.req.bits.downsample := loop_requesting_ex.downsample + ex.io.req.bits.max_pixels_per_row := loop_requesting_ex.max_pixels_per_row ex.io.req.bits.input_dilated := loop_requesting_ex.input_dilated ex.io.req.bits.trans_weight_0132 := loop_requesting_ex.trans_weight_0132 ex.io.req.bits.trans_input_3120 := loop_requesting_ex.trans_input_3120 @@ -1465,13 +1485,14 @@ object LoopConv { max_addr: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, dma_max_bytes: Int, config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2, mvout_rs2_t: MvoutRs2, config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs, - compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, has_training_convs: Boolean, has_max_pool: Boolean) + compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, has_training_convs: Boolean, has_max_pool: Boolean, + has_first_layer_optimizations: Boolean) (implicit p: Parameters): Tuple2[DecoupledIO[RoCCCommand], Bool] = { val mod = Module(new LoopConv(block_size, coreMaxAddrBits, rob_size, max_lds, max_exs, max_sts, max_addr, max_acc_addr, input_w, acc_w, dma_max_bytes, config_mvin_rs1_t, mvin_rs2_t, config_mvout_rs2_t, mvout_rs2_t, config_ex_rs1_t, preload_rs1_t, preload_rs2_t, - compute_rs1_t, compute_rs2_t, has_training_convs, has_max_pool)) + compute_rs1_t, compute_rs2_t, has_training_convs, has_max_pool, has_first_layer_optimizations)) mod.io.in <> in mod.io.ld_utilization := ld_utilization diff --git a/src/main/scala/gemmini/PixelRepeater.scala b/src/main/scala/gemmini/PixelRepeater.scala new file mode 100644 index 00000000..0413304e --- /dev/null +++ b/src/main/scala/gemmini/PixelRepeater.scala @@ -0,0 +1,95 @@ +package gemmini + +import chisel3._ +import chisel3.util._ + +import Util._ + +class PixelRepeaterReq[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_cols: Int, tag_t: Tag) extends Bundle { + val in: Vec[T] = Vec(block_cols, t.cloneType) + val mask: Vec[Bool] = Vec(block_cols, Bool()) + val laddr: LocalAddr = laddr_t.cloneType + val len: UInt = UInt(log2Up(block_cols+1).W) // TODO magic number + val pixel_repeats: UInt = UInt(8.W) // TODO magic number + val last: Bool = Bool() + val tag: Tag = tag_t.cloneType + + assert(block_cols <= 255, "len must be longer") + + override def cloneType: PixelRepeaterReq.this.type = new PixelRepeaterReq(t, laddr_t, block_cols, tag_t).asInstanceOf[this.type] +} + +class PixelRepeaterResp[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_cols: Int, tag_t: Tag) extends Bundle { + val out: Vec[T] = Vec(block_cols, t.cloneType) + val mask: Vec[Bool] = Vec(block_cols, Bool()) + val laddr: LocalAddr = laddr_t.cloneType + val last: Bool = Bool() + val tag: Tag = tag_t.cloneType + + override def cloneType: PixelRepeaterResp.this.type = new PixelRepeaterResp(t, laddr_t, block_cols, tag_t).asInstanceOf[this.type] +} + +class PixelRepeater[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_cols: Int, aligned_to: Int, tag_t: Tag, passthrough: Boolean) extends Module { + val io = IO(new Bundle { + val req = Flipped(Decoupled(new PixelRepeaterReq(t, laddr_t, block_cols, tag_t))) + val resp = Decoupled(new PixelRepeaterResp(t, laddr_t, block_cols, tag_t)) + }) + + if (passthrough) { + io.resp.valid := io.req.valid + io.resp.bits.out := io.req.bits.in + io.resp.bits.mask := io.req.bits.mask + io.resp.bits.laddr := io.req.bits.laddr + io.resp.bits.last := io.req.bits.last + io.resp.bits.tag := io.req.bits.tag + + io.req.ready := io.resp.ready + } else { + val req = Reg(UDValid(io.req.bits.cloneType)) + + io.req.ready := !req.valid || (io.resp.ready && req.bits.pixel_repeats === 0.U) + + val out_shift = Wire(UInt(log2Up(block_cols / 2 + 1).W)) + out_shift := req.bits.pixel_repeats * req.bits.len + + io.resp.bits.out := (req.bits.in.asUInt() << (out_shift * t.getWidth.U)).asTypeOf(io.resp.bits.out) + io.resp.bits.mask := (req.bits.mask.asUInt() << (out_shift * ((t.getWidth / 8) / aligned_to).U)).asTypeOf(io.resp.bits.mask) + + io.resp.bits.last := req.bits.last && (req.bits.pixel_repeats === 0.U) + io.resp.bits.tag := req.bits.tag + + val is_acc_addr = req.bits.laddr.is_acc_addr + assert(!(req.valid && is_acc_addr && req.bits.pixel_repeats > 0.U)) + + val sp_addr = Mux(req.bits.laddr.full_sp_addr() < (laddr_t.spRows / 2).U, + req.bits.laddr.floorSub(req.bits.pixel_repeats, 0.U)._1, + req.bits.laddr.floorSub(req.bits.pixel_repeats, (laddr_t.spRows / 2).U)._1, + ) + + val underflow = !is_acc_addr && Mux(req.bits.laddr.full_sp_addr() < (laddr_t.spRows / 2).U, + req.bits.laddr.floorSub(req.bits.pixel_repeats, 0.U)._2, + req.bits.laddr.floorSub(req.bits.pixel_repeats, (laddr_t.spRows / 2).U)._2, + ) + + io.resp.bits.laddr := Mux(is_acc_addr, req.bits.laddr, sp_addr) + + io.resp.valid := req.valid && !underflow + + when(io.resp.fire() || underflow) { + req.bits.pixel_repeats := req.bits.pixel_repeats - 1.U + + when(req.bits.pixel_repeats === 0.U) { + req.pop() + } + } + + when(io.req.fire()) { + req.push(io.req.bits) + req.bits.pixel_repeats := io.req.bits.pixel_repeats - 1.U + } + + when(reset.toBool()) { + req.pop() + } + } +} diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala index 929685f6..44b992ae 100644 --- a/src/main/scala/gemmini/ReservationStation.scala +++ b/src/main/scala/gemmini/ReservationStation.scala @@ -115,7 +115,6 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G val solitary_preload = utilization === 1.U && entries.map(e => e.valid && e.bits.cmd.inst.funct === PRELOAD_CMD).reduce(_ || _) io.busy := !empty && !(solitary_preload && io.solitary_preload) - // Config values set by programmer val a_stride = Reg(UInt(16.W)) // TODO magic numbers val c_stride = Reg(UInt(16.W)) // TODO magic numbers @@ -123,6 +122,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G val ld_block_strides = Reg(Vec(load_states, UInt(block_stride_bits.W))) val st_block_stride = block_rows.U val pooling_is_enabled = Reg(Bool()) + val ld_pixel_repeats = Reg(Vec(load_states, UInt(8.W))) // This is the ld_pixel_repeat MINUS ONE // TODO magic numbers val new_entry = Wire(new Entry) new_entry := DontCare @@ -245,6 +245,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G val id = MuxCase(0.U, Seq((new_entry.cmd.inst.funct === LOAD2_CMD) -> 1.U, (new_entry.cmd.inst.funct === LOAD3_CMD) -> 2.U)) val block_stride = ld_block_strides(id) + val pixel_repeats = ld_pixel_repeats(id) val mvin_cols = cmd.rs2(32 + mvin_cols_bits - 1, 32) val mvin_rows = cmd.rs2(48 + mvin_rows_bits - 1, 48) @@ -252,6 +253,18 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G val mvin_mats = mvin_cols / block_cols.U + (mvin_cols % block_cols.U =/= 0.U) val total_mvin_rows = ((mvin_mats - 1.U) * block_stride) + mvin_rows + // TODO We have to know how the LoopConv's internals work here. Our abstractions are leaking + if (has_first_layer_optimizations) { + val start = cmd.rs2(31, 0).asTypeOf(local_addr_t) + // TODO instead of using a floor-sub that's hardcoded to the Scratchpad bank boundaries, we should find some way of letting the programmer specify the start address + dst.bits.start := Mux(start.is_acc_addr, start, + Mux(start.full_sp_addr() > (local_addr_t.spRows / 2).U, + start.floorSub(pixel_repeats, (local_addr_t.spRows / 2).U)._1, + start.floorSub(pixel_repeats, 0.U)._1, + ) + ) + } + dst.bits.end := dst.bits.start + total_mvin_rows dst.bits.wraps_around := dst.bits.start.add_with_overflow(total_mvin_rows)._2 } @@ -365,7 +378,9 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G }.elsewhen(new_entry.is_config && new_entry.q === ldq) { val id = new_entry.cmd.rs1(4,3) // TODO magic numbers val block_stride = new_entry.cmd.rs1(31, 16) // TODO magic numbers + val repeat_pixels = new_entry.cmd.rs1(15, 8) // TODO magic numbers ld_block_strides(id) := block_stride + ld_pixel_repeats(id) := repeat_pixels - 1.U }.elsewhen(new_entry.is_config && new_entry.q === stq) { val pool_stride = new_entry.cmd.rs1(5, 4) // TODO magic numbers pooling_is_enabled := pool_stride =/= 0.U diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala index 0d76758d..764b5d5a 100644 --- a/src/main/scala/gemmini/Scratchpad.scala +++ b/src/main/scala/gemmini/Scratchpad.scala @@ -20,6 +20,7 @@ class ScratchpadMemReadRequest[U <: Data](local_addr_t: LocalAddr, scale_t_bits: val has_acc_bitwidth = Bool() val all_zeros = Bool() val block_stride = UInt(16.W) // TODO magic numbers + val pixel_repeats = UInt(8.W) // TODO magic numbers val cmd_id = UInt(8.W) // TODO don't use a magic number here val status = new MStatus @@ -256,7 +257,6 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, write_issue_q.io.enq.valid := false.B write_issue_q.io.enq.bits := write_scale_q.io.deq.bits - // Garbage can immediately fire between dispatch_q and scale_q when (write_dispatch_q.bits.laddr.is_garbage()) { write_scale_q.io.enq <> write_dispatch_q @@ -266,7 +266,6 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, write_issue_q.io.enq <> write_scale_q.io.deq } - val writeData = Wire(Valid(UInt((spad_w max acc_w).W))) writeData.valid := write_issue_q.io.deq.bits.laddr.is_garbage() writeData.bits := DontCare @@ -312,7 +311,20 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, zero_writer.io.req.bits.block_stride := io.dma.read.req.bits.block_stride zero_writer.io.req.bits.tag := io.dma.read.req.bits - zero_writer.io.resp.ready := false.B + // zero_writer.io.resp.ready := false.B + + val zero_writer_pixel_repeater = Module(new PixelRepeater(inputType, local_addr_t, block_cols, aligned_to, new ScratchpadMemReadRequest(local_addr_t, mvin_scale_t_bits), passthrough = !has_first_layer_optimizations)) + zero_writer_pixel_repeater.io.req.valid := zero_writer.io.resp.valid + zero_writer_pixel_repeater.io.req.bits.in := 0.U.asTypeOf(Vec(block_cols, inputType)) + zero_writer_pixel_repeater.io.req.bits.mask := zero_writer.io.resp.bits.mask + zero_writer_pixel_repeater.io.req.bits.laddr := zero_writer.io.resp.bits.laddr + zero_writer_pixel_repeater.io.req.bits.len := zero_writer.io.resp.bits.tag.cols + zero_writer_pixel_repeater.io.req.bits.pixel_repeats := zero_writer.io.resp.bits.tag.pixel_repeats + zero_writer_pixel_repeater.io.req.bits.last := zero_writer.io.resp.bits.last + zero_writer_pixel_repeater.io.req.bits.tag := zero_writer.io.resp.bits.tag + + zero_writer.io.resp.ready := zero_writer_pixel_repeater.io.req.ready + zero_writer_pixel_repeater.io.resp.ready := false.B reader.module.io.req.valid := read_issue_q.io.deq.valid read_issue_q.io.deq.ready := reader.module.io.req.ready @@ -321,6 +333,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, read_issue_q.io.deq.bits.laddr.full_acc_addr(), read_issue_q.io.deq.bits.laddr.full_sp_addr()) reader.module.io.req.bits.len := read_issue_q.io.deq.bits.cols reader.module.io.req.bits.repeats := read_issue_q.io.deq.bits.repeats + reader.module.io.req.bits.pixel_repeats := read_issue_q.io.deq.bits.pixel_repeats reader.module.io.req.bits.scale := read_issue_q.io.deq.bits.scale reader.module.io.req.bits.is_acc := read_issue_q.io.deq.bits.laddr.is_acc_addr reader.module.io.req.bits.accumulate := read_issue_q.io.deq.bits.laddr.accumulate @@ -348,10 +361,22 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, mvin_scale_in.bits.in := reader.module.io.resp.bits.data.asTypeOf(chiselTypeOf(mvin_scale_in.bits.in)) mvin_scale_in.bits.scale := reader.module.io.resp.bits.scale.asTypeOf(mvin_scale_t) mvin_scale_in.bits.repeats := reader.module.io.resp.bits.repeats + mvin_scale_in.bits.pixel_repeats := reader.module.io.resp.bits.pixel_repeats mvin_scale_in.bits.last := reader.module.io.resp.bits.last mvin_scale_in.bits.tag := reader.module.io.resp.bits - mvin_scale_out.ready := false.B + val mvin_scale_pixel_repeater = Module(new PixelRepeater(inputType, local_addr_t, block_cols, aligned_to, mvin_scale_out.bits.tag.cloneType, passthrough = !has_first_layer_optimizations)) + mvin_scale_pixel_repeater.io.req.valid := mvin_scale_out.valid + mvin_scale_pixel_repeater.io.req.bits.in := mvin_scale_out.bits.out + mvin_scale_pixel_repeater.io.req.bits.mask := mvin_scale_out.bits.tag.mask take mvin_scale_pixel_repeater.io.req.bits.mask.size + mvin_scale_pixel_repeater.io.req.bits.laddr := mvin_scale_out.bits.tag.addr.asTypeOf(local_addr_t) + mvin_scale_out.bits.row + mvin_scale_pixel_repeater.io.req.bits.len := mvin_scale_out.bits.tag.len + mvin_scale_pixel_repeater.io.req.bits.pixel_repeats := mvin_scale_out.bits.tag.pixel_repeats + mvin_scale_pixel_repeater.io.req.bits.last := mvin_scale_out.bits.last + mvin_scale_pixel_repeater.io.req.bits.tag := mvin_scale_out.bits.tag + + mvin_scale_out.ready := mvin_scale_pixel_repeater.io.req.ready + mvin_scale_pixel_repeater.io.resp.ready := false.B if (!mvin_scale_shared) { mvin_scale_acc_in.valid := reader.module.io.resp.valid && @@ -359,6 +384,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, mvin_scale_acc_in.bits.in := reader.module.io.resp.bits.data.asTypeOf(chiselTypeOf(mvin_scale_acc_in.bits.in)) mvin_scale_acc_in.bits.scale := reader.module.io.resp.bits.scale.asTypeOf(mvin_scale_acc_t) mvin_scale_acc_in.bits.repeats := reader.module.io.resp.bits.repeats + mvin_scale_acc_in.bits.pixel_repeats := 1.U mvin_scale_acc_in.bits.last := reader.module.io.resp.bits.last mvin_scale_acc_in.bits.tag := reader.module.io.resp.bits @@ -368,23 +394,33 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, reader.module.io.resp.ready := Mux(reader.module.io.resp.bits.is_acc && reader.module.io.resp.bits.has_acc_bitwidth, mvin_scale_acc_in.ready, mvin_scale_in.ready) - val mvin_scale_finished = mvin_scale_out.fire() && mvin_scale_out.bits.last + // val mvin_scale_finished = mvin_scale_out.fire() && mvin_scale_out.bits.last + val mvin_scale_finished = mvin_scale_pixel_repeater.io.resp.fire() && mvin_scale_pixel_repeater.io.resp.bits.last val mvin_scale_acc_finished = mvin_scale_acc_out.fire() && mvin_scale_acc_out.bits.last - val zero_writer_finished = zero_writer.io.resp.fire() && zero_writer.io.resp.bits.last + // val zero_writer_finished = zero_writer.io.resp.fire() && zero_writer.io.resp.bits.last + val zero_writer_finished = zero_writer_pixel_repeater.io.resp.fire() && zero_writer_pixel_repeater.io.resp.bits.last + /* val zero_writer_bytes_read = Mux(zero_writer.io.resp.bits.laddr.is_acc_addr, zero_writer.io.resp.bits.tag.cols * (accType.getWidth / 8).U, zero_writer.io.resp.bits.tag.cols * (inputType.getWidth / 8).U) + */ + val zero_writer_bytes_read = Mux(zero_writer_pixel_repeater.io.resp.bits.laddr.is_acc_addr, + zero_writer_pixel_repeater.io.resp.bits.tag.cols * (accType.getWidth / 8).U, + zero_writer_pixel_repeater.io.resp.bits.tag.cols * (inputType.getWidth / 8).U) // For DMA read responses, mvin_scale gets first priority, then mvin_scale_acc, and then zero_writer io.dma.read.resp.valid := mvin_scale_finished || mvin_scale_acc_finished || zero_writer_finished - io.dma.read.resp.bits.cmd_id := MuxCase(zero_writer.io.resp.bits.tag.cmd_id, Seq( - mvin_scale_finished -> mvin_scale_out.bits.tag.cmd_id, + // io.dma.read.resp.bits.cmd_id := MuxCase(zero_writer.io.resp.bits.tag.cmd_id, Seq( + io.dma.read.resp.bits.cmd_id := MuxCase(zero_writer_pixel_repeater.io.resp.bits.tag.cmd_id, Seq( + // mvin_scale_finished -> mvin_scale_out.bits.tag.cmd_id, + mvin_scale_finished -> mvin_scale_pixel_repeater.io.resp.bits.tag.cmd_id, mvin_scale_acc_finished -> mvin_scale_acc_out.bits.tag.cmd_id)) io.dma.read.resp.bits.bytesRead := MuxCase(zero_writer_bytes_read, Seq( - mvin_scale_finished -> mvin_scale_out.bits.tag.bytes_read, + // mvin_scale_finished -> mvin_scale_out.bits.tag.bytes_read, + mvin_scale_finished -> mvin_scale_pixel_repeater.io.resp.bits.tag.bytes_read, mvin_scale_acc_finished -> mvin_scale_acc_out.bits.tag.bytes_read)) io.tlb(0) <> writer.module.io.tlb @@ -465,16 +501,21 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, bank_ios.zipWithIndex.foreach { case (bio, i) => val exwrite = io.srams.write(i).en - val laddr = mvin_scale_out.bits.tag.addr.asTypeOf(local_addr_t) + mvin_scale_out.bits.row + // val laddr = mvin_scale_out.bits.tag.addr.asTypeOf(local_addr_t) + mvin_scale_out.bits.row + val laddr = mvin_scale_pixel_repeater.io.resp.bits.laddr - val dmaread = mvin_scale_out.valid && !mvin_scale_out.bits.tag.is_acc && + // val dmaread = mvin_scale_out.valid && !mvin_scale_out.bits.tag.is_acc && + val dmaread = mvin_scale_pixel_repeater.io.resp.valid && !mvin_scale_pixel_repeater.io.resp.bits.tag.is_acc && laddr.sp_bank() === i.U // We need to make sure that we don't try to return a dma read resp from both zero_writer and either mvin_scale // or mvin_acc_scale at the same time. The scalers always get priority in those cases - val zerowrite = zero_writer.io.resp.valid && !zero_writer.io.resp.bits.laddr.is_acc_addr && - zero_writer.io.resp.bits.laddr.sp_bank() === i.U && - !((mvin_scale_out.valid && mvin_scale_out.bits.last) || (mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.last)) + /* val zerowrite = zero_writer.io.resp.valid && !zero_writer.io.resp.bits.laddr.is_acc_addr && + zero_writer.io.resp.bits.laddr.sp_bank() === i.U && */ + val zerowrite = zero_writer_pixel_repeater.io.resp.valid && !zero_writer_pixel_repeater.io.resp.bits.laddr.is_acc_addr && + zero_writer_pixel_repeater.io.resp.bits.laddr.sp_bank() === i.U && + // !((mvin_scale_out.valid && mvin_scale_out.bits.last) || (mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.last)) + !((mvin_scale_pixel_repeater.io.resp.valid && mvin_scale_pixel_repeater.io.resp.bits.last) || (mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.last)) bio.write.en := exwrite || dmaread || zerowrite @@ -484,21 +525,27 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, bio.write.mask := io.srams.write(i).mask }.elsewhen (dmaread) { bio.write.addr := laddr.sp_row() - bio.write.data := mvin_scale_out.bits.out.asUInt() - bio.write.mask := mvin_scale_out.bits.tag.mask take ((spad_w / (aligned_to * 8)) max 1) + // bio.write.data := mvin_scale_out.bits.out.asUInt() + // bio.write.mask := mvin_scale_out.bits.tag.mask take ((spad_w / (aligned_to * 8)) max 1) + bio.write.data := mvin_scale_pixel_repeater.io.resp.bits.out.asUInt() + bio.write.mask := mvin_scale_pixel_repeater.io.resp.bits.mask take ((spad_w / (aligned_to * 8)) max 1) - mvin_scale_out.ready := true.B // TODO we combinationally couple valid and ready signals + // mvin_scale_out.ready := true.B // TODO we combinationally couple valid and ready signals + mvin_scale_pixel_repeater.io.resp.ready := true.B // TODO we combinationally couple valid and ready signals }.elsewhen (zerowrite) { - bio.write.addr := zero_writer.io.resp.bits.laddr.sp_row() + // bio.write.addr := zero_writer.io.resp.bits.laddr.sp_row() + bio.write.addr := zero_writer_pixel_repeater.io.resp.bits.laddr.sp_row() bio.write.data := 0.U bio.write.mask := { val n = inputType.getWidth / 8 - val mask = zero_writer.io.resp.bits.mask + // val mask = zero_writer.io.resp.bits.mask + val mask = zero_writer_pixel_repeater.io.resp.bits.mask val expanded = VecInit(mask.flatMap(e => Seq.fill(n)(e))) expanded } - zero_writer.io.resp.ready := true.B // TODO we combinationally couple valid and ready signals + // zero_writer.io.resp.ready := true.B // TODO we combinationally couple valid and ready signals + zero_writer_pixel_repeater.io.resp.ready := true.B // TODO we combinationally couple valid and ready signals }.otherwise { bio.write.addr := DontCare bio.write.data := DontCare @@ -636,10 +683,12 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, io.acc.write(i).ready := true.B assert(!(exwrite && !bio.write.ready), "Execute controller write to AccumulatorMem was skipped") - val from_mvin_scale = mvin_scale_out.valid && mvin_scale_out.bits.tag.is_acc + // val from_mvin_scale = mvin_scale_out.valid && mvin_scale_out.bits.tag.is_acc + val from_mvin_scale = mvin_scale_pixel_repeater.io.resp.valid && mvin_scale_pixel_repeater.io.resp.bits.tag.is_acc val from_mvin_scale_acc = mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.tag.is_acc - val mvin_scale_laddr = mvin_scale_out.bits.tag.addr.asTypeOf(local_addr_t) + mvin_scale_out.bits.row + // val mvin_scale_laddr = mvin_scale_out.bits.tag.addr.asTypeOf(local_addr_t) + mvin_scale_out.bits.row + val mvin_scale_laddr = mvin_scale_pixel_repeater.io.resp.bits.laddr val mvin_scale_acc_laddr = mvin_scale_acc_out.bits.tag.addr.asTypeOf(local_addr_t) + mvin_scale_acc_out.bits.row val dmaread_bank = Mux(from_mvin_scale, mvin_scale_laddr.acc_bank(), @@ -648,7 +697,8 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, // We need to make sure that we don't try to return a dma read resp from both mvin_scale and mvin_scale_acc // at the same time. mvin_scale always gets priority in this cases - val spad_last = mvin_scale_out.valid && mvin_scale_out.bits.last && !mvin_scale_out.bits.tag.is_acc + // val spad_last = mvin_scale_out.valid && mvin_scale_out.bits.last && !mvin_scale_out.bits.tag.is_acc + val spad_last = mvin_scale_pixel_repeater.io.resp.valid && mvin_scale_pixel_repeater.io.resp.bits.last && !mvin_scale_pixel_repeater.io.resp.bits.tag.is_acc val dmaread = (from_mvin_scale || from_mvin_scale_acc) && dmaread_bank === i.U /* && @@ -656,9 +706,13 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, // We need to make sure that we don't try to return a dma read resp from both zero_writer and either mvin_scale // or mvin_acc_scale at the same time. The scalers always get priority in those cases - val zerowrite = zero_writer.io.resp.valid && zero_writer.io.resp.bits.laddr.is_acc_addr && - zero_writer.io.resp.bits.laddr.acc_bank() === i.U && - !((mvin_scale_out.valid && mvin_scale_out.bits.last) || (mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.last)) + /* val zerowrite = zero_writer.io.resp.valid && zero_writer.io.resp.bits.laddr.is_acc_addr && + zero_writer.io.resp.bits.laddr.acc_bank() === i.U && */ + val zerowrite = zero_writer_pixel_repeater.io.resp.valid && zero_writer_pixel_repeater.io.resp.bits.laddr.is_acc_addr && + zero_writer_pixel_repeater.io.resp.bits.laddr.acc_bank() === i.U && + // !((mvin_scale_out.valid && mvin_scale_out.bits.last) || (mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.last)) + !((mvin_scale_pixel_repeater.io.resp.valid && mvin_scale_pixel_repeater.io.resp.bits.last) || (mvin_scale_acc_out.valid && mvin_scale_acc_out.bits.last)) + val consecutive_write_block = RegInit(false.B) if (acc_singleported) { val consecutive_write_sub_bank = RegInit(0.U((1 max log2Ceil(acc_sub_banks)).W)) @@ -674,12 +728,15 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, } bio.write.valid := false.B - bio.write.bits.acc := MuxCase(zero_writer.io.resp.bits.laddr.accumulate, + // bio.write.bits.acc := MuxCase(zero_writer.io.resp.bits.laddr.accumulate, + bio.write.bits.acc := MuxCase(zero_writer_pixel_repeater.io.resp.bits.laddr.accumulate, Seq(exwrite -> io.acc.write(i).bits.acc, - from_mvin_scale -> mvin_scale_out.bits.tag.accumulate, + // from_mvin_scale -> mvin_scale_out.bits.tag.accumulate, + from_mvin_scale -> mvin_scale_pixel_repeater.io.resp.bits.tag.accumulate, from_mvin_scale_acc -> mvin_scale_acc_out.bits.tag.accumulate)) - bio.write.bits.addr := MuxCase(zero_writer.io.resp.bits.laddr.acc_row(), + // bio.write.bits.addr := MuxCase(zero_writer.io.resp.bits.laddr.acc_row(), + bio.write.bits.addr := MuxCase(zero_writer_pixel_repeater.io.resp.bits.laddr.acc_row(), Seq(exwrite -> io.acc.write(i).bits.addr, (from_mvin_scale || from_mvin_scale_acc) -> dmaread_row)) @@ -690,20 +747,23 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, }.elsewhen (dmaread && !spad_last && !consecutive_write_block) { bio.write.valid := true.B bio.write.bits.data := Mux(from_mvin_scale, - VecInit(mvin_scale_out.bits.out.map(e => e.withWidthOf(accType))).asTypeOf(acc_row_t), + // VecInit(mvin_scale_out.bits.out.map(e => e.withWidthOf(accType))).asTypeOf(acc_row_t), + VecInit(mvin_scale_pixel_repeater.io.resp.bits.out.map(e => e.withWidthOf(accType))).asTypeOf(acc_row_t), mvin_scale_acc_out.bits.out.asTypeOf(acc_row_t)) bio.write.bits.mask := Mux(from_mvin_scale, { val n = accType.getWidth / inputType.getWidth - val mask = mvin_scale_out.bits.tag.mask take ((spad_w / (aligned_to * 8)) max 1) + // val mask = mvin_scale_out.bits.tag.mask take ((spad_w / (aligned_to * 8)) max 1) + val mask = mvin_scale_pixel_repeater.io.resp.bits.mask take ((spad_w / (aligned_to * 8)) max 1) val expanded = VecInit(mask.flatMap(e => Seq.fill(n)(e))) expanded }, mvin_scale_acc_out.bits.tag.mask) when(from_mvin_scale) { - mvin_scale_out.ready := bio.write.ready + // mvin_scale_out.ready := bio.write.ready + mvin_scale_pixel_repeater.io.resp.ready := bio.write.ready }.otherwise { mvin_scale_acc_out.ready := bio.write.ready } @@ -712,12 +772,14 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, bio.write.bits.data := 0.U.asTypeOf(acc_row_t) bio.write.bits.mask := { val n = accType.getWidth / 8 - val mask = zero_writer.io.resp.bits.mask + // val mask = zero_writer.io.resp.bits.mask + val mask = zero_writer_pixel_repeater.io.resp.bits.mask val expanded = VecInit(mask.flatMap(e => Seq.fill(n)(e))) expanded } - zero_writer.io.resp.ready := bio.write.ready + // zero_writer.io.resp.ready := bio.write.ready + zero_writer_pixel_repeater.io.resp.ready := bio.write.ready }.otherwise { bio.write.bits.data := DontCare bio.write.bits.mask := DontCare diff --git a/src/main/scala/gemmini/VectorScalarMultiplier.scala b/src/main/scala/gemmini/VectorScalarMultiplier.scala index 05480e09..7cb8c14f 100644 --- a/src/main/scala/gemmini/VectorScalarMultiplier.scala +++ b/src/main/scala/gemmini/VectorScalarMultiplier.scala @@ -9,6 +9,7 @@ class VectorScalarMultiplierReq[T <: Data, U <: Data, Tag <: Data](block_cols: I val in: Vec[T] = Vec(block_cols, t.cloneType) val scale: U = u.cloneType val repeats: UInt = UInt(16.W) // TODO magic number + val pixel_repeats: UInt = UInt(8.W) // TODO magic number val last: Bool = Bool() val tag: Tag = tag_t.cloneType @@ -81,7 +82,6 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data]( in.valid := false.B } - if (num_scale_units == -1) { val pipe = Module(new Pipeline( new VectorScalarMultiplierResp(block_cols, t, tag_t), @@ -144,8 +144,6 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data]( tail_oh := (tail_oh << 1) | tail_oh(nEntries-1) } - - val inputs = Seq.fill(width*nEntries) { Wire(Decoupled(new DataWithIndex(t, u))) } for (i <- 0 until nEntries) { for (w <- 0 until width) { @@ -172,7 +170,6 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data]( arbOut.valid := false.B } - val pipe = Module(new ScalePipe(t, mvin_scale_args.get)) pipe.io.in := arbOut val pipe_out = pipe.io.out @@ -187,14 +184,11 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data]( } } } + when (reset.asBool) { regs.foreach(_.valid := false.B) } - - } - - } object VectorScalarMultiplier { diff --git a/src/main/scala/gemmini/XactTracker.scala b/src/main/scala/gemmini/XactTracker.scala index e8581a26..84821d4e 100644 --- a/src/main/scala/gemmini/XactTracker.scala +++ b/src/main/scala/gemmini/XactTracker.scala @@ -15,6 +15,8 @@ class XactTrackerEntry[U <: Data](maxShift: Int, spadWidth: Int, accWidth: Int, val has_acc_bitwidth = Bool() val scale = UInt(mvin_scale_t_bits.W) val repeats = UInt(16.W) // TODO magic number + val pixel_repeats = UInt(8.W) // TODO magic number + val len = UInt(16.W) // TODO magic number val block_stride = UInt(16.W) // TODO magic number val spad_row_offset = UInt(log2Up(spadWidth max accWidth).W) val lg_len_req = UInt(log2Up(log2Up(maxReqBytes+1)+1).W) From 022a306db6ef5c8807102aa67ad28fdbcce46f63 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Sat, 4 Dec 2021 23:05:25 -0800 Subject: [PATCH 09/11] * Add default value of pixel-repeats to ReservationStation.scala * Reduce a few bitwidths in ReservationStation.scala --- src/main/scala/gemmini/GemminiConfigs.scala | 3 +++ src/main/scala/gemmini/ReservationStation.scala | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala index be1b084f..567ef060 100644 --- a/src/main/scala/gemmini/GemminiConfigs.scala +++ b/src/main/scala/gemmini/GemminiConfigs.scala @@ -163,6 +163,9 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( val load_states = 3 val block_stride_bits = 16 min (log2Up(acc_banks * acc_bank_entries) max log2Up(sp_banks * sp_bank_entries)) + val a_stride_bits = 16 min (log2Up(acc_banks * acc_bank_entries) max log2Up(sp_banks * sp_bank_entries)) + val c_stride_bits = 16 min (log2Up(acc_banks * acc_bank_entries) max log2Up(sp_banks * sp_bank_entries)) + val pixel_repeats_bits = 8 min log2Up(meshColumns * tileColumns + 1) val hasIm2Col = false diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala index 44b992ae..7135969f 100644 --- a/src/main/scala/gemmini/ReservationStation.scala +++ b/src/main/scala/gemmini/ReservationStation.scala @@ -116,13 +116,13 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G io.busy := !empty && !(solitary_preload && io.solitary_preload) // Config values set by programmer - val a_stride = Reg(UInt(16.W)) // TODO magic numbers - val c_stride = Reg(UInt(16.W)) // TODO magic numbers + val a_stride = Reg(UInt(a_stride_bits.W)) + val c_stride = Reg(UInt(c_stride_bits.W)) val a_transpose = Reg(Bool()) val ld_block_strides = Reg(Vec(load_states, UInt(block_stride_bits.W))) val st_block_stride = block_rows.U val pooling_is_enabled = Reg(Bool()) - val ld_pixel_repeats = Reg(Vec(load_states, UInt(8.W))) // This is the ld_pixel_repeat MINUS ONE // TODO magic numbers + val ld_pixel_repeats = Reg(Vec(load_states, UInt(pixel_repeats_bits.W))) // This is the ld_pixel_repeat MINUS ONE val new_entry = Wire(new Entry) new_entry := DontCare @@ -378,7 +378,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G }.elsewhen(new_entry.is_config && new_entry.q === ldq) { val id = new_entry.cmd.rs1(4,3) // TODO magic numbers val block_stride = new_entry.cmd.rs1(31, 16) // TODO magic numbers - val repeat_pixels = new_entry.cmd.rs1(15, 8) // TODO magic numbers + val repeat_pixels = maxOf(new_entry.cmd.rs1(8 + pixel_repeats_bits - 1, 8), 1.U) // TODO we use a default value of pixel repeats here, for backwards compatibility. However, we should deprecate and remove this default value eventually ld_block_strides(id) := block_stride ld_pixel_repeats(id) := repeat_pixels - 1.U }.elsewhen(new_entry.is_config && new_entry.q === stq) { From 3efa8917a86040228969e7c19e06f814f7ed1587 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Mon, 6 Dec 2021 00:07:07 -0800 Subject: [PATCH 10/11] Fix tlb hit counter (#168) Fixes bug where TLB hits were being counted incorrectly. Prior to this PR, we were using RegNext(io.req.fire()) to match TLB requests to TLB responses. However, we made our interface to the TLB combinational months ago, so the RegNext is no longer necessary (and is actually incorrect). --- src/main/scala/gemmini/FrontendTLB.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/gemmini/FrontendTLB.scala b/src/main/scala/gemmini/FrontendTLB.scala index 269409fc..bc028ee9 100644 --- a/src/main/scala/gemmini/FrontendTLB.scala +++ b/src/main/scala/gemmini/FrontendTLB.scala @@ -66,12 +66,12 @@ class DecoupledTLB(entries: Int, maxSize: Int, use_firesim_simulation_counters: assert(!io.exp.flush_retry || !io.exp.flush_skip, "TLB: flushing with both retry and skip at same time") CounterEventIO.init(io.counter) - io.counter.connectEventSignal(CounterEvent.DMA_TLB_HIT_REQ, RegNext(io.req.fire()) && !tlb.io.resp.miss) + io.counter.connectEventSignal(CounterEvent.DMA_TLB_HIT_REQ, io.req.fire() && !tlb.io.resp.miss) io.counter.connectEventSignal(CounterEvent.DMA_TLB_TOTAL_REQ, io.req.fire()) io.counter.connectEventSignal(CounterEvent.DMA_TLB_MISS_CYCLE, tlb.io.resp.miss) if (use_firesim_simulation_counters) { - PerfCounter(RegNext(io.req.fire()) && !tlb.io.resp.miss, "tlb_hits", "total number of tlb hits") + PerfCounter(io.req.fire() && !tlb.io.resp.miss, "tlb_hits", "total number of tlb hits") PerfCounter(io.req.fire(), "tlb_reqs", "total number of tlb reqs") PerfCounter(tlb.io.resp.miss, "tlb_miss_cycles", "total number of cycles where the tlb is resolving a miss") } From 56e85a2075a96ca2c14f3e4108a3b3a870bdee46 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Mon, 6 Dec 2021 14:02:45 -0800 Subject: [PATCH 11/11] Rename tiled_conv_A_stride to tiled_conv and cleanup unused conv implementations (#169) Rename tiled_conv_A_stride to tiled_conv and cleanup unused conv implementations --- software/gemmini-rocc-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index 5fa954ee..21713ec6 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit 5fa954ee9cf97483cd9c765d9f4c664d1701090d +Subproject commit 21713ec6e9dbbf2477b092e04eb8970776a5da72