From 1e4583ac17cb600b74a6d104395759eed1dbb601 Mon Sep 17 00:00:00 2001 From: MichaelJSr Date: Tue, 26 Nov 2024 18:41:01 -0800 Subject: [PATCH 1/6] Adds the riscv vector extension into simx --- ci/regression.sh.in | 16 +- hw/rtl/VX_config.vh | 4 + hw/rtl/VX_types.vh | 13 + perf/cache/cache_perf.log | 2 +- sim/common/rvfloats.cpp | 34 + sim/common/rvfloats.h | 5 + sim/common/softfloat_ext.cpp | 486 ++ sim/common/softfloat_ext.h | 14 + sim/opaesim/Makefile | 2 +- sim/rtlsim/Makefile | 2 +- sim/simx/Makefile | 4 +- sim/simx/arch.h | 6 + sim/simx/decode.cpp | 184 +- sim/simx/emulator.cpp | 75 + sim/simx/emulator.h | 88 +- sim/simx/execute.cpp | 141 +- sim/simx/execute_vector.cpp | 4493 +++++++++++++++++ sim/simx/instr.h | 89 +- sim/simx/types.h | 4 +- sim/xrtsim/Makefile | 2 +- tests/riscv/riscv-vector-tests/README | 39 + tests/riscv/riscv-vector-tests/run-test.sh.in | 117 + 22 files changed, 5716 insertions(+), 104 deletions(-) create mode 100644 sim/common/softfloat_ext.cpp create mode 100644 sim/common/softfloat_ext.h create mode 100644 sim/simx/execute_vector.cpp create mode 100644 tests/riscv/riscv-vector-tests/README create mode 100755 tests/riscv/riscv-vector-tests/run-test.sh.in diff --git a/ci/regression.sh.in b/ci/regression.sh.in index 849a8769f..53819490f 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -386,10 +386,20 @@ synthesis() echo "synthesis tests done!" } +vector() +{ + echo "begin vector tests..." + + make -C sim/simx + TOOLDIR=@TOOLDIR@ XLEN=@XLEN@ VLEN=256 REG_TESTS=1 ./tests/riscv/riscv-vector-tests/run-test.sh + + echo "vector tests done!" 
+} + show_usage() { echo "Vortex Regression Test" - echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--all] [--h|--help]" + echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--vector] [--all] [--h|--help]" } declare -a tests=() @@ -439,6 +449,9 @@ while [ "$1" != "" ]; do --synthesis ) tests+=("synthesis") ;; + --vector ) + tests+=("vector") + ;; --all ) tests=() tests+=("unittest") @@ -454,6 +467,7 @@ while [ "$1" != "" ]; do tests+=("scope") tests+=("stress") tests+=("synthesis") + tests+=("vector") ;; -h | --help ) show_usage diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 29eb5c9d8..3badaa3d3 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -87,6 +87,10 @@ `endif `endif +`ifndef VLEN +`define VLEN 256 +`endif + `ifndef NUM_CLUSTERS `define NUM_CLUSTERS 1 `endif diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 048ba0a5c..4c8505e5e 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -188,6 +188,19 @@ `define VX_CSR_MIMPID 12'hF13 `define VX_CSR_MHARTID 12'hF14 +// Vector CSRs + +`define VX_CSR_VSTART 12'h008 +`define VX_CSR_VXSAT 12'h009 +`define VX_CSR_VXRM 12'h00A +`define VX_CSR_VCSR 12'h00F +`define VX_CSR_VL 12'hC20 +`define VX_CSR_VTYPE 12'hC21 +`define VX_CSR_VLENB 12'hC22 +`define VX_CSR_VCYCLE 12'hC00 +`define VX_CSR_VTIME 12'hC01 +`define VX_CSR_VINSTRET 12'hC02 + // GPGU CSRs `define VX_CSR_THREAD_ID 12'hCC0 diff --git a/perf/cache/cache_perf.log b/perf/cache/cache_perf.log index 21a446d25..0a4a55cc8 100644 --- a/perf/cache/cache_perf.log +++ b/perf/cache/cache_perf.log @@ -1,3 +1,3 @@ CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 running: CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE 
-DICACHE_NUM_WAYS=1 make -C ./ci/../driver/rtlsim -verilator --build --exe --cc Vortex --top-module Vortex --language 1800-2009 --assert -Wall -Wpedantic -Wno-DECLFILENAME -Wno-REDEFMACRO --x-initial unique --x-assign unique verilator.vlt -I../../hw/rtl -I../../hw/dpi -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache -I../../hw/rtl/simulate -I../../hw/rtl/fp_cores -I../../third_party/fpnew/src/common_cells/include -I../../third_party/fpnew/src/common_cells/src -I../../third_party/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../third_party/fpnew/src -I../../hw/rtl/tex_unit -I../../hw/rtl/raster_unit -I../../hw/rtl/rop_unit -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -j 64 -DNDEBUG -DIMUL_DPI -DIDIV_DPI -DFPU_DPI ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp ../../hw/dpi/util_dpi.cpp ../../hw/dpi/float_dpi.cpp processor.cpp -CFLAGS '-std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds -fPIC -Wno-maybe-uninitialized -I../../../hw -I../../common -I../../../third_party/softfloat/source/include -I../../../third_party -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -O2 -DNDEBUG' -LDFLAGS '-shared ../../../third_party/softfloat/build/Linux-x86_64-GCC/softfloat.a -L../../../third_party/ramulator -lramulator' -o ../../../driver/rtlsim/librtlsim.so +verilator --build --exe --cc Vortex --top-module Vortex --language 1800-2009 --assert -Wall -Wpedantic -Wno-DECLFILENAME -Wno-REDEFMACRO --x-initial unique --x-assign unique verilator.vlt -I../../hw/rtl -I../../hw/dpi -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache -I../../hw/rtl/simulate -I../../hw/rtl/fp_cores -I../../third_party/fpnew/src/common_cells/include -I../../third_party/fpnew/src/common_cells/src -I../../third_party/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../third_party/fpnew/src -I../../hw/rtl/tex_unit -I../../hw/rtl/raster_unit -I../../hw/rtl/rop_unit -DNUM_CLUSTERS=1 
-DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -j 64 -DNDEBUG -DIMUL_DPI -DIDIV_DPI -DFPU_DPI ../common/util.cpp ../common/mem.cpp ../common/softfloat_ext.cpp ../common/rvfloats.cpp ../../hw/dpi/util_dpi.cpp ../../hw/dpi/float_dpi.cpp processor.cpp -CFLAGS '-std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds -fPIC -Wno-maybe-uninitialized -I../../../hw -I../../common -I../../../third_party/softfloat/source/include -I../../../third_party -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -O2 -DNDEBUG' -LDFLAGS '-shared ../../../third_party/softfloat/build/Linux-x86_64-GCC/softfloat.a -L../../../third_party/ramulator -lramulator' -o ../../../driver/rtlsim/librtlsim.so diff --git a/sim/common/rvfloats.cpp b/sim/common/rvfloats.cpp index 3e577f7f9..2b252010c 100644 --- a/sim/common/rvfloats.cpp +++ b/sim/common/rvfloats.cpp @@ -12,6 +12,7 @@ // limitations under the License. #include "rvfloats.h" +#include "softfloat_ext.h" #include extern "C" { @@ -158,6 +159,34 @@ uint64_t rv_fdiv_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) { return from_float64_t(r); } +uint32_t rv_frecip7_s(uint32_t a, uint32_t frm, uint32_t* fflags) { + rv_init(frm); + auto r = f32_recip7(to_float32_t(a)); + if (fflags) { *fflags = softfloat_exceptionFlags; } + return from_float32_t(r); +} + +uint64_t rv_frecip7_d(uint64_t a, uint32_t frm, uint32_t* fflags) { + rv_init(frm); + auto r = f64_recip7(to_float64_t(a)); + if (fflags) { *fflags = softfloat_exceptionFlags; } + return from_float64_t(r); +} + +uint32_t rv_frsqrt7_s(uint32_t a, uint32_t frm, uint32_t* fflags) { + rv_init(frm); + auto r = f32_rsqrte7(to_float32_t(a)); + if (fflags) { *fflags = softfloat_exceptionFlags; } + return from_float32_t(r); +} + +uint64_t rv_frsqrt7_d(uint64_t a, uint32_t frm, uint32_t* fflags) { + rv_init(frm); + auto r = f64_rsqrte7(to_float64_t(a)); + if 
(fflags) { *fflags = softfloat_exceptionFlags; } + return from_float64_t(r); +} + uint32_t rv_fsqrt_s(uint32_t a, uint32_t frm, uint32_t* fflags) { rv_init(frm); auto r = f32_sqrt(to_float32_t(a)); @@ -486,6 +515,11 @@ uint64_t rv_fsgnjx_d(uint64_t a, uint64_t b) { return r; } +uint32_t rv_dtof_r(uint64_t a, uint32_t frm) { + rv_init(frm); + return rv_dtof(a); +} + uint32_t rv_dtof(uint64_t a) { auto r = f64_to_f32(to_float64_t(a)); return from_float32_t(r); diff --git a/sim/common/rvfloats.h b/sim/common/rvfloats.h index d921846dd..86b60e8ee 100644 --- a/sim/common/rvfloats.h +++ b/sim/common/rvfloats.h @@ -28,6 +28,8 @@ uint32_t rv_fnmadd_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* uint32_t rv_fnmsub_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags); uint32_t rv_fdiv_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags); uint32_t rv_fsqrt_s(uint32_t a, uint32_t frm, uint32_t* fflags); +uint32_t rv_frecip7_s(uint32_t a, uint32_t frm, uint32_t* fflags); +uint32_t rv_frsqrt7_s(uint32_t a, uint32_t frm, uint32_t* fflags); uint32_t rv_ftoi_s(uint32_t a, uint32_t frm, uint32_t* fflags); uint32_t rv_ftou_s(uint32_t a, uint32_t frm, uint32_t* fflags); @@ -58,6 +60,8 @@ uint64_t rv_fsub_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags); uint64_t rv_fmul_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags); uint64_t rv_fdiv_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags); uint64_t rv_fsqrt_d(uint64_t a, uint32_t frm, uint32_t* fflags); +uint64_t rv_frecip7_d(uint64_t a, uint32_t frm, uint32_t* fflags); +uint64_t rv_frsqrt7_d(uint64_t a, uint32_t frm, uint32_t* fflags); uint64_t rv_fmadd_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags); uint64_t rv_fmsub_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags); @@ -85,6 +89,7 @@ uint64_t rv_fmin_d(uint64_t a, uint64_t b, uint32_t* fflags); uint64_t rv_fmax_d(uint64_t a, uint64_t b, uint32_t* fflags); uint32_t rv_dtof(uint64_t 
a); +uint32_t rv_dtof_r(uint64_t a, uint32_t frm); uint64_t rv_ftod(uint32_t a); #ifdef __cplusplus diff --git a/sim/common/softfloat_ext.cpp b/sim/common/softfloat_ext.cpp new file mode 100644 index 000000000..877bdc8ac --- /dev/null +++ b/sim/common/softfloat_ext.cpp @@ -0,0 +1,486 @@ +/*============================================================================ + +This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic +Package, Release 3e, by John R. Hauser. + +Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of +California. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions, and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. Neither the name of the University nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE +DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +=============================================================================*/ + +#include +#include +#include +#include <../RISCV/specialize.h> +#include +#include "softfloat_ext.h" + +uint_fast16_t f16_classify( float16_t a ) +{ + union ui16_f16 uA; + uint_fast16_t uiA; + + uA.f = a; + uiA = uA.ui; + + uint_fast16_t infOrNaN = expF16UI( uiA ) == 0x1F; + uint_fast16_t subnormalOrZero = expF16UI( uiA ) == 0; + bool sign = signF16UI( uiA ); + bool fracZero = fracF16UI( uiA ) == 0; + bool isNaN = isNaNF16UI( uiA ); + bool isSNaN = softfloat_isSigNaNF16UI( uiA ); + + return + ( sign && infOrNaN && fracZero ) << 0 | + ( sign && !infOrNaN && !subnormalOrZero ) << 1 | + ( sign && subnormalOrZero && !fracZero ) << 2 | + ( sign && subnormalOrZero && fracZero ) << 3 | + ( !sign && infOrNaN && fracZero ) << 7 | + ( !sign && !infOrNaN && !subnormalOrZero ) << 6 | + ( !sign && subnormalOrZero && !fracZero ) << 5 | + ( !sign && subnormalOrZero && fracZero ) << 4 | + ( isNaN && isSNaN ) << 8 | + ( isNaN && !isSNaN ) << 9; +} + +uint_fast16_t f32_classify( float32_t a ) +{ + union ui32_f32 uA; + uint_fast32_t uiA; + + uA.f = a; + uiA = uA.ui; + + uint_fast16_t infOrNaN = expF32UI( uiA ) == 0xFF; + uint_fast16_t subnormalOrZero = expF32UI( uiA ) == 0; + bool sign = signF32UI( uiA ); + bool fracZero = fracF32UI( uiA ) == 0; + bool isNaN = isNaNF32UI( uiA ); + bool isSNaN = softfloat_isSigNaNF32UI( uiA ); + + return + ( sign && infOrNaN && fracZero ) << 0 | + ( sign && 
!infOrNaN && !subnormalOrZero ) << 1 | + ( sign && subnormalOrZero && !fracZero ) << 2 | + ( sign && subnormalOrZero && fracZero ) << 3 | + ( !sign && infOrNaN && fracZero ) << 7 | + ( !sign && !infOrNaN && !subnormalOrZero ) << 6 | + ( !sign && subnormalOrZero && !fracZero ) << 5 | + ( !sign && subnormalOrZero && fracZero ) << 4 | + ( isNaN && isSNaN ) << 8 | + ( isNaN && !isSNaN ) << 9; +} + +uint_fast16_t f64_classify( float64_t a ) +{ + union ui64_f64 uA; + uint_fast64_t uiA; + + uA.f = a; + uiA = uA.ui; + + uint_fast16_t infOrNaN = expF64UI( uiA ) == 0x7FF; + uint_fast16_t subnormalOrZero = expF64UI( uiA ) == 0; + bool sign = signF64UI( uiA ); + bool fracZero = fracF64UI( uiA ) == 0; + bool isNaN = isNaNF64UI( uiA ); + bool isSNaN = softfloat_isSigNaNF64UI( uiA ); + + return + ( sign && infOrNaN && fracZero ) << 0 | + ( sign && !infOrNaN && !subnormalOrZero ) << 1 | + ( sign && subnormalOrZero && !fracZero ) << 2 | + ( sign && subnormalOrZero && fracZero ) << 3 | + ( !sign && infOrNaN && fracZero ) << 7 | + ( !sign && !infOrNaN && !subnormalOrZero ) << 6 | + ( !sign && subnormalOrZero && !fracZero ) << 5 | + ( !sign && subnormalOrZero && fracZero ) << 4 | + ( isNaN && isSNaN ) << 8 | + ( isNaN && !isSNaN ) << 9; +} + +static inline uint64_t extract64(uint64_t val, int pos, int len) +{ + assert(pos >= 0 && len > 0 && len <= 64 - pos); + return (val >> pos) & (~UINT64_C(0) >> (64 - len)); +} + +static inline uint64_t make_mask64(int pos, int len) +{ + assert(pos >= 0 && len > 0 && pos < 64 && len <= 64); + return (UINT64_MAX >> (64 - len)) << pos; +} + +//user needs to truncate output to required length +static inline uint64_t rsqrte7(uint64_t val, int e, int s, bool sub) { + uint64_t exp = extract64(val, s, e); + uint64_t sig = extract64(val, 0, s); + uint64_t sign = extract64(val, s + e, 1); + const int p = 7; + + static const uint8_t table[] = { + 52, 51, 50, 48, 47, 46, 44, 43, + 42, 41, 40, 39, 38, 36, 35, 34, + 33, 32, 31, 30, 30, 29, 28, 27, + 26, 25, 24, 
23, 23, 22, 21, 20, + 19, 19, 18, 17, 16, 16, 15, 14, + 14, 13, 12, 12, 11, 10, 10, 9, + 9, 8, 7, 7, 6, 6, 5, 4, + 4, 3, 3, 2, 2, 1, 1, 0, + 127, 125, 123, 121, 119, 118, 116, 114, + 113, 111, 109, 108, 106, 105, 103, 102, + 100, 99, 97, 96, 95, 93, 92, 91, + 90, 88, 87, 86, 85, 84, 83, 82, + 80, 79, 78, 77, 76, 75, 74, 73, + 72, 71, 70, 70, 69, 68, 67, 66, + 65, 64, 63, 63, 62, 61, 60, 59, + 59, 58, 57, 56, 56, 55, 54, 53}; + + if (sub) { + while (extract64(sig, s - 1, 1) == 0) + exp--, sig <<= 1; + + sig = (sig << 1) & make_mask64(0 ,s); + } + + int idx = ((exp & 1) << (p-1)) | (sig >> (s-p+1)); + uint64_t out_sig = (uint64_t)(table[idx]) << (s-p); + uint64_t out_exp = (3 * make_mask64(0, e - 1) + ~exp) / 2; + + return (sign << (s+e)) | (out_exp << s) | out_sig; +} + +float16_t f16_rsqrte7(float16_t in) +{ + union ui16_f16 uA; + + uA.f = in; + unsigned int ret = f16_classify(in); + bool sub = false; + switch(ret) { + case 0x001: // -inf + case 0x002: // -normal + case 0x004: // -subnormal + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: //qNaN + uA.ui = defaultNaNF16UI; + break; + case 0x008: // -0 + uA.ui = 0xfc00; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7c00; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +num + uA.ui = rsqrte7(uA.ui, 5, 10, sub); + break; + } + + return uA.f; +} + +float32_t f32_rsqrte7(float32_t in) +{ + union ui32_f32 uA; + + uA.f = in; + unsigned int ret = f32_classify(in); + bool sub = false; + switch(ret) { + case 0x001: // -inf + case 0x002: // -normal + case 0x004: // -subnormal + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: //qNaN + uA.ui = defaultNaNF32UI; + break; + case 0x008: // -0 + uA.ui = 0xff800000; + 
softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7f800000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +num + uA.ui = rsqrte7(uA.ui, 8, 23, sub); + break; + } + + return uA.f; +} + +float64_t f64_rsqrte7(float64_t in) +{ + union ui64_f64 uA; + + uA.f = in; + unsigned int ret = f64_classify(in); + bool sub = false; + switch(ret) { + case 0x001: // -inf + case 0x002: // -normal + case 0x004: // -subnormal + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: //qNaN + uA.ui = defaultNaNF64UI; + break; + case 0x008: // -0 + uA.ui = 0xfff0000000000000ul; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7ff0000000000000ul; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +num + uA.ui = rsqrte7(uA.ui, 11, 52, sub); + break; + } + + return uA.f; +} + +//user needs to truncate output to required length +static inline uint64_t recip7(uint64_t val, int e, int s, int rm, bool sub, + bool *round_abnormal) +{ + uint64_t exp = extract64(val, s, e); + uint64_t sig = extract64(val, 0, s); + uint64_t sign = extract64(val, s + e, 1); + const int p = 7; + + static const uint8_t table[] = { + 127, 125, 123, 121, 119, 117, 116, 114, + 112, 110, 109, 107, 105, 104, 102, 100, + 99, 97, 96, 94, 93, 91, 90, 88, + 87, 85, 84, 83, 81, 80, 79, 77, + 76, 75, 74, 72, 71, 70, 69, 68, + 66, 65, 64, 63, 62, 61, 60, 59, + 58, 57, 56, 55, 54, 53, 52, 51, + 50, 49, 48, 47, 46, 45, 44, 43, + 42, 41, 40, 40, 39, 38, 37, 36, + 35, 35, 34, 33, 32, 31, 31, 30, + 29, 28, 28, 27, 26, 25, 25, 24, + 23, 23, 22, 21, 21, 20, 19, 19, + 18, 17, 17, 16, 15, 15, 14, 14, + 13, 12, 12, 11, 11, 10, 9, 9, + 8, 8, 7, 
7, 6, 5, 5, 4, + 4, 3, 3, 2, 2, 1, 1, 0}; + + if (sub) { + while (extract64(sig, s - 1, 1) == 0) + exp--, sig <<= 1; + + sig = (sig << 1) & make_mask64(0 ,s); + + if (exp != 0 && exp != UINT64_MAX) { + *round_abnormal = true; + if (rm == 1 || + (rm == 2 && !sign) || + (rm == 3 && sign)) + return ((sign << (s+e)) | make_mask64(s, e)) - 1; + else + return (sign << (s+e)) | make_mask64(s, e); + } + } + + int idx = sig >> (s-p); + uint64_t out_sig = (uint64_t)(table[idx]) << (s-p); + uint64_t out_exp = 2 * make_mask64(0, e - 1) + ~exp; + if (out_exp == 0 || out_exp == UINT64_MAX) { + out_sig = (out_sig >> 1) | make_mask64(s - 1, 1); + if (out_exp == UINT64_MAX) { + out_sig >>= 1; + out_exp = 0; + } + } + + return (sign << (s+e)) | (out_exp << s) | out_sig; +} + +float16_t f16_recip7(float16_t in) +{ + union ui16_f16 uA; + + uA.f = in; + unsigned int ret = f16_classify(in); + bool sub = false; + bool round_abnormal = false; + switch(ret) { + case 0x001: // -inf + uA.ui = 0x8000; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x008: // -0 + uA.ui = 0xfc00; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7c00; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: //qNaN + uA.ui = defaultNaNF16UI; + break; + case 0x004: // -subnormal + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +- normal + uA.ui = recip7(uA.ui, 5, 10, + softfloat_roundingMode, sub, &round_abnormal); + if (round_abnormal) + softfloat_exceptionFlags |= softfloat_flag_inexact | + softfloat_flag_overflow; + break; + } + + return uA.f; +} + +float32_t f32_recip7(float32_t in) +{ + union ui32_f32 uA; + + uA.f = in; + unsigned int ret = f32_classify(in); + bool sub = false; + bool round_abnormal = false; + switch(ret) { + case 0x001: // -inf + uA.ui = 0x80000000; + break; + case 0x080: //+inf + uA.ui = 0x0; + 
break; + case 0x008: // -0 + uA.ui = 0xff800000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7f800000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: //qNaN + uA.ui = defaultNaNF32UI; + break; + case 0x004: // -subnormal + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +- normal + uA.ui = recip7(uA.ui, 8, 23, + softfloat_roundingMode, sub, &round_abnormal); + if (round_abnormal) + softfloat_exceptionFlags |= softfloat_flag_inexact | + softfloat_flag_overflow; + break; + } + + return uA.f; +} + +float64_t f64_recip7(float64_t in) +{ + union ui64_f64 uA; + + uA.f = in; + unsigned int ret = f64_classify(in); + bool sub = false; + bool round_abnormal = false; + switch(ret) { + case 0x001: // -inf + uA.ui = 0x8000000000000000; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x008: // -0 + uA.ui = 0xfff0000000000000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7ff0000000000000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: //qNaN + uA.ui = defaultNaNF64UI; + break; + case 0x004: // -subnormal + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +- normal + uA.ui = recip7(uA.ui, 11, 52, + softfloat_roundingMode, sub, &round_abnormal); + if (round_abnormal) + softfloat_exceptionFlags |= softfloat_flag_inexact | + softfloat_flag_overflow; + break; + } + + return uA.f; +} \ No newline at end of file diff --git a/sim/common/softfloat_ext.h b/sim/common/softfloat_ext.h new file mode 100644 index 000000000..7a18af9f7 --- /dev/null +++ b/sim/common/softfloat_ext.h @@ -0,0 +1,14 @@ +#include +#include + +uint_fast16_t f16_classify( float16_t ); +float16_t f16_rsqrte7( float16_t ); 
+float16_t f16_recip7( float16_t ); + +uint_fast16_t f32_classify( float32_t ); +float32_t f32_rsqrte7( float32_t ); +float32_t f32_recip7( float32_t ); + +uint_fast16_t f64_classify( float64_t ); +float64_t f64_rsqrte7( float64_t ); +float64_t f64_recip7( float64_t ); \ No newline at end of file diff --git a/sim/opaesim/Makefile b/sim/opaesim/Makefile index b04f8ddb4..49b0f4ab8 100644 --- a/sim/opaesim/Makefile +++ b/sim/opaesim/Makefile @@ -51,7 +51,7 @@ endif DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS) -SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp +SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp SRCS += $(SRC_DIR)/fpga.cpp $(SRC_DIR)/opae_sim.cpp diff --git a/sim/rtlsim/Makefile b/sim/rtlsim/Makefile index ecaee717b..3903bbd85 100644 --- a/sim/rtlsim/Makefile +++ b/sim/rtlsim/Makefile @@ -35,7 +35,7 @@ ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) endif RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE) -SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp +SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp SRCS += $(SRC_DIR)/processor.cpp diff --git a/sim/simx/Makefile b/sim/simx/Makefile index 31fde7023..b97e9c00f 100644 --- a/sim/simx/Makefile +++ b/sim/simx/Makefile @@ -17,8 +17,8 @@ CXXFLAGS += $(CONFIGS) LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a LDFLAGS += -Wl,-rpath,$(THIRD_PARTY_DIR)/ramulator -L$(THIRD_PARTY_DIR)/ramulator -lramulator -SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp 
$(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp -SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp +SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp +SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/execute_vector.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp # Debugging ifdef DEBUG diff --git a/sim/simx/arch.h b/sim/simx/arch.h index 6becf5c91..d68345db6 100644 --- a/sim/simx/arch.h +++ b/sim/simx/arch.h @@ -29,6 +29,7 @@ class Arch { uint16_t num_cores_; uint16_t num_clusters_; uint16_t socket_size_; + uint16_t vsize_; uint16_t num_barriers_; uint64_t local_mem_base_; @@ -39,6 +40,7 @@ class Arch { , num_cores_(num_cores) , num_clusters_(NUM_CLUSTERS) , socket_size_(SOCKET_SIZE) + , vsize_(VLEN / 8) , num_barriers_(NUM_BARRIERS) , local_mem_base_(LMEM_BASE_ADDR) {} @@ -71,6 +73,10 @@ class Arch { return socket_size_; } + uint16_t vsize() const { + return vsize_; + } + }; } \ No newline at end of file diff --git a/sim/simx/decode.cpp b/sim/simx/decode.cpp index 7a37e79e2..3c184879d 100644 --- a/sim/simx/decode.cpp +++ b/sim/simx/decode.cpp @@ -47,6 +47,7 @@ static const std::unordered_map sc_instTable = { {Opcode::FMSUB, InstType::R4}, {Opcode::FMNMADD, InstType::R4}, {Opcode::FMNMSUB, InstType::R4}, + {Opcode::VSET, InstType::V}, {Opcode::EXT1, InstType::R}, {Opcode::EXT2, InstType::R4}, {Opcode::R_W, 
InstType::R}, @@ -54,33 +55,6 @@ static const std::unordered_map sc_instTable = { {Opcode::TCU, InstType::I}, }; -enum Constants { - width_opcode= 7, - width_reg = 5, - width_func2 = 2, - width_func3 = 3, - width_func7 = 7, - width_i_imm = 12, - width_j_imm = 20, - - shift_opcode= 0, - shift_rd = width_opcode, - shift_func3 = shift_rd + width_reg, - shift_rs1 = shift_func3 + width_func3, - shift_rs2 = shift_rs1 + width_reg, - shift_func2 = shift_rs2 + width_reg, - shift_func7 = shift_rs2 + width_reg, - shift_rs3 = shift_func7 + width_func2, - - mask_opcode = (1 << width_opcode) - 1, - mask_reg = (1 << width_reg) - 1, - mask_func2 = (1 << width_func2) - 1, - mask_func3 = (1 << width_func3) - 1, - mask_func7 = (1 << width_func7) - 1, - mask_i_imm = (1 << width_i_imm) - 1, - mask_j_imm = (1 << width_j_imm) - 1, -}; - static const char* op_string(const Instr &instr) { auto opcode = instr.getOpcode(); auto func2 = instr.getFunc2(); @@ -230,10 +204,14 @@ static const char* op_string(const Instr &instr) { case Opcode::FENCE: return "FENCE"; case Opcode::FL: switch (func3) { - case 0x1: return "VL"; case 0x2: return "FLW"; case 0x3: return "FLD"; + case 0x0: return "VL8"; + case 0x5: return "VL16"; + case 0x6: return "VL32"; + case 0x7: return "VL64"; default: + std::cout << "Could not decode float/vector load with func3: " << func3 << std::endl; std::abort(); } case Opcode::FS: @@ -241,7 +219,12 @@ static const char* op_string(const Instr &instr) { case 0x1: return "VS"; case 0x2: return "FSW"; case 0x3: return "FSD"; + case 0x0: return "VS8"; + case 0x5: return "VS16"; + case 0x6: return "VS32"; + case 0x7: return "VS64"; default: + std::cout << "Could not decode float/vector store with func3: " << func3 << std::endl; std::abort(); } case Opcode::AMO: { @@ -390,6 +373,7 @@ static const char* op_string(const Instr &instr) { case Opcode::FMSUB: return func2 ? "FMSUB.D" : "FMSUB.S"; case Opcode::FMNMADD: return func2 ? 
"FNMADD.D" : "FNMADD.S"; case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S"; + case Opcode::VSET: return "VSET"; case Opcode::EXT1: switch (func7) { case 0: @@ -421,6 +405,39 @@ static const char* op_string(const Instr &instr) { } } +inline void vec_log(std::ostream &os, const Instr &instr) { + if (instr.getVUseMask() & set_func3) + os << ", func3:" << instr.getFunc3(); + if (instr.getVUseMask() & set_func6) + os << ", func6:" << instr.getFunc6(); + if (instr.getVUseMask() & set_imm) + os << ", imm:" << instr.getImm(); + if (instr.getVUseMask() & set_vlswidth) + os << ", width:" << instr.getVlsWidth(); + if (instr.getVUseMask() & set_vmop) + os << ", mop:" << instr.getVmop(); + if (instr.getVUseMask() & set_vumop) + os << ", umop:" << instr.getVumop(); + if (instr.getVUseMask() & set_vnf) + os << ", nf:" << instr.getVnf(); + if (instr.getVUseMask() & set_vmask) + os << ", vmask:" << instr.getVmask(); + if (instr.getVUseMask() & set_vs3) + os << ", vs3:" << instr.getVs3(); + if (instr.getVUseMask() & set_zimm) + os << ", zimm:" << ((instr.hasZimm()) ? 
"true" : "false"); + if (instr.getVUseMask() & set_vlmul) + os << ", lmul:" << instr.getVlmul(); + if (instr.getVUseMask() & set_vsew) + os << ", sew:" << instr.getVsew(); + if (instr.getVUseMask() & set_vta) + os << ", ta:" << instr.getVta(); + if (instr.getVUseMask() & set_vma) + os << ", ma:" << instr.getVma(); + if (instr.getVUseMask() & set_vediv) + os << ", ediv:" << instr.getVediv(); +} + namespace vortex { std::ostream &operator<<(std::ostream &os, const Instr &instr) { os << op_string(instr); @@ -441,6 +458,13 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) { if (sep++ != 0) { os << ", "; } else { os << " "; } os << "0x" << std::hex << instr.getImm() << std::dec; } + if (instr.getOpcode() == Opcode::SYS && instr.getFunc3() >= 5) { + // CSRs with immediate values + if (sep++ != 0) { os << ", "; } else { os << " "; } + os << "0x" << std::hex << instr.getRSrc(0) << std::dec; + } + // Log vector-specific vtype and vreg info + if (instr.isVec()) vec_log(os, instr); return os; } } @@ -452,6 +476,7 @@ std::shared_ptr Emulator::decode(uint32_t code) const { auto func2 = (code >> shift_func2) & mask_func2; auto func3 = (code >> shift_func3) & mask_func3; + auto func6 = (code >> shift_func6) & mask_func6; auto func7 = (code >> shift_func7) & mask_func7; auto rd = (code >> shift_rd) & mask_reg; @@ -466,6 +491,12 @@ std::shared_ptr Emulator::decode(uint32_t code) const { } auto iType = op_it->second; + if (op == Opcode::FL || op == Opcode::FS) { + if (func3 != 0x2 && func3 != 0x3) { + iType = InstType::V; + } + } + switch (iType) { case InstType::R: switch (op) { @@ -659,7 +690,104 @@ std::shared_ptr Emulator::decode(uint32_t code) const { auto imm = (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20); instr->setImm(sext(imm, width_j_imm+1)); } break; + + case InstType::V: + instr->setVec(true); + switch (op) { + case Opcode::VSET: { + instr->setDestReg(rd, RegType::Integer); + instr->setFunc3(func3); + switch (func3) { + case 7: { + if (code 
>> (shift_vset - 1) == 0b10) { // vsetvl + instr->addSrcReg(rs1, RegType::Integer); + instr->addSrcReg(rs2, RegType::Integer); + } else { + auto zimm = (code >> shift_rs2) & mask_v_zimm; + instr->setZimm(true); + instr->setVlmul(zimm & mask_v_lmul); + instr->setVsew((zimm >> shift_v_sew) & mask_v_sew); + instr->setVta((zimm >> shift_v_ta) & mask_v_ta); + instr->setVma((zimm >> shift_v_ma) & mask_v_ma); + if ((code >> shift_vset)) { // vsetivli + instr->setImm(rs1); + } else { // vsetvli + instr->addSrcReg(rs1, RegType::Integer); + } + } + } break; + case 3: { // Vector - immediate arithmetic instructions + instr->setDestReg(rd, RegType::Vector); + instr->addSrcReg(rs2, RegType::Vector); + instr->setImm(rs1); + instr->setVmask((code >> shift_func7) & 0x1); + instr->setFunc6(func6); + } break; + default: { // Vector - vector/scalar arithmetic instructions + if (func3 == 1 && func6 == 16) { + instr->setDestReg(rd, RegType::Float); + } else if (func3 == 2 && func6 == 16) { + instr->setDestReg(rd, RegType::Integer); + } else { + instr->setDestReg(rd, RegType::Vector); + } + instr->addSrcReg(rs1, RegType::Vector); + instr->addSrcReg(rs2, RegType::Vector); + instr->setVmask((code >> shift_func7) & 0x1); + instr->setFunc6(func6); + } + } + } break; + + case Opcode::FL: + instr->addSrcReg(rs1, RegType::Integer); + instr->setVmop((code >> shift_vmop) & 0b11); + switch (instr->getVmop()) { + case 0b00: + instr->setVumop(rs2); + break; + case 0b10: + instr->addSrcReg(rs2, RegType::Integer); + break; + case 0b01: + case 0b11: + instr->addSrcReg(rs2, RegType::Vector); + break; + } + instr->setVsew(func3 & 0x3); + instr->setDestReg(rd, RegType::Vector); + instr->setVlsWidth(func3); + instr->setVmask((code >> shift_func7) & 0x1); + instr->setVnf((code >> shift_vnf) & mask_func3); + break; + case Opcode::FS: + instr->addSrcReg(rs1, RegType::Integer); + instr->setVmop((code >> shift_vmop) & 0b11); + switch (instr->getVmop()) { + case 0b00: + instr->setVumop(rs2); + break; + case 
0b10: + instr->addSrcReg(rs2, RegType::Integer); + break; + case 0b01: + case 0b11: + instr->addSrcReg(rs2, RegType::Vector); + break; + } + instr->setVsew(func3 & 0x3); + instr->addSrcReg(rd, RegType::Vector); + instr->setVlsWidth(func3); + instr->setVmask((code >> shift_func7) & 0x1); + instr->setVmop((code >> shift_vmop) & 0b11); + instr->setVnf((code >> shift_vnf) & mask_func3); + break; + + default: + std::abort(); + } + break; case InstType::R4: instr->setDestReg(rd, RegType::Float); instr->addSrcReg(rs1, RegType::Float); diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index 05b3497c4..14cb979d4 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -33,6 +33,7 @@ using namespace vortex; Emulator::warp_t::warp_t(const Arch& arch) : ireg_file(arch.num_threads(), std::vector(MAX_NUM_REGS)) , freg_file(arch.num_threads(), std::vector(MAX_NUM_REGS)) + , vreg_file(MAX_NUM_REGS, std::vector(arch.vsize())) , uuid(0) {} @@ -64,6 +65,26 @@ void Emulator::warp_t::clear(uint64_t startup_addr) { #endif } } + + for (auto& reg_file : this->vreg_file) { + for (auto& reg : reg_file) { + #ifndef NDEBUG + reg = 0; + #else + reg = std::rand(); + #endif + } + } + + for (auto& reg_file : this->vreg_file) { + for (auto& reg : reg_file) { + #ifndef NDEBUG + reg = 0; + #else + reg = std::rand(); + #endif + } + } } /////////////////////////////////////////////////////////////////////////////// @@ -79,7 +100,12 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core) // considered to be big enough to hold input tiles for one output tile. // In future versions, scratchpad size should be fixed to an appropriate value. 
, scratchpad(std::vector(32 * 32 * 32768)) + , csrs_(arch.num_warps()) { + for (uint32_t i = 0; i < arch_.num_warps(); ++i) { + csrs_.at(i).resize(arch.num_threads()); + } + this->clear(); } @@ -463,6 +489,32 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_FFLAGS: return warps_.at(wid).fcsr & 0x1F; case VX_CSR_FRM: return (warps_.at(wid).fcsr >> 5); case VX_CSR_FCSR: return warps_.at(wid).fcsr; + + // Vector CRSs + case VX_CSR_VSTART: + return csrs_.at(wid).at(tid)[VX_CSR_VSTART]; + case VX_CSR_VXSAT: + return csrs_.at(wid).at(tid)[VX_CSR_VXSAT]; + case VX_CSR_VXRM: + return csrs_.at(wid).at(tid)[VX_CSR_VXRM]; + case VX_CSR_VCSR: { + Word vxsat = csrs_.at(wid).at(tid)[VX_CSR_VXSAT]; + Word vxrm = csrs_.at(wid).at(tid)[VX_CSR_VXRM]; + return (vxrm << 1) | vxsat; + } + case VX_CSR_VL: + return csrs_.at(wid).at(tid)[VX_CSR_VL]; + case VX_CSR_VTYPE: + return csrs_.at(wid).at(tid)[VX_CSR_VTYPE]; + case VX_CSR_VLENB: + return VLEN / 8; + case VX_CSR_VCYCLE: + return csrs_.at(wid).at(tid)[VX_CSR_VCYCLE]; + case VX_CSR_VTIME: + return csrs_.at(wid).at(tid)[VX_CSR_VTIME]; + case VX_CSR_VINSTRET: + return csrs_.at(wid).at(tid)[VX_CSR_VINSTRET]; + case VX_CSR_MHARTID: return (core_->id() * arch_.num_warps() + wid) * arch_.num_threads() + tid; case VX_CSR_THREAD_ID: return tid; case VX_CSR_WARP_ID: return wid; @@ -578,6 +630,29 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) { case VX_CSR_MSCRATCH: csr_mscratch_ = value; break; + + // Vector CRSs + case VX_CSR_VSTART: + csrs_.at(wid).at(tid)[VX_CSR_VSTART] = value; + break; + case VX_CSR_VXSAT: + csrs_.at(wid).at(tid)[VX_CSR_VXSAT] = value & 0b1; + break; + case VX_CSR_VXRM: + csrs_.at(wid).at(tid)[VX_CSR_VXRM] = value & 0b11; + break; + case VX_CSR_VCSR: + csrs_.at(wid).at(tid)[VX_CSR_VXSAT] = value & 0b1; + csrs_.at(wid).at(tid)[VX_CSR_VXRM] = (value >> 1) & 0b11; + break; + case VX_CSR_VL: // read only, written by vset(i)vl(i) + 
csrs_.at(wid).at(tid)[VX_CSR_VL] = value; + break; + case VX_CSR_VTYPE: // read only, written by vset(i)vl(i) + csrs_.at(wid).at(tid)[VX_CSR_VTYPE] = value; + break; + case VX_CSR_VLENB: // read only, set to VLEN / 8 + case VX_CSR_SATP: #ifdef VM_ENABLE // warps_.at(wid).fcsr = (warps_.at(wid).fcsr & ~0x1F) | (value & 0x1F); diff --git a/sim/simx/emulator.h b/sim/simx/emulator.h index 5f1b91d5d..ffe630c3d 100644 --- a/sim/simx/emulator.h +++ b/sim/simx/emulator.h @@ -28,6 +28,76 @@ class Core; class Instr; class instr_trace_t; +enum Constants { + width_opcode= 7, + width_reg = 5, + width_func2 = 2, + width_func3 = 3, + width_func6 = 6, + width_func7 = 7, + width_mop = 3, + width_vmask = 1, + width_i_imm = 12, + width_j_imm = 20, + width_v_zimm = 11, + width_v_ma = 1, + width_v_ta = 1, + width_v_sew = 3, + width_v_lmul = 3, + width_aq = 1, + width_rl = 1, + + shift_opcode= 0, + shift_rd = width_opcode, + shift_func3 = shift_rd + width_reg, + shift_rs1 = shift_func3 + width_func3, + shift_rs2 = shift_rs1 + width_reg, + shift_func2 = shift_rs2 + width_reg, + shift_func7 = shift_rs2 + width_reg, + shift_rs3 = shift_func7 + width_func2, + shift_vmop = shift_func7 + width_vmask, + shift_vnf = shift_vmop + width_mop, + shift_func6 = shift_func7 + width_vmask, + shift_vset = shift_func7 + width_func6, + shift_v_sew = width_v_lmul, + shift_v_ta = shift_v_sew + width_v_sew, + shift_v_ma = shift_v_ta + width_v_ta, + + mask_opcode = (1 << width_opcode) - 1, + mask_reg = (1 << width_reg) - 1, + mask_func2 = (1 << width_func2) - 1, + mask_func3 = (1 << width_func3) - 1, + mask_func6 = (1 << width_func6) - 1, + mask_func7 = (1 << width_func7) - 1, + mask_i_imm = (1 << width_i_imm) - 1, + mask_j_imm = (1 << width_j_imm) - 1, + mask_v_zimm = (1 << width_v_zimm) - 1, + mask_v_ma = (1 << width_v_ma) - 1, + mask_v_ta = (1 << width_v_ta) - 1, + mask_v_sew = (1 << width_v_sew) - 1, + mask_v_lmul = (1 << width_v_lmul) - 1, +}; + +struct vtype { + uint32_t vill; + uint32_t vma; + uint32_t 
vta; + uint32_t vsew; + uint32_t vlmul; +}; + +union reg_data_t { + Word u; + WordI i; + WordF f; + float f32; + double f64; + uint32_t u32; + uint64_t u64; + int32_t i32; + int64_t i64; +}; + class Emulator { public: Emulator(const Arch &arch, @@ -61,6 +131,10 @@ class Emulator { Word get_tc_size(); Word get_tc_num(); + void dcache_read(void* data, uint64_t addr, uint32_t size); + + void dcache_write(const void* data, uint64_t addr, uint32_t size); + private: struct ipdom_entry_t { @@ -85,9 +159,14 @@ class Emulator { ThreadMask tmask; std::vector> ireg_file; std::vector>freg_file; + std::vector> vreg_file; std::stack ipdom_stack; Byte fcsr; uint32_t uuid; + + struct vtype vtype; + uint32_t vl; + Word VLMAX; }; struct wspawn_t { @@ -100,11 +179,13 @@ class Emulator { void execute(const Instr &instr, uint32_t wid, instr_trace_t *trace); - void icache_read(void* data, uint64_t addr, uint32_t size); + void executeVector(const Instr &instr, uint32_t wid, std::vector &rsdata, std::vector &rddata); - void dcache_read(void* data, uint64_t addr, uint32_t size); + void loadVector(const Instr &instr, uint32_t wid, std::vector &rsdata); - void dcache_write(const void* data, uint64_t addr, uint32_t size); + void storeVector(const Instr &instr, uint32_t wid, std::vector &rsdata); + + void icache_read(void* data, uint64_t addr, uint32_t size); void dcache_amo_reserve(uint64_t addr); @@ -142,6 +223,7 @@ class Emulator { uint32_t mat_size; uint32_t tc_size; uint32_t tc_num; + std::vector>> csrs_; }; } diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index dd8253571..d477a1d45 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -25,22 +25,11 @@ #include "emulator.h" #include "instr.h" #include "core.h" +#include "processor_impl.h" #include "VX_types.h" using namespace vortex; -union reg_data_t { - Word u; - WordI i; - WordF f; - float f32; - double f64; - uint32_t u32; - uint64_t u64; - int32_t i32; - int64_t i64; -}; - inline uint64_t nan_box(uint32_t value) { 
return value | 0xffffffff00000000; } @@ -128,6 +117,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { } DPN(2, "}" << std::endl); break; + case RegType::Vector: + break; default: break; } @@ -678,41 +669,47 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { trace->src_regs[0] = {RegType::Integer, rsrc0}; auto trace_data = std::make_shared(num_threads); trace->data = trace_data; - uint32_t data_bytes = 1 << (func3 & 0x3); - uint32_t data_width = 8 * data_bytes; - for (uint32_t t = thread_start; t < num_threads; ++t) { - if (!warp.tmask.test(t)) - continue; - uint64_t mem_addr = rsdata[t][0].i + immsrc; - uint64_t read_data = 0; - this->dcache_read(&read_data, mem_addr, data_bytes); - trace_data->mem_addrs.at(t) = {mem_addr, data_bytes}; - switch (func3) { - case 0: // RV32I: LB - case 1: // RV32I: LH - rddata[t].i = sext((Word)read_data, data_width); - break; - case 2: - if (opcode == Opcode::L) { - // RV32I: LW + if ((opcode == Opcode::L ) + || (opcode == Opcode::FL && func3 == 2) + || (opcode == Opcode::FL && func3 == 3)) { + uint32_t data_bytes = 1 << (func3 & 0x3); + uint32_t data_width = 8 * data_bytes; + for (uint32_t t = thread_start; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint64_t mem_addr = rsdata[t][0].i + immsrc; + uint64_t read_data = 0; + this->dcache_read(&read_data, mem_addr, data_bytes); + trace_data->mem_addrs.at(t) = {mem_addr, data_bytes}; + switch (func3) { + case 0: // RV32I: LB + case 1: // RV32I: LH rddata[t].i = sext((Word)read_data, data_width); - } else { - // RV32F: FLW - rddata[t].u64 = nan_box((uint32_t)read_data); + break; + case 2: + if (opcode == Opcode::L) { + // RV32I: LW + rddata[t].i = sext((Word)read_data, data_width); + } else { + // RV32F: FLW + rddata[t].u64 = nan_box((uint32_t)read_data); + } + break; + case 3: // RV64I: LD + // RV32D: FLD + case 4: // RV32I: LBU + case 5: // RV32I: LHU + case 6: // RV64I: LWU + rddata[t].u64 = 
read_data; + break; + default: + std::abort(); } - break; - case 3: // RV64I: LD - // RV32D: FLD - case 4: // RV32I: LBU - case 5: // RV32I: LHU - case 6: // RV64I: LWU - rddata[t].u64 = read_data; - break; - default: - std::abort(); } + rd_write = true; + } else { + loadVector(instr, wid, rsdata); } - rd_write = true; break; } case Opcode::S: @@ -724,23 +721,29 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { trace->src_regs[1] = {data_type, rsrc1}; auto trace_data = std::make_shared(num_threads); trace->data = trace_data; - uint32_t data_bytes = 1 << (func3 & 0x3); - for (uint32_t t = thread_start; t < num_threads; ++t) { - if (!warp.tmask.test(t)) - continue; - uint64_t mem_addr = rsdata[t][0].i + immsrc; - uint64_t write_data = rsdata[t][1].u64; - trace_data->mem_addrs.at(t) = {mem_addr, data_bytes}; - switch (func3) { - case 0: - case 1: - case 2: - case 3: - this->dcache_write(&write_data, mem_addr, data_bytes); - break; - default: - std::abort(); + if ((opcode == Opcode::S) + || (opcode == Opcode::FS && func3 == 2) + || (opcode == Opcode::FS && func3 == 3)) { + uint32_t data_bytes = 1 << (func3 & 0x3); + for (uint32_t t = thread_start; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint64_t mem_addr = rsdata[t][0].i + immsrc; + uint64_t write_data = rsdata[t][1].u64; + trace_data->mem_addrs.at(t) = {mem_addr, data_bytes}; + switch (func3) { + case 0: + case 1: + case 2: + case 3: + this->dcache_write(&write_data, mem_addr, data_bytes); + break; + default: + std::abort(); + } } + } else { + storeVector(instr, wid, rsdata); } break; } @@ -925,7 +928,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { for (uint32_t t = thread_start; t < num_threads; ++t) { if (!warp.tmask.test(t)) continue; - uint32_t frm = this->get_fpu_rm(func3, t, wid); + uint32_t frm = (func3 == 0x7) ? 
this->get_csr(VX_CSR_FRM, t, wid) : func3; uint32_t fflags = 0; switch (func7) { case 0x00: { // RV32F: FADD.S @@ -1240,7 +1243,10 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { break; } } - this->update_fcrs(fflags, t, wid); + if (fflags) { + this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, t, wid) | fflags, t, wid); + this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, t, wid) | fflags, t, wid); + } } rd_write = true; break; @@ -1294,7 +1300,10 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { default: break; } - this->update_fcrs(fflags, t, wid); + if (fflags) { + this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, t, wid) | fflags, t, wid); + this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, t, wid) | fflags, t, wid); + } } rd_write = true; break; @@ -1586,6 +1595,13 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { std::abort(); } } break; + case Opcode::VSET: { + auto func6 = instr.getFunc6(); + if ((func3 == 0x7) || (func3 == 0x2 && func6 == 16) || (func3 == 0x1 && func6 == 16)) { + rd_write = true; + } + executeVector(instr, wid, rsdata, rddata); + } break; default: std::abort(); } @@ -1629,6 +1645,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { trace->dst_reg = {type, rdest}; break; default: + std::cout << "Unrecognized register write back type: " << type << std::endl; std::abort(); break; } diff --git a/sim/simx/execute_vector.cpp b/sim/simx/execute_vector.cpp new file mode 100644 index 000000000..3b2d585db --- /dev/null +++ b/sim/simx/execute_vector.cpp @@ -0,0 +1,4493 @@ +// This is a fork of https://github.com/troibe/vortex/tree/simx-v2-vector +// The purpose of this fork is to make the simx-v2-vector up to date with master +// Thanks to Troibe for his amazing work + +#include +#include +#include +#include +#include +#include "emulator.h" +#include "instr.h" +#include "processor_impl.h" + 
// Declared here so that the using-directive below is well-formed on its own.
namespace vortex {}
using namespace vortex;

// Element-wise functors for the vector unit. Every functor exposes a static
// apply() that computes a single result element and a static name() that
// returns the functor's label. T is the operand element type, R the result
// element type.
// NOTE(review): Sub/Sbc compute `second - first`, so `first` is presumably the
// vs1/rs1/immediate operand and `second` the vs2 operand -- confirm with callers.

// first + second, computed in the result type R.
template <typename T, typename R>
class Add {
  public:
    static R apply(T first, T second, R) {
      return (R)first + (R)second;
    }
    static std::string name() {return "Add";}
};

// second - first, computed in the result type R.
template <typename T, typename R>
class Sub {
  public:
    static R apply(T first, T second, R) {
      return (R)second - (R)first;
    }
    static std::string name() {return "Sub";}
};

// first + second + third, where third carries the incoming carry.
template <typename T, typename R>
class Adc {
  public:
    static R apply(T first, T second, R third) {
      return (R)first + (R)second + third;
    }
    static std::string name() {return "Adc";}
};

// Carry-out of first + second + third: non-zero when the sum exceeds the
// largest value representable in T.
template <typename T, typename R>
class Madc {
  public:
    static R apply(T first, T second, R third) {
      return (R)first + (R)second + third > (R)std::numeric_limits<T>::max();
    }
    static std::string name() {return "Madc";}
};

// second - first - third, where third carries the incoming borrow.
template <typename T, typename R>
class Sbc {
  public:
    static R apply(T first, T second, R third) {
      return (R)second - (R)first - third;
    }
    static std::string name() {return "Sbc";}
};

// Borrow-out of second - first - third.
template <typename T, typename R>
class Msbc {
  public:
    static R apply(T first, T second, R third) {
      return (R)second < (R)first + third;
    }
    static std::string name() {return "Msbc";}
};

// Saturating signed subtraction: the difference is formed in T and clamped to
// the range of R. vxsat_ is OR-ed with 1 when clamping changed the value.
template <typename T, typename R>
class Ssub {
  public:
    static R apply(T first, T second, uint32_t, uint32_t &vxsat_) {
      // rounding mode is not relevant for this operation
      T unclippedResult = second - first;
      R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits<R>::min(), (T)std::numeric_limits<R>::max());
      vxsat_ |= clippedResult != unclippedResult;
      return clippedResult;
    }
    static std::string name() {return "Ssub";}
};

// Saturating unsigned subtraction: yields 0 when first > second.
// vxsat_ is only ever OR-ed (as in Ssub/Sadd) so that a saturation recorded for
// an earlier element is not wiped out by a later element that does not saturate.
template <typename T, typename R>
class Ssubu {
  public:
    static R apply(T first, T second, uint32_t, uint32_t &vxsat_) {
      // rounding mode is not relevant for this operation
      if (first > second) {
        vxsat_ |= 1u;
        return 0;
      }
      return second - first;
    }
    static std::string name() {return "Ssubu";}
};

// Saturating addition: the sum is formed in T and clamped to the range of R.
// vxsat_ is OR-ed with 1 when clamping changed the value.
template <typename T, typename R>
class Sadd {
  public:
    static R apply(T first, T second, uint32_t, uint32_t &vxsat_) {
      // rounding mode is not relevant for this operation
      T unclippedResult = second + first;
      R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits<R>::min(), (T)std::numeric_limits<R>::max());
      vxsat_ |= clippedResult != unclippedResult;
      return clippedResult;
    }
    static std::string name() {return "Sadd";}
};
second + first; + R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits::min(), (T)std::numeric_limits::max()); + vxsat_ |= clippedResult != unclippedResult; + return clippedResult; + } + static std::string name() {return "Sadd";} +}; + +template +class Rsub { + public: + static R apply(T first, T second, R) { + return first - second; + } + static std::string name() {return "Rsub";} +}; + +template +class Div { + public: + static R apply(T first, T second, R) { + // logic taken from scalar div + if (first == 0) { + return -1; + } else if (second == std::numeric_limits::min() && first == T(-1)) { + return second; + } else { + return (R)second / (R)first; + } + } + static std::string name() {return "Div";} +}; + +template +class Rem { + public: + static R apply(T first, T second, R) { + // logic taken from scalar rem + if (first == 0) { + return second; + } else if (second == std::numeric_limits::min() && first == T(-1)) { + return 0; + } else { + return (R)second % (R)first; + } + } + static std::string name() {return "Rem";} +}; + +template +class Mul { + public: + static R apply(T first, T second, R) { + return (R)first * (R)second; + } + static std::string name() {return "Mul";} +}; + +template +class Mulsu { + public: + static R apply(T first, T second, R) { + R first_ext = zext((R)first, (sizeof(T) * 8)); + return first_ext * (R)second; + } + static std::string name() {return "Mulsu";} +}; + +template +class Mulh { + public: + static R apply(T first, T second, R) { + __int128_t first_ext = sext((__int128_t)first, (sizeof(T) * 8)); + __int128_t second_ext = sext((__int128_t)second, (sizeof(T) * 8)); + return (first_ext * second_ext) >> (sizeof(T) * 8); + } + static std::string name() {return "Mulh";} +}; + +template +class Mulhsu { + public: + static R apply(T first, T second, R) { + __int128_t first_ext = zext((__int128_t)first, (sizeof(T) * 8)); + __int128_t second_ext = sext((__int128_t)second, (sizeof(T) * 8)); + return (first_ext * 
// Upper half of the unsigned product, formed in 128 bits and shifted down by
// the width of T.
template <typename T, typename R>
class Mulhu {
  public:
    static R apply(T first, T second, R) {
      return ((__uint128_t)first * (__uint128_t)second) >> (sizeof(T) * 8);
    }
    static std::string name() {return "Mulhu";}
};

// (first * third) + second.
template <typename T, typename R>
class Madd {
  public:
    static R apply(T first, T second, R third) {
      return ((R)first * third) + (R)second;
    }
    static std::string name() {return "Madd";}
};

// -(first * second) + third.
template <typename T, typename R>
class Nmsac {
  public:
    static R apply(T first, T second, R third) {
      return -((R)first * (R)second) + third;
    }
    static std::string name() {return "Nmsac";}
};

// (first * second) + third.
template <typename T, typename R>
class Macc {
  public:
    static R apply(T first, T second, R third) {
      return ((R)first * (R)second) + third;
    }
    static std::string name() {return "Macc";}
};

// Multiply-accumulate with `first` passed through sext() and `second` through
// zext(), both over the width of T.
template <typename T, typename R>
class Maccsu {
  public:
    static R apply(T first, T second, R third) {
      R first_ext = sext((R)first, (sizeof(T) * 8));
      R second_ext = zext((R)second, (sizeof(T) * 8));
      return (first_ext * second_ext) + third;
    }
    static std::string name() {return "Maccsu";}
};

// Multiply-accumulate with `first` passed through zext() and `second` through
// sext(), both over the width of T.
template <typename T, typename R>
class Maccus {
  public:
    static R apply(T first, T second, R third) {
      R first_ext = zext((R)first, (sizeof(T) * 8));
      R second_ext = sext((R)second, (sizeof(T) * 8));
      return (first_ext * second_ext) + third;
    }
    static std::string name() {return "Maccus";}
};

// -(first * third) + second.
template <typename T, typename R>
class Nmsub {
  public:
    static R apply(T first, T second, R third) {
      return -((R)first * third) + (R)second;
    }
    static std::string name() {return "Nmsub";}
};

// Smaller of the two operands.
template <typename T, typename R>
class Min {
  public:
    static R apply(T first, T second, R) {
      return std::min(first, second);
    }
    static std::string name() {return "Min";}
};

// Larger of the two operands.
template <typename T, typename R>
class Max {
  public:
    static R apply(T first, T second, R) {
      return std::max(first, second);
    }
    static std::string name() {return "Max";}
};

// Bitwise AND.
template <typename T, typename R>
class And {
  public:
    static R apply(T first, T second, R) {
      return first & second;
    }
    static std::string name() {return "And";}
};
// Bitwise OR.
template <typename T, typename R>
class Or {
  public:
    static R apply(T first, T second, R) {
      return first | second;
    }
    static std::string name() {return "Or";}
};

// Bitwise XOR.
template <typename T, typename R>
class Xor {
  public:
    static R apply(T first, T second, R) {
      return first ^ second;
    }
    static std::string name() {return "Xor";}
};

// Left shift of `second` by the amount held in `first`.
template <typename T, typename R>
class Sll {
  public:
    static R apply(T first, T second, R) {
      // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount.
      return second << (first & (sizeof(T) * 8 - 1));
    }
    static std::string name() {return "Sll";}
};

// Returns bit (pos - negOffset) of value, or false when pos < negOffset
// (i.e. the requested bit lies below bit 0).
template <typename T, typename R>
bool bitAt(T value, R pos, R negOffset) {
  R offsetPos = pos - negOffset;
  return pos >= negOffset && ((value >> offsetPos) & 0x1);
}

// Returns true when any of the bits [to - negOffset : 0] of value is set, or
// false when to < negOffset.
template <typename T, typename R>
bool anyBitUpTo(T value, R to, R negOffset) {
  R offsetTo = to - negOffset;
  return to >= negOffset && (value & (((R)1 << (offsetTo + 1)) - 1));
}

// Computes the increment (0 or 1) to add after shifting `value` right by
// `shiftDown` bits, for the rounding mode selected by vxrm. Aborts on an
// unknown vxrm value.
template <typename T, typename R>
bool roundBit(T value, R shiftDown, uint32_t vxrm) {
  switch (vxrm){
    case 0: // round-to-nearest-up
      return bitAt(value, shiftDown, (R)1);
    case 1: // round-to-nearest-even
      return bitAt(value, shiftDown, (R)1) && (anyBitUpTo(value, shiftDown, (R)2) || bitAt(value, shiftDown, (R)0));
    case 2: // round-down (truncate)
      return 0;
    case 3: // round-to-odd
      return !bitAt(value, shiftDown, (R)0) && anyBitUpTo(value, shiftDown, (R)1);
    default:
      std::cout << "Roundoff - invalid value for vxrm: " << vxrm << std::endl;
      std::abort();
  }
}

// Right shift of `second` by the amount held in `first`. Whether the shift is
// logical or arithmetic follows from the signedness of T. The four-argument
// overload additionally applies the vxrm rounding increment.
template <typename T, typename R>
class SrlSra {
  public:
    static R apply(T first, T second, R) {
      // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount.
      return second >> (first & (sizeof(T) * 8 - 1));
    }
    static R apply(T first, T second, uint32_t vxrm, uint32_t) {
      // Saturation is not relevant for this operation
      // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount.
      T firstValid = first & (sizeof(T) * 8 - 1);
      return apply(firstValid, second, 0) + roundBit(second, firstValid, vxrm);
    }
    static std::string name() {return "SrlSra";}
};
// Averaging add: (second + first) >> 1, plus the vxrm rounding increment.
template <typename T, typename R>
class Aadd {
  public:
    static R apply(T first, T second, uint32_t vxrm, uint32_t) {
      // Saturation is not relevant for this operation
      T sum = second + first;
      return (sum >> 1) + roundBit(sum, 1, vxrm);
    }
    static std::string name() {return "Aadd";}
};

// Averaging subtract: (second - first) >> 1, plus the vxrm rounding increment.
template <typename T, typename R>
class Asub {
  public:
    static R apply(T first, T second, uint32_t vxrm, uint32_t) {
      // Saturation is not relevant for this operation
      T difference = second - first;
      return (difference >> 1) + roundBit(difference, 1, vxrm);
    }
    static std::string name() {return "Asub";}
};

// Comparison functors. Note the operand order: each one evaluates the relation
// "second <op> first", e.g. Lt is true when second < first.

// first == second.
template <typename T, typename R>
class Eq {
  public:
    static R apply(T first, T second, R) {
      return first == second;
    }
    static std::string name() {return "Eq";}
};

// first != second.
template <typename T, typename R>
class Ne {
  public:
    static R apply(T first, T second, R) {
      return first != second;
    }
    static std::string name() {return "Ne";}
};

// second < first.
template <typename T, typename R>
class Lt {
  public:
    static R apply(T first, T second, R) {
      return first > second;
    }
    static std::string name() {return "Lt";}
};

// second <= first.
template <typename T, typename R>
class Le {
  public:
    static R apply(T first, T second, R) {
      return first >= second;
    }
    static std::string name() {return "Le";}
};

// second > first.
template <typename T, typename R>
class Gt {
  public:
    static R apply(T first, T second, R) {
      return first < second;
    }
    static std::string name() {return "Gt";}
};

// second & ~first.
template <typename T, typename R>
class AndNot {
  public:
    static R apply(T first, T second, R) {
      return second & ~first;
    }
    static std::string name() {return "AndNot";}
};

// second | ~first.
template <typename T, typename R>
class OrNot {
  public:
    static R apply(T first, T second, R) {
      return second | ~first;
    }
    static std::string name() {return "OrNot";}
};

// ~(second & first).
template <typename T, typename R>
class Nand {
  public:
    static R apply(T first, T second, R) {
      return ~(second & first);
    }
    static std::string name() {return "Nand";}
};
+class Mv { + public: + static R apply(T first, T, R) { + return first; + } + static std::string name() {return "Mv";} +}; + +template +class Nor { + public: + static R apply(T first, T second, R) { + return ~(second | first); + } + static std::string name() {return "Nor";} +}; + +template +class Xnor { + public: + static R apply(T first, T second, R) { + return ~(second ^ first); + } + static std::string name() {return "Xnor";} +}; + +template +class Fadd { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fadd_s(first, second, frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second); + return rv_fadd_d(first_d, second_d, frm, &fflags); + } else { + std::cout << "Fadd only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fadd";} +}; + +template +class Fsub { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fsub_s(second, first, frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? 
// Fused multiply-add family on raw floating-point bit patterns. The precision
// is chosen from sizeof(R); when R is 8 bytes wide and T is not, `first` and
// `second` are widened with rv_ftod() first. Exception flags are discarded and
// rounding mode 0 is always used.

// (first * second) + third.
template <typename T, typename R>
class Fmacc {
  public:
    static R apply(T first, T second, R third) {
      // ignoring flags for now
      uint32_t fflags = 0;
      // ignoring rounding mode for now
      uint32_t frm = 0;
      if (sizeof(R) == 4) {
        return rv_fmadd_s(first, second, third, frm, &fflags);
      } else if (sizeof(R) == 8) {
        uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
        uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
        return rv_fmadd_d(first_d, second_d, third, frm, &fflags);
      } else {
        std::cout << "Fmacc only supports f32 and f64" << std::endl;
        std::abort();
      }
    }
    static std::string name() {return "Fmacc";}
};

// Delegates to rv_fnmadd_{s,d}(first, second, third).
template <typename T, typename R>
class Fnmacc {
  public:
    static R apply(T first, T second, R third) {
      // ignoring flags for now
      uint32_t fflags = 0;
      // ignoring rounding mode for now
      uint32_t frm = 0;
      if (sizeof(R) == 4) {
        return rv_fnmadd_s(first, second, third, frm, &fflags);
      } else if (sizeof(R) == 8) {
        uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
        uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
        return rv_fnmadd_d(first_d, second_d, third, frm, &fflags);
      } else {
        std::cout << "Fnmacc only supports f32 and f64" << std::endl;
        std::abort();
      }
    }
    static std::string name() {return "Fnmacc";}
};

// Like Fmacc, but the addend is rv_fsgnjn(third, third), i.e. third with its
// sign bit inverted.
template <typename T, typename R>
class Fmsac {
  public:
    static R apply(T first, T second, R third) {
      // ignoring flags for now
      uint32_t fflags = 0;
      // ignoring rounding mode for now
      uint32_t frm = 0;
      if (sizeof(R) == 4) {
        return rv_fmadd_s(first, second, rv_fsgnjn_s(third, third), frm, &fflags);
      } else if (sizeof(R) == 8) {
        uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
        uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
        return rv_fmadd_d(first_d, second_d, rv_fsgnjn_d(third, third), frm, &fflags);
      } else {
        std::cout << "Fmsac only supports f32 and f64" << std::endl;
        std::abort();
      }
    }
    static std::string name() {return "Fmsac";}
};

// Like Fnmacc, but the addend is rv_fsgnjn(third, third), i.e. third with its
// sign bit inverted.
template <typename T, typename R>
class Fnmsac {
  public:
    static R apply(T first, T second, R third) {
      // ignoring flags for now
      uint32_t fflags = 0;
      // ignoring rounding mode for now
      uint32_t frm = 0;
      if (sizeof(R) == 4) {
        return rv_fnmadd_s(first, second, rv_fsgnjn_s(third, third), frm, &fflags);
      } else if (sizeof(R) == 8) {
        uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
        uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
        return rv_fnmadd_d(first_d, second_d, rv_fsgnjn_d(third, third), frm, &fflags);
      } else {
        std::cout << "Fnmsac only supports f32 and f64" << std::endl;
        std::abort();
      }
    }
    static std::string name() {return "Fnmsac";}
};

// The four wrappers below reuse the accumulate forms with `second` and `third`
// swapped, so the product is first * third and `second` becomes the addend.

// Fmacc with second/third swapped.
template <typename T, typename R>
class Fmadd {
  public:
    static R apply(T first, T second, R third) {
      if (sizeof(T) == 4 || sizeof(T) == 8) {
        return Fmacc<T, R>::apply(first, third, second);
      } else {
        std::cout << "Fmadd only supports f32 and f64" << std::endl;
        std::abort();
      }
    }
    static std::string name() {return "Fmadd";}
};

// Fnmacc with second/third swapped.
template <typename T, typename R>
class Fnmadd {
  public:
    static R apply(T first, T second, R third) {
      if (sizeof(T) == 4 || sizeof(T) == 8) {
        return Fnmacc<T, R>::apply(first, third, second);
      } else {
        std::cout << "Fnmadd only supports f32 and f64" << std::endl;
        std::abort();
      }
    }
    static std::string name() {return "Fnmadd";}
};

// Fmsac with second/third swapped.
template <typename T, typename R>
class Fmsub {
  public:
    static R apply(T first, T second, R third) {
      if (sizeof(T) == 4 || sizeof(T) == 8) {
        return Fmsac<T, R>::apply(first, third, second);
      } else {
        std::cout << "Fmsub only supports f32 and f64" << std::endl;
        std::abort();
      }
    }
    static std::string name() {return "Fmsub";}
};

// Fnmsac with second/third swapped.
template <typename T, typename R>
class Fnmsub {
  public:
    static R apply(T first, T second, R third) {
      if (sizeof(T) == 4 || sizeof(T) == 8) {
        return Fnmsac<T, R>::apply(first, third, second);
      } else {
        std::cout << "Fnmsub only supports f32 and f64" << std::endl;
        std::abort();
      }
    }
    static std::string name() {return "Fnmsub";}
};
// Floating-point minimum on raw bit patterns; precision chosen from sizeof(T).
template <typename T, typename R>
class Fmin {
  public:
    static R apply(T first, T second, R) {
      // ignoring flags for now
      uint32_t fflags = 0;
      if (sizeof(T) == 4) {
        return rv_fmin_s(first, second, &fflags);
      } else if (sizeof(T) == 8) {
        return rv_fmin_d(first, second, &fflags);
      } else {
        std::cout << "Fmin only supports f32 and f64" << std::endl;
        std::abort();
      }
    }
    static std::string name() {return "Fmin";}
};

// Floating-point maximum on raw bit patterns; precision chosen from sizeof(T).
template <typename T, typename R>
class Fmax {
  public:
    static R apply(T first, T second, R) {
      // ignoring flags for now
      uint32_t fflags = 0;
      if (sizeof(T) == 4) {
        return rv_fmax_s(first, second, &fflags);
      } else if (sizeof(T) == 8) {
        return rv_fmax_d(first, second, &fflags);
      } else {
        std::cout << "Fmax only supports f32 and f64" << std::endl;
        std::abort();
      }
    }
    static std::string name() {return "Fmax";}
};

// Sign injection: forwards to rv_fsgnj_{s,d}(second, first).
template <typename T, typename R>
class Fsgnj {
  public:
    static R apply(T first, T second, R) {
      if (sizeof(T) == 4) {
        return rv_fsgnj_s(second, first);
      } else if (sizeof(T) == 8) {
        return rv_fsgnj_d(second, first);
      } else {
        std::cout << "Fsgnj only supports f32 and f64" << std::endl;
        std::abort();
      }
    }
    static std::string name() {return "Fsgnj";}
};

// Negated sign injection: forwards to rv_fsgnjn_{s,d}(second, first).
template <typename T, typename R>
class Fsgnjn {
  public:
    static R apply(T first, T second, R) {
      if (sizeof(T) == 4) {
        return rv_fsgnjn_s(second, first);
      } else if (sizeof(T) == 8) {
        return rv_fsgnjn_d(second, first);
      } else {
        std::cout << "Fsgnjn only supports f32 and f64" << std::endl;
        std::abort();
      }
    }
    static std::string name() {return "Fsgnjn";}
};

// XOR sign injection: forwards to rv_fsgnjx_{s,d}(second, first).
template <typename T, typename R>
class Fsgnjx {
  public:
    static R apply(T first, T second, R) {
      if (sizeof(T) == 4) {
        return rv_fsgnjx_s(second, first);
      } else if (sizeof(T) == 8) {
        return rv_fsgnjx_d(second, first);
      } else {
        std::cout << "Fsgnjx only supports f32 and f64" << std::endl;
        std::abort();
      }
    }
    static std::string name() {return "Fsgnjx";}
};
// Floating-point / integer conversions. `first` carries the 5-bit selector of
// the conversion (see the per-case mnemonics) and `second` the value to
// convert. The three-argument overload handles the same-width and widening
// forms, the four-argument overload the narrowing forms. Aborts on unsupported
// selectors or element widths.
template <typename T, typename R>
class Fcvt {
  public:
    static R apply(T first, T second, R) {
      // ignoring flags for now
      uint32_t fflags = 0;
      // ignoring rounding mode for now
      uint32_t frm = 0;
      if (sizeof(T) == 4) {
        switch (first) {
          case 0b00000: // vfcvt.xu.f.v
            return rv_ftou_s(second, frm, &fflags);
          case 0b00001: // vfcvt.x.f.v
            return rv_ftoi_s(second, frm, &fflags);
          case 0b00010: // vfcvt.f.xu.v
            return rv_utof_s(second, frm, &fflags);
          case 0b00011: // vfcvt.f.x.v
            return rv_itof_s(second, frm, &fflags);
          case 0b00110: // vfcvt.rtz.xu.f.v
            return rv_ftou_s(second, 1, &fflags);
          case 0b00111: // vfcvt.rtz.x.f.v
            return rv_ftoi_s(second, 1, &fflags);
          case 0b01000: // vfwcvt.xu.f.v
            return rv_ftolu_s(second, frm, &fflags);
          case 0b01001: // vfwcvt.x.f.v
            return rv_ftol_s(second, frm, &fflags);
          case 0b01010: // vfwcvt.f.xu.v
            return rv_utof_d(second, frm, &fflags);
          case 0b01011: // vfwcvt.f.x.v
            return rv_itof_d(second, frm, &fflags);
          case 0b01100: // vfwcvt.f.f.v
            return rv_ftod(second);
          case 0b01110: // vfwcvt.rtz.xu.f.v
            return rv_ftolu_s(second, 1, &fflags);
          case 0b01111: // vfwcvt.rtz.x.f.v
            return rv_ftol_s(second, 1, &fflags);
          default:
            std::cout << "Fcvt has unsupported value for first: " << first << std::endl;
            std::abort();
        }
      } else if (sizeof(T) == 8) {
        switch (first) {
          case 0b00000: // vfcvt.xu.f.v
            return rv_ftolu_d(second, frm, &fflags);
          case 0b00001: // vfcvt.x.f.v
            return rv_ftol_d(second, frm, &fflags);
          case 0b00010: // vfcvt.f.xu.v
            return rv_lutof_d(second, frm, &fflags);
          case 0b00011: // vfcvt.f.x.v
            return rv_ltof_d(second, frm, &fflags);
          case 0b00110: // vfcvt.rtz.xu.f.v
            return rv_ftolu_d(second, 1, &fflags);
          case 0b00111: // vfcvt.rtz.x.f.v
            return rv_ftol_d(second, 1, &fflags);
          // the widening forms are rejected for 64-bit sources
          case 0b01000: // vfwcvt.xu.f.v
          case 0b01001: // vfwcvt.x.f.v
          case 0b01010: // vfwcvt.f.xu.v
          case 0b01011: // vfwcvt.f.x.v
          case 0b01100: // vfwcvt.f.f.v
          case 0b01110: // vfwcvt.rtz.xu.f.v
          case 0b01111: // vfwcvt.rtz.x.f.v
            std::cout << "Fwcvt only supports f32" << std::endl;
            std::abort();
          default:
            std::cout << "Fcvt has unsupported value for first: " << first << std::endl;
            std::abort();
        }
      } else {
        std::cout << "Fcvt only supports f32 and f64" << std::endl;
        std::abort();
      }
    }
    // Narrowing conversions from a 64-bit source.
    // NOTE(review): the parameter is named vxrm but is handed to the rv_*
    // helpers in the rounding-mode position -- confirm what callers pass here.
    static R apply(T first, T second, uint32_t vxrm, uint32_t &) { // saturation argument is unused
      // ignoring flags for now
      uint32_t fflags = 0;
      if (sizeof(T) == 8) {
        switch (first) {
          case 0b10000: // vfncvt.xu.f.w
            return rv_ftou_d(second, vxrm, &fflags);
          case 0b10001: // vfncvt.x.f.w
            return rv_ftoi_d(second, vxrm, &fflags);
          case 0b10010: // vfncvt.f.xu.w
            return rv_lutof_s(second, vxrm, &fflags);
          case 0b10011: // vfncvt.f.x.w
            return rv_ltof_s(second, vxrm, &fflags);
          case 0b10100: // vfncvt.f.f.w
            return rv_dtof_r(second, vxrm);
          case 0b10101: // vfncvt.rod.f.f.w
            return rv_dtof_r(second, 6);
          case 0b10110: // vfncvt.rtz.xu.f.w
            return rv_ftou_d(second, 1, &fflags);
          case 0b10111: // vfncvt.rtz.x.f.w
            return rv_ftoi_d(second, 1, &fflags);
          default:
            std::cout << "Fncvt has unsupported value for first: " << first << std::endl;
            std::abort();
        }
      } else {
        std::cout << "Fncvt only supports f64" << std::endl;
        std::abort();
      }
    }
    static std::string name() {return "Fcvt";}
};
// vfwcvt.x.f.v + case 0b01010: // vfwcvt.f.xu.v + case 0b01011: // vfwcvt.f.x.v + case 0b01100: // vfwcvt.f.f.v + case 0b01110: // vfwcvt.rtz.xu.f.v + case 0b01111: // vfwcvt.rtz.x.f.v + std::cout << "Fwcvt only supports f32" << std::endl; + std::abort(); + default: + std::cout << "Fcvt has unsupported value for first: " << first << std::endl; + std::abort(); + } + } else { + std::cout << "Fcvt only supports f32 and f64" << std::endl; + std::abort(); + } + } + static R apply(T first, T second, uint32_t vxrm, uint32_t &) { // saturation argument is unused + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 8) { + switch (first) { + case 0b10000: // vfncvt.xu.f.w + return rv_ftou_d(second, vxrm, &fflags); + case 0b10001: // vfncvt.x.f.w + return rv_ftoi_d(second, vxrm, &fflags); + case 0b10010: // vfncvt.f.xu.w + return rv_lutof_s(second, vxrm, &fflags); + case 0b10011: // vfncvt.f.x.w + return rv_ltof_s(second, vxrm, &fflags); + case 0b10100: // vfncvt.f.f.w + return rv_dtof_r(second, vxrm); + case 0b10101: // vfncvt.rod.f.f.w + return rv_dtof_r(second, 6); + case 0b10110: // vfncvt.rtz.xu.f.w + return rv_ftou_d(second, 1, &fflags); + case 0b10111: // vfncvt.rtz.x.f.w + return rv_ftoi_d(second, 1, &fflags); + default: + std::cout << "Fncvt has unsupported value for first: " << first << std::endl; + std::abort(); + } + } else { + std::cout << "Fncvt only supports f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fcvt";} +}; + +template +class Funary1 { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(T) == 4) { + switch (first) { + case 0b00000: // vfsqrt.v + return rv_fsqrt_s(second, frm, &fflags); + case 0b00100: // vfrsqrt7.v + return rv_frsqrt7_s(second, frm, &fflags); + case 0b00101: // vfrec7.v + return rv_frecip7_s(second, frm, &fflags); + case 0b10000: // vfclass.v + return 
rv_fclss_s(second); + default: + std::cout << "Funary1 has unsupported value for first: " << first << std::endl; + std::abort(); + } + } else if (sizeof(T) == 8) { + switch (first) { + case 0b00000: // vfsqrt.v + return rv_fsqrt_d(second, frm, &fflags); + case 0b00100: // vfrsqrt7.v + return rv_frsqrt7_d(second, frm, &fflags); + case 0b00101: // vfrec7.v + return rv_frecip7_d(second, frm, &fflags); + case 0b10000: // vfclass.v + return rv_fclss_d(second); + default: + std::cout << "Funary1 has unsupported value for first: " << first << std::endl; + std::abort(); + } + } else { + std::cout << "Funary1 only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Funary1";} +}; + +template +class Xunary0 { + public: + static R apply(T, T second, T) { + return second; + } + static std::string name() {return "Xunary0";} +}; + +template +class Feq { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_feq_s(second, first, &fflags); + } else if (sizeof(T) == 8) { + return rv_feq_d(second, first, &fflags); + } else { + std::cout << "Feq only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Feq";} +}; + +template +class Fle { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_fle_s(second, first, &fflags); + } else if (sizeof(T) == 8) { + return rv_fle_d(second, first, &fflags); + } else { + std::cout << "Fle only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fle";} +}; + +template +class Flt { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_flt_s(second, first, &fflags); + } else if (sizeof(T) == 8) { + return rv_flt_d(second, first, &fflags); + } else { + std::cout << "Flt only 
supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Flt";} +}; + +template +class Fne { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return !rv_feq_s(second, first, &fflags); + } else if (sizeof(T) == 8) { + return !rv_feq_d(second, first, &fflags); + } else { + std::cout << "Fne only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fne";} +}; + +template +class Fgt { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_flt_s(first, second, &fflags); + } else if (sizeof(T) == 8) { + return rv_flt_d(first, second, &fflags); + } else { + std::cout << "Fgt only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fgt";} +}; + +template +class Fge { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_fle_s(first, second, &fflags); + } else if (sizeof(T) == 8) { + return rv_fle_d(first, second, &fflags); + } else { + std::cout << "Fge only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fge";} +}; + +template +class Fdiv { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(T) == 4) { + return rv_fdiv_s(second, first, frm, &fflags); + } else if (sizeof(T) == 8) { + return rv_fdiv_d(second, first, frm, &fflags); + } else { + std::cout << "Fdiv only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fdiv";} +}; + +template +class Frdiv { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + 
uint32_t frm = 0; + if (sizeof(T) == 4) { + return rv_fdiv_s(first, second, frm, &fflags); + } else if (sizeof(T) == 8) { + return rv_fdiv_d(first, second, frm, &fflags); + } else { + std::cout << "Frdiv only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Frdiv";} +}; + +template +class Fmul { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fmul_s(first, second, frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second); + return rv_fmul_d(first_d, second_d, frm, &fflags); + } else { + std::cout << "Fmul only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Fmul";} +}; + +template +class Frsub { + public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(T) == 4) { + return rv_fsub_s(first, second, frm, &fflags); + } else if (sizeof(T) == 8) { + return rv_fsub_d(first, second, frm, &fflags); + } else { + std::cout << "Frsub only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() {return "Frsub";} +}; + +template +class Clip { + public: + static R apply(T first, T second, uint32_t vxrm, uint32_t &vxsat_) { + // The low lg2(2*SEW) bits of the vector or scalar shift-amount value (e.g., the low 6 bits for a SEW=64-bit to + // SEW=32-bit narrowing operation) are used to control the right shift amount, which provides the scaling. 
+ R firstValid = first & (sizeof(T) * 8 - 1); + T unclippedResult = (second >> firstValid) + roundBit(second, firstValid, vxrm); + R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits::min(), (T)std::numeric_limits::max()); + vxsat_ |= clippedResult != unclippedResult; + return clippedResult; + } + static std::string name() {return "Clip";} +}; + +template +class Smul { + public: + static R apply(T first, T second, uint32_t vxrm, uint32_t &vxsat_) { + R shift = sizeof(R) * 8 - 1; + T unshiftedResult = first * second; + T unclippedResult = (unshiftedResult >> shift) + roundBit(unshiftedResult, shift, vxrm); + R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits::min(), (T)std::numeric_limits::max()); + vxsat_ |= clippedResult != unclippedResult; + return clippedResult; + } + static std::string name() {return "Smul";} +}; + +bool isMasked(std::vector> &vreg_file, uint32_t maskVreg, uint32_t byteI, bool vmask) { + auto& mask = vreg_file.at(maskVreg); + uint8_t emask = *(uint8_t *)(mask.data() + byteI / 8); + uint8_t value = (emask >> (byteI % 8)) & 0x1; + DP(1, "Masking enabled: " << +!vmask << " mask element: " << +value); + return !vmask && value == 0; +} + +template +uint32_t getVreg(uint32_t baseVreg, uint32_t byteI) { + uint32_t vsew = sizeof(DT) * 8; + return (baseVreg + (byteI / (VLEN / vsew))) % 32; +} + +template +DT &getVregData(std::vector &baseVregVec, uint32_t byteI) { + uint32_t vsew = sizeof(DT) * 8; + return *(DT *)(baseVregVec.data() + (byteI % (VLEN / vsew)) * vsew / 8); +} + +template +DT &getVregData(std::vector> &vreg_file, uint32_t baseVreg, uint32_t byteI) { + auto& vr1 = vreg_file.at(getVreg
(baseVreg, byteI)); + return getVregData
(vr1, byteI); +} + +template +void vector_op_vix_load(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rdest, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + uint32_t vsew = sizeof(DT) * 8; + uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11); + if (nfields * emul > 8) { + std::cout << "NFIELDS * EMUL = " << nfields * lmul << " but it should be <= 8" << std::endl; + std::abort(); + } + for (uint32_t i = 0; i < vl * nfields; i++) { + if (isMasked(vreg_file, 0, i / nfields, vmask)) continue; + + uint32_t nfields_strided = strided ? nfields : 1; + Word mem_addr = ((rsdata[0][0].i) & 0xFFFFFFFC) + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT); + Word mem_data = 0; + emul_->dcache_read(&mem_data, mem_addr, vsew / 8); + DP(1, "Loading data " << mem_data << " from: " << mem_addr << " to vec reg: " << getVreg
(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields); + DT &result = getVregData
(vreg_file, rdest + (i % nfields) * emul, i / nfields); + DP(1, "Previous data: " << +result); + result = (DT) mem_data; + } +} + +void vector_op_vix_load(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rdest, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + switch (vsew) { + case 8: + vector_op_vix_load(vreg_file, emul_, rsdata, rdest, vl, strided, stride, nfields, lmul, vmask); + break; + case 16: + vector_op_vix_load(vreg_file, emul_, rsdata, rdest, vl, strided, stride, nfields, lmul, vmask); + break; + case 32: + vector_op_vix_load(vreg_file, emul_, rsdata, rdest, vl, strided, stride, nfields, lmul, vmask); + break; + case 64: + vector_op_vix_load(vreg_file, emul_, rsdata, rdest, vl, strided, stride, nfields, lmul, vmask); + break; + default: + std::cout << "Failed to execute VLE for vsew: " << vsew << std::endl; + std::abort(); + } +} + +template +void vector_op_vv_load(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rsrc1, uint32_t rdest, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + uint32_t vsew = sizeof(DT) * 8; + uint32_t emul = lmul >> 2 ? 
1 : 1 << (lmul & 0b11); + if (nfields * emul > 8) { + std::cout << "NFIELDS * EMUL = " << nfields * lmul << " but it should be <= 8" << std::endl; + std::abort(); + } + for (uint32_t i = 0; i < vl * nfields; i++) { + if (isMasked(vreg_file, 0, i / nfields, vmask)) continue; + + Word offset = 0; + switch (iSew) { + case 8: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 16: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 32: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 64: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + default: + std::cout << "Unsupported iSew: " << iSew << std::endl; + std::abort(); + } + + Word mem_addr = ((rsdata[0][0].i) & 0xFFFFFFFC) + offset + (i % nfields) * sizeof(DT); + Word mem_data = 0; + emul_->dcache_read(&mem_data, mem_addr, vsew / 8); + DP(1, "VLUX/VLOX - Loading data " << mem_data << " from: " << mem_addr << " with offset: " << std::dec << offset << " to vec reg: " << getVreg
(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields); + DT &result = getVregData
(vreg_file, rdest + (i % nfields) * emul, i / nfields); + DP(1, "Previous data: " << +result); + result = (DT) mem_data; + } +} + +void vector_op_vv_load(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + switch (vsew) { + case 8: + vector_op_vv_load(vreg_file, emul_, rsdata, rsrc1, rdest, iSew, vl, nfields, lmul, vmask); + break; + case 16: + vector_op_vv_load(vreg_file, emul_, rsdata, rsrc1, rdest, iSew, vl, nfields, lmul, vmask); + break; + case 32: + vector_op_vv_load(vreg_file, emul_, rsdata, rsrc1, rdest, iSew, vl, nfields, lmul, vmask); + break; + case 64: + vector_op_vv_load(vreg_file, emul_, rsdata, rsrc1, rdest, iSew, vl, nfields, lmul, vmask); + break; + default: + std::cout << "Failed to execute VLUX/VLOX for vsew: " << vsew << std::endl; + std::abort(); + } +} + +void Emulator::loadVector(const Instr &instr, uint32_t wid, std::vector &rsdata) { + auto &warp = warps_.at(wid); + auto vmask = instr.getVmask(); + auto rdest = instr.getRDest(); + auto mop = instr.getVmop(); + switch (mop) { + case 0b00: { // unit-stride + auto lumop = instr.getVumop(); + switch (lumop) { + case 0b10000: // vle8ff.v, vle16ff.v, vle32ff.v, vle64ff.v - we do not support exceptions -> treat like regular unit stride + // vlseg2e8ff.v, vlseg2e16ff.v, vlseg2e32ff.v, vlseg2e64ff.v + // vlseg3e8ff.v, vlseg3e16ff.v, vlseg3e32ff.v, vlseg3e64ff.v + // vlseg4e8ff.v, vlseg4e16ff.v, vlseg4e32ff.v, vlseg4e64ff.v + // vlseg5e8ff.v, vlseg5e16ff.v, vlseg5e32ff.v, vlseg5e64ff.v + // vlseg6e8ff.v, vlseg6e16ff.v, vlseg6e32ff.v, vlseg6e64ff.v + // vlseg7e8ff.v, vlseg7e16ff.v, vlseg7e32ff.v, vlseg7e64ff.v + // vlseg8e8ff.v, vlseg8e16ff.v, vlseg8e32ff.v, vlseg8e64ff.v + case 0b0000: { // vle8.v, vle16.v, vle32.v, vle64.v + // vlseg2e8.v, vlseg2e16.v, vlseg2e32.v, vlseg2e64.v + // vlseg3e8.v, vlseg3e16.v, vlseg3e32.v, vlseg3e64.v + // vlseg4e8.v, 
vlseg4e16.v, vlseg4e32.v, vlseg4e64.v + // vlseg5e8.v, vlseg5e16.v, vlseg5e32.v, vlseg5e64.v + // vlseg6e8.v, vlseg6e16.v, vlseg6e32.v, vlseg6e64.v + // vlseg7e8.v, vlseg7e16.v, vlseg7e32.v, vlseg7e64.v + // vlseg8e8.v, vlseg8e16.v, vlseg8e32.v, vlseg8e64.v + WordI stride = warp.vtype.vsew / 8; + uint32_t nfields = instr.getVnf() + 1; + vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, warp.vtype.vsew, warp.vl, false, stride, nfields, warp.vtype.vlmul, vmask); + break; + } + case 0b1000: { // vl1r.v, vl2r.v, vl4r.v, vl8r.v + uint32_t nreg = instr.getVnf() + 1; + if (nreg != 1 && nreg != 2 && nreg != 4 && nreg != 8) { + std::cout << "Whole vector register load - reserved value for nreg: " << nreg << std::endl; + std::abort(); + } + DP(1, "Whole vector register load with nreg: " << nreg); + uint32_t vl = nreg * VLEN / instr.getVsew(); + WordI stride = instr.getVsew() / 8; + vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, instr.getVsew(), vl, false, stride, 1, 0, vmask); + break; + } + case 0b1011: { // vlm.v + if (warp.vtype.vsew != 8) { + std::cout << "vlm.v only supports EEW=8, but EEW was: " << warp.vtype.vsew << std::endl; + std::abort(); + } + WordI stride = warp.vtype.vsew / 8; + vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, warp.vtype.vsew, (warp.vl + 7) / 8, false, stride, 1, 0, true); + break; + } + default: + std::cout << "Load vector - unsupported lumop: " << lumop << std::endl; + std::abort(); + } + break; + } + case 0b10: { // strided: vlse8.v, vlse16.v, vlse32.v, vlse64.v + // vlsseg2e8.v, vlsseg2e16.v, vlsseg2e32.v, vlsseg2e64.v + // vlsseg3e8.v, vlsseg3e16.v, vlsseg3e32.v, vlsseg3e64.v + // vlsseg4e8.v, vlsseg4e16.v, vlsseg4e32.v, vlsseg4e64.v + // vlsseg5e8.v, vlsseg5e16.v, vlsseg5e32.v, vlsseg5e64.v + // vlsseg6e8.v, vlsseg6e16.v, vlsseg6e32.v, vlsseg6e64.v + // vlsseg7e8.v, vlsseg7e16.v, vlsseg7e32.v, vlsseg7e64.v + // vlsseg8e8.v, vlsseg8e16.v, vlsseg8e32.v, vlsseg8e64.v + auto rsrc1 = instr.getRSrc(1); + auto rdest = 
instr.getRDest(); + WordI stride = warp.ireg_file.at(0).at(rsrc1); + uint32_t nfields = instr.getVnf() + 1; + vector_op_vix_load(warp.vreg_file, this, rsdata, rdest, warp.vtype.vsew, warp.vl, true, stride, nfields, warp.vtype.vlmul, vmask); + break; + } + case 0b01: // indexed - unordered, vluxei8.v, vluxei16.v, vluxei32.v, vluxei64.v + // vluxseg2e8.v, vluxseg2e16.v, vluxseg2e32.v, vluxseg2e64.v + // vluxseg3e8.v, vluxseg3e16.v, vluxseg3e32.v, vluxseg3e64.v + // vluxseg4e8.v, vluxseg4e16.v, vluxseg4e32.v, vluxseg4e64.v + // vluxseg5e8.v, vluxseg5e16.v, vluxseg5e32.v, vluxseg5e64.v + // vluxseg6e8.v, vluxseg6e16.v, vluxseg6e32.v, vluxseg6e64.v + // vluxseg7e8.v, vluxseg7e16.v, vluxseg7e32.v, vluxseg7e64.v + // vluxseg8e8.v, vluxseg8e16.v, vluxseg8e32.v, vluxseg8e64.v + case 0b11: { // indexed - ordered, vloxei8.v, vloxei16.v, vloxei32.v, vloxei64.v + // vloxseg2e8.v, vloxseg2e16.v, vloxseg2e32.v, vloxseg2e64.v + // vloxseg3e8.v, vloxseg3e16.v, vloxseg3e32.v, vloxseg3e64.v + // vloxseg4e8.v, vloxseg4e16.v, vloxseg4e32.v, vloxseg4e64.v + // vloxseg5e8.v, vloxseg5e16.v, vloxseg5e32.v, vloxseg5e64.v + // vloxseg6e8.v, vloxseg6e16.v, vloxseg6e32.v, vloxseg6e64.v + // vloxseg7e8.v, vloxseg7e16.v, vloxseg7e32.v, vloxseg7e64.v + // vloxseg8e8.v, vloxseg8e16.v, vloxseg8e32.v, vloxseg8e64.v + uint32_t nfields = instr.getVnf() + 1; + vector_op_vv_load(warp.vreg_file, this, rsdata, instr.getRSrc(1), rdest, warp.vtype.vsew, instr.getVsew(), warp.vl, nfields, warp.vtype.vlmul, vmask); + break; + } + default: + std::cout << "Load vector - unsupported mop: " << mop << std::endl; + std::abort(); + } +} + +template +void vector_op_vix_store(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rsrc3, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + uint32_t vsew = sizeof(DT) * 8; + uint32_t emul = lmul >> 2 ? 
1 : 1 << (lmul & 0b11); + for (uint32_t i = 0; i < vl * nfields; i++) { + if (isMasked(vreg_file, 0, i / nfields, vmask)) continue; + + uint32_t nfields_strided = strided ? nfields : 1; + Word mem_addr = rsdata[0][0].i + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT); + Word mem_data = getVregData
(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields); + DP(1, "Storing: " << std::hex << mem_data << " at: " << mem_addr << " from vec reg: " << getVreg
(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields); + emul_->dcache_write(&mem_data, mem_addr, vsew / 8); + } +} + +void vector_op_vix_store(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rsrc3, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + switch (vsew) { + case 8: + vector_op_vix_store(vreg_file, emul_, rsdata, rsrc3, vl, strided, stride, nfields, lmul, vmask); + break; + case 16: + vector_op_vix_store(vreg_file, emul_, rsdata, rsrc3, vl, strided, stride, nfields, lmul, vmask); + break; + case 32: + vector_op_vix_store(vreg_file, emul_, rsdata, rsrc3, vl, strided, stride, nfields, lmul, vmask); + break; + case 64: + vector_op_vix_store(vreg_file, emul_, rsdata, rsrc3, vl, strided, stride, nfields, lmul, vmask); + break; + default: + std::cout << "Failed to execute VSE for vsew: " << vsew << std::endl; + std::abort(); + } +} + +template +void vector_op_vv_store(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rsrc1, uint32_t rsrc3, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + uint32_t vsew = sizeof(DT) * 8; + uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11); + for (uint32_t i = 0; i < vl * nfields; i++) { + if (isMasked(vreg_file, 0, i / nfields, vmask)) continue; + + Word offset = 0; + switch (iSew) { + case 8: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 16: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 32: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 64: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + default: + std::cout << "Unsupported iSew: " << iSew << std::endl; + std::abort(); + } + + Word mem_addr = rsdata[0][0].i + offset + (i % nfields) * sizeof(DT); + Word mem_data = getVregData
(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields); + DP(1, "VSUX/VSOX - Storing: " << std::hex << mem_data << " at: " << mem_addr << " with offset: " << std::dec << offset << " from vec reg: " << getVreg
(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields); + emul_->dcache_write(&mem_data, mem_addr, vsew / 8); + } +} + +void vector_op_vv_store(std::vector> &vreg_file, vortex::Emulator *emul_, std::vector &rsdata, uint32_t rsrc1, uint32_t rsrc3, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + switch (vsew) { + case 8: + vector_op_vv_store(vreg_file, emul_, rsdata, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask); + break; + case 16: + vector_op_vv_store(vreg_file, emul_, rsdata, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask); + break; + case 32: + vector_op_vv_store(vreg_file, emul_, rsdata, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask); + break; + case 64: + vector_op_vv_store(vreg_file, emul_, rsdata, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask); + break; + default: + std::cout << "Failed to execute VSUX/VSOX for vsew: " << vsew << std::endl; + std::abort(); + } +} + +void Emulator::storeVector(const Instr &instr, uint32_t wid, std::vector &rsdata) { + auto &warp = warps_.at(wid); + auto vmask = instr.getVmask(); + auto mop = instr.getVmop(); + switch (mop) { + case 0b00: { // unit-stride + auto vs3 = instr.getRSrc(1); + auto sumop = instr.getVumop(); + WordI stride = warp.vtype.vsew / 8; + switch (sumop) { + case 0b0000: { // vse8.v, vse16.v, vse32.v, vse64.v + uint32_t nfields = instr.getVnf() + 1; + vector_op_vix_store(warp.vreg_file, this, rsdata, vs3, warp.vtype.vsew, warp.vl, false, stride, nfields, warp.vtype.vlmul, vmask); + break; + } + case 0b1000: { // vs1r.v, vs2r.v, vs4r.v, vs8r.v + uint32_t nreg = instr.getVnf() + 1; + if (nreg != 1 && nreg != 2 && nreg != 4 && nreg != 8) { + std::cout << "Whole vector register store - reserved value for nreg: " << nreg << std::endl; + std::abort(); + } + DP(1, "Whole vector register store with nreg: " << nreg); + uint32_t vl = nreg * VLEN / 8; + vector_op_vix_store(warp.vreg_file, this, rsdata, vs3, vl, false, stride, 1, 0, vmask); + break; + } + case 
0b1011: { // vsm.v + if (warp.vtype.vsew != 8) { + std::cout << "vsm.v only supports EEW=8, but EEW was: " << warp.vtype.vsew << std::endl; + std::abort(); + } + vector_op_vix_store(warp.vreg_file, this, rsdata, vs3, warp.vtype.vsew, (warp.vl + 7) / 8, false, stride, 1, 0, true); + break; + } + default: + std::cout << "Store vector - unsupported sumop: " << sumop << std::endl; + std::abort(); + } + break; + } + case 0b10: { // strided: vsse8.v, vsse16.v, vsse32.v, vsse64.v + // vssseg2e8.v, vssseg2e16.v, vssseg2e32.v, vssseg2e64.v + // vssseg3e8.v, vssseg3e16.v, vssseg3e32.v, vssseg3e64.v + // vssseg4e8.v, vssseg4e16.v, vssseg4e32.v, vssseg4e64.v + // vssseg5e8.v, vssseg5e16.v, vssseg5e32.v, vssseg5e64.v + // vssseg6e8.v, vssseg6e16.v, vssseg6e32.v, vssseg6e64.v + // vssseg7e8.v, vssseg7e16.v, vssseg7e32.v, vssseg7e64.v + // vssseg8e8.v, vssseg8e16.v, vssseg8e32.v, vssseg8e64.v + auto rsrc1 = instr.getRSrc(1); + auto vs3 = instr.getRSrc(2); + WordI stride = warp.ireg_file.at(0).at(rsrc1); + uint32_t nfields = instr.getVnf() + 1; + vector_op_vix_store(warp.vreg_file, this, rsdata, vs3, warp.vtype.vsew, warp.vl, true, stride, nfields, warp.vtype.vlmul, vmask); + break; + } + case 0b01: // indexed - unordered, vsuxei8.v, vsuxei16.v, vsuxei32.v, vsuxei64.v + // vsuxseg2ei8.v, vsuxseg2ei16.v, vsuxseg2ei32.v, vsuxseg2ei64.v + // vsuxseg3ei8.v, vsuxseg3ei16.v, vsuxseg3ei32.v, vsuxseg3ei64.v + // vsuxseg4ei8.v, vsuxseg4ei16.v, vsuxseg4ei32.v, vsuxseg4ei64.v + // vsuxseg5ei8.v, vsuxseg5ei16.v, vsuxseg5ei32.v, vsuxseg5ei64.v + // vsuxseg6ei8.v, vsuxseg6ei16.v, vsuxseg6ei32.v, vsuxseg6ei64.v + // vsuxseg7ei8.v, vsuxseg7ei16.v, vsuxseg7ei32.v, vsuxseg7ei64.v + // vsuxseg8ei8.v, vsuxseg8ei16.v, vsuxseg8ei32.v, vsuxseg8ei64.v + case 0b11: { // indexed - ordered, vsoxei8.v, vsoxei16.v, vsoxei32.v, vsoxei64.v + // vsoxseg2ei8.v, vsoxseg2ei16.v, vsoxseg2ei32.v, vsoxseg2ei64.v + // vsoxseg3ei8.v, vsoxseg3ei16.v, vsoxseg3ei32.v, vsoxseg3ei64.v + // vsoxseg4ei8.v, vsoxseg4ei16.v, 
vsoxseg4ei32.v, vsoxseg4ei64.v + // vsoxseg5ei8.v, vsoxseg5ei16.v, vsoxseg5ei32.v, vsoxseg5ei64.v + // vsoxseg6ei8.v, vsoxseg6ei16.v, vsoxseg6ei32.v, vsoxseg6ei64.v + // vsoxseg7ei8.v, vsoxseg7ei16.v, vsoxseg7ei32.v, vsoxseg7ei64.v + // vsoxseg8ei8.v, vsoxseg8ei16.v, vsoxseg8ei32.v, vsoxseg8ei64.v + uint32_t nfields = instr.getVnf() + 1; + vector_op_vv_store(warp.vreg_file, this, rsdata, instr.getRSrc(1), instr.getRSrc(2), warp.vtype.vsew, instr.getVsew(), warp.vl, nfields, warp.vtype.vlmul, vmask); + break; + } + default: + std::cout << "Store vector - unsupported mop: " << mop << std::endl; + std::abort(); + } +} + +template