diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py
index be4cf417bc..a7f220daa9 100644
--- a/src/finn/analysis/fpgadataflow/res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/res_estimation.py
@@ -60,8 +60,8 @@ def res_estimation_complete(model):
     res_dict = {}
     for node in model.graph.node:
         if is_fpgadataflow_node(node) is True:
-            op_type = node.op_type
             inst = registry.getCustomOp(node)
+            op_type = inst.base_op_type()
             if op_type == "MatrixVectorActivation" or op_type == "VectorVectorActivation":
                 orig_restype = inst.get_nodeattr("resType")
                 res_dict[node.name] = []
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 54ba7e4ea1..11107ccb64 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -569,7 +569,12 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(GiveReadableTensorNames())
     if cfg.folding_config_file is not None:
-        model = model.transform(ApplyConfig(cfg.folding_config_file))
+        model = model.transform(
+            ApplyConfig(
+                cfg.folding_config_file,
+                node_filter=lambda x: x.op_type == "StreamingFIFO",
+            )
+        )
 
     # extract the final configuration and save it as json
     hw_attrs = [
diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py
index 188f45273c..ebb5ce98da 100644
--- a/src/finn/custom_op/fpgadataflow/hls/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -50,6 +50,8 @@
 from finn.custom_op.fpgadataflow.hls.thresholding_hls import Thresholding_hls
 from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls
 from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls
+from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MatrixVectorActivation_hls
+from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VectorVectorActivation_hls
 
 custom_op = dict()
 
@@ -75,3 +77,5 @@
 custom_op["Thresholding_hls"] = Thresholding_hls
 custom_op["TLastMarker_hls"] = TLastMarker_hls
 custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls
+custom_op["MatrixVectorActivation_hls"] = MatrixVectorActivation_hls
+custom_op["VectorVectorActivation_hls"] = VectorVectorActivation_hls
\ No newline at end of file
diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
new file mode 100644
index 0000000000..2ad9fefc07
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
@@ -0,0 +1,522 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+import textwrap
+import warnings
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import (
+    calculate_matvec_accumulator_range,
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
+
+from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    pack_innermost_dim_as_hex_string,
+    rtlsim_output_to_npy,
+)
+
+# ONNX i/o tensor shape assumptions for MatrixVectorActivation:
+# input 0 is the input tensor, shape (.., i_size) = (..., MW)
+# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH)
+# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres)
+# output 0 is the output tensor, shape (.., o_size) = (..., MH)
+# the ... here can be any shape (representing groups of vectors)
+
+
+class MatrixVectorActivation_hls(MatrixVectorActivation, HLSBackend):
+    """Corresponds to finn-hlslib MatrixVectorActivation_Batch function."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(MatrixVectorActivation.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def get_template_param_values(self):
+        """Returns the template parameter values according to input, output and weight
+        data types."""
+        ret = dict()
+        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+        if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
+            raise Exception("True binary (non-bipolar) inputs not yet supported")
+        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
+        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+        # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
+        # fill in TSrcI and TWeightI
+        # TODO check these with Giulio
+        # TODO handle non-bipolar binary inputs
+        if inp_is_bipolar and wt_is_bipolar:
+            ret["TSrcI"] = "Recast<XnorMul>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and wt_is_bipolar:
+            ret["TSrcI"] = "Slice<%s>" %
inp_hls_str + ret["TWeightI"] = "Recast" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Identity" + + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["const", "decoupled", "external"]: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"'] + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] + + def defines(self, var): + # Only ipgen mode: Make sure that SIMD parameter satisfies minimum requirements. + if var == "ipgen": + SIMD = self.get_nodeattr("SIMD") + MW = self.get_nodeattr("MW") + condition = SIMD >= (MW / 1024) + msg = ( + f"HLS synthesis of MatrixVectorActivation requires: " + f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} " + f"and MW={MW} for node: {self.onnx_node.name}." 
+ ) + assert condition, msg + mem_mode = self.get_nodeattr("mem_mode") + numInputVectors = list(self.get_nodeattr("numInputVectors")) + numReps = np.prod(numInputVectors) + self.code_gen_dict["$DEFINES$"] = [ + """#define MW1 {}\n #define MH1 {}\n + #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n + #define TMEM1 {}\n #define numReps {}""".format( + self.get_nodeattr("MW"), + self.get_nodeattr("MH"), + self.get_nodeattr("SIMD"), + self.get_nodeattr("PE"), + self.calc_wmem(), + self.calc_tmem(), + numReps, + ) + ] + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + elem_bits = wdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + if mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") + map_to_hls_mult_style = { + "auto": "ap_resource_dflt()", + "lut": "ap_resource_lut()", + "dsp": "ap_resource_dsp()", + } + tmpl_args = self.get_template_param_values() + if self.calc_tmem() == 0: + odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() + threshs = "PassThroughActivation<%s>()" % odtype_hls_str + else: + threshs = "threshs" + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Matrix_Vector_Activate_Batch + (in0_{}, out_{}, weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "decoupled" or 
mem_mode == "external": + wdt = self.get_weight_datatype() + if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Matrix_Vector_Activate_Stream_Batch + (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + wdtype_hls_str, + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_weightstream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + + else: + raise Exception( + """Please set mem_mode to "const" or "decoupled", currently no other + parameter value is supported!""" + ) + + def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") + ram_style_thresholds = self.get_nodeattr("ram_style_thresholds") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + if mem_mode == "const": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") + ) + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() + ) + + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or external, + currently no 
other parameter value is supported!""" + ) + + # the threshold tensor is acc_type [PE][TMEM][N_THRES] + # partition for parallel access along PE and N_THRES + # dimensions (dims 1 and 3) + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") + ) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") + ) + # add resource pragma for thresholds if set + if ram_style_thresholds == "distributed": + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM") + ) + elif ram_style_thresholds == "block": + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM") + ) + elif ram_style_thresholds == "auto": + # no pragma needed + pass + else: + raise Exception("Unrecognized ram_style_thresholds value:" + ram_style_thresholds) + + def get_ap_int_max_w(self): + # base class impl (max of inp/out stream widths) + max_of_io = super().get_ap_int_max_w() + # decoupled mode weight stream + weightstream = self.get_weightstream_width() + # single PE weight entry + weight_bits = self.get_weight_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + single_pe_w = simd * weight_bits + return max([weightstream, max_of_io, single_pe_w]) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        # create a npy file for each input of the node (in_ind is input index)
+        in_ind = 0
+        for inputs in node.input:
+            # it is assumed that the first input of the node is the data input
+            # the second input is the weights
+            if in_ind == 0:
+                assert (
+                    str(context[inputs].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                reshaped_input = context[inputs].reshape(expected_inp_shape)
+                export_idt = self.get_input_datatype()
+                # make copy before saving the array
+                reshaped_input = reshaped_input.copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+            elif in_ind > 2:
+                raise Exception("Unexpected input found for MatrixVectorActivation")
+            in_ind += 1
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            # reinterpret binary output as bipolar where needed
+            if self.get_output_datatype() == DataType["BIPOLAR"]:
+                out = context[node.output[0]]
+                out = 2 * out - 1
+                context[node.output[0]] = out
+            assert (
+                context[node.output[0]].shape == self.get_normal_output_shape()
+            ), "cppsim did not produce expected output shape"
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            self.reset_rtlsim(sim)
+            self.toggle_clk(sim)
+            if mem_mode in ["external", "decoupled"]:
+                wnbits = self.get_weightstream_width()
+                export_wdt = self.get_weight_datatype()
+                wei = npy_to_rtlsim_input(
+                    "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
+                )
+                num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+                io_dict = {
+                    "inputs": {"in0": inp, "weights": wei * num_w_reps},
+                    "outputs": {"out": []},
+                }
+                self.rtlsim_multi_io(sim, io_dict)
+                output = io_dict["outputs"]["out"]
+            else:
+                output = self.rtlsim(sim, inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
\ No newline at end of file
diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py
new file mode 100644
index 0000000000..51de49f1c7
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py
@@ -0,0 +1,372 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+import textwrap
+import warnings
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import (
+    calculate_matvec_accumulator_range,
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
+
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    pack_innermost_dim_as_hex_string,
+    rtlsim_output_to_npy,
+)
+from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+
+class VectorVectorActivation_hls(VectorVectorActivation, HLSBackend):
+    """Corresponds to finn-hlslib Vector_Vector_Activate_Batch function."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(VectorVectorActivation.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def get_template_param_values(self):
+        """Returns the template parameter values according to input, output and weight
+        data types."""
+        ret = dict()
+        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+        if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
+            raise Exception("True binary (non-bipolar) inputs not yet supported")
+        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
+        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+        # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
+        # fill in TSrcI and TWeightI
+        # TODO check these with Giulio
+        # TODO handle non-bipolar binary inputs
+        if inp_is_bipolar and wt_is_bipolar:
+            ret["TSrcI"] = "Recast<XnorMul>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and wt_is_bipolar:
+            ret["TSrcI"] = "Slice<%s>" %
inp_hls_str + ret["TWeightI"] = "Recast" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Identity" + + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["const", "decoupled", "external"]: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + if self.calc_tmem() != 0: + self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] + + def defines(self, var): + dim_h, dim_w = self.get_nodeattr("Dim") + numReps = 1 * dim_h * dim_w + k_h, k_w = self.get_nodeattr("Kernel") + innerProdDim = k_h * k_w + mem_mode = self.get_nodeattr("mem_mode") + + self.code_gen_dict["$DEFINES$"] = [ + """#define Channels1 {}\n #define InnerProdDim {}\n + #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format( + self.get_nodeattr("Channels"), + innerProdDim, + self.get_nodeattr("SIMD"), + self.get_nodeattr("PE"), + numReps, + ) + ] + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + elem_bits = wdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + if mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() + 
) + ) + + def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") + map_to_hls_mult_style = { + "auto": "ap_resource_dflt()", + "lut": "ap_resource_lut()", + "dsp": "ap_resource_dsp()", + } + tmpl_args = self.get_template_param_values() + if self.calc_tmem() == 0: + odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() + threshs = "PassThroughActivation<%s>()" % odtype_hls_str + else: + threshs = "threshs" + + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Vector_Vector_Activate_Batch + (in0_{}, out_{}, weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} + (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( + "Vector_Vector_Activate_Stream_Batch", + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + wdtype_hls_str, + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_weightstream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + else: + raise Exception( + """Please set mem_mode to "const" or "decoupled", currently no other + parameter value is supported!""" + ) + + def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma 
HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + if mem_mode == "const": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") + ) + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() + ) + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or external, + currently no other parameter value is supported!""" + ) + + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") + ) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") + ) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names \ No newline at end of file diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index 846894d85c..d8210fd684 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -472,5 +472,5 @@ def get_ap_int_max_w(self): instream = self.get_instream_width() outstream = self.get_outstream_width() ret = max([instream, outstream]) - assert ret <= 32768, "AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret + assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret return ret diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 4fed8ed4b5..bc59c69192 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -31,7 +31,7 @@ import subprocess import warnings from abc import abstractmethod -from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io +from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io, toggle_clk from qonnx.core.datatype import DataType from qonnx.custom_op.base import CustomOp from qonnx.util.basic import roundup_to_integer_multiple @@ -491,15 +491,11 @@ def exec_precompiled_singlenode_model(self): def reset_rtlsim(self, sim): """Sets reset input in pyverilator to zero, toggles the clock and set it back to one""" - sim.io.ap_rst_n = 0 - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 - sim.io.ap_rst_n = 1 + reset_rtlsim(sim) def toggle_clk(self, sim): """Toggles the clock input in pyverilator once.""" - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 + toggle_clk(sim) def hls_sname(self): """Get the naming convention used by Vitis HLS for stream signals @@ -604,6 +600,7 @@ def rtlsim_multi_io(self, sim, io_dict): trace_file=trace_file, 
sname=sname, liveness_threshold=pyverilate_get_liveness_threshold_cycles(), + do_reset=True, ) self.set_nodeattr("cycles_rtlsim", total_cycle_count) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 6699340cac..fd5751ef7d 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -31,20 +31,32 @@ import os import textwrap import warnings +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +import qonnx.custom_op.general.xnorpopcount as xp +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp from qonnx.util.basic import ( calculate_matvec_accumulator_range, interleave_matrix_outer_dim_from_partitions, roundup_to_integer_multiple, + qonnx_make_model ) -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) +import qonnx.core.data_layout as DataLayout +import finn.core.onnx_exec as oxe +from qonnx.transformation.infer_shapes import InferShapes +import onnx.numpy_helper as np_helper +from qonnx.transformation.general import GiveUniqueNodeNames + # ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) @@ -54,9 +66,8 @@ # the ... here can be any shape (representing groups of vectors) -class MatrixVectorActivation(HLSCustomOp): - """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch - function.""" +class MatrixVectorActivation(HWCustomOp): + """Abstraction layer for HW implementation of MatrixVectorActivation layers.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -67,7 +78,7 @@ def get_nodeattr_types(self): "SIMD": ("i", True, 0), "MW": ("i", True, 0), "MH": ("i", True, 0), - "resType": ("s", False, "lut", {"auto", "lut", "dsp"}), + "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}), "ActVal": ("i", False, 0), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), @@ -122,10 +133,14 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. 
"runtime_writeable_weights": ("i", False, 0, {0, 1}), - } + "preferred_impl_style" : ("s", False, "hls", {"hls", "rtl"}), + } my_attrs.update(super().get_nodeattr_types()) return my_attrs + def base_op_type(self): + return "MatrixVectorActivation" + def calc_wmem(self): """Calculates and returns WMEM.""" mw = self.get_nodeattr("MW") @@ -165,6 +180,61 @@ def infer_node_datatype(self, model): odt = self.get_output_datatype() model.set_tensor_datatype(node.output[0], odt) + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + # when performing FIFO insertion on an FC layer with ext weights, the ind + # parameter can be > 0 (referring to the weights) so handle that here + if ind == 0: + return DataType[self.get_nodeattr("inputDataType")] + elif ind == 1: + return DataType[self.get_nodeattr("weightDataType")] + else: + raise Exception("Undefined input ind for this layer type") + + def get_weight_datatype(self): + """Returns FINN DataType of weights.""" + return DataType[self.get_nodeattr("weightDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self, ind=0): + i_bits = self.get_input_datatype().bitwidth() + assert ( + i_bits <= 9 + ), "RTL-based MVAU only supports activations with bit-width up to 9-bits" + in_width = i_bits * self.get_nodeattr("SIMD") + return in_width + + def get_weightstream_width(self): + """Returns weight stream width. Used only in decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wp = self.get_weight_datatype().bitwidth() + assert ( + wp <= 8 + ), "RTL-based MVAU only supports weights with bit-width up to 8-bits" + w_width = pe * simd * wp + return w_width + else: + return 0 + + def get_outstream_width(self, ind=0): + o_bits = self.get_output_datatype().bitwidth() + out_width = o_bits * self.get_nodeattr("PE") + return out_width + + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. 
Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + def verify_node(self): info_messages = [] # verify that "backend" is set to "fpgadataflow" @@ -385,6 +455,25 @@ def dsp_estimation(self): else: mult_dsp = 0 return int(mult_dsp) +# # TODO: fix DSP estimations --> depends on fpga_part +# def dsp_estimation(self): +# # multiplication +# # mvu_8sx9 (DSP58): ceil(SIMD/3) +# # mvu_4sx4u (DSP48/DSP58): ceil(PE/4) +# # mvu_8sx8u (DSP48): ceil(PE/2) +# # mvu_lut: 0 +# P = self.get_nodeattr("PE") +# res_type = self.get_nodeattr("resType") +# Q = self.get_nodeattr("SIMD") +# wdt = self.get_weight_datatype() +# W = wdt.bitwidth() +# idt = self.get_input_datatype() +# A = idt.bitwidth() +# if res_type == "dsp": +# mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling +# else: +# mult_dsp = 0 +# return int(mult_dsp) def get_exp_cycles(self): pe = self.get_nodeattr("PE") @@ -397,6 +486,27 @@ def get_exp_cycles(self): exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) +# # TODO: fix exp_cycles estimations --> depends on fpga_part and clk +# def get_exp_cycles(self): +# # mvu_8sx9 (DSP58): +# # 2 (replay_buffer) + ceil(chainlen/seglen) + 2 (MREG, PREG) + 2 (output reg slice) +# # + MW/SIMD * MH/PE +# # mvu_4sx4u (DSP48/DSP58) / mvu_8sx8u (DSP48): +# # 3 (IN_REG, MREG, PREG) + 2 (replay_buffer) + 2 (output reg slice) + 1 (adder tree SIMD) + 1 (output lane) +# # + MW/SIMD * MH/PE +# # mvu_lut: +# # 2 (replay_buffer) + 1 OR 2 (no MREG OR MREG) + 2 (output reg slice) +# # + MW/SIMD * MH/PE +# pe = self.get_nodeattr("PE") +# simd = self.get_nodeattr("SIMD") +# num_inp_vec = self.get_nodeattr("numInputVectors") +# mh = self.get_nodeattr("MH") +# mw = self.get_nodeattr("MW") +# # since mmv != 1 is not supported yet, we set mmv for now to 1 +# mmv = 1 +# exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv +# return int(exp_cycles) + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" # when performing FIFO insertion on an FC layer with ext weights, the ind @@ -450,17 +560,6 @@ def get_weightstream_width_padded(self): weight_width = self.get_weightstream_width() return roundup_to_integer_multiple(weight_width, 8) - def get_ap_int_max_w(self): - # base class impl (max of inp/out stream widths) - max_of_io = super().get_ap_int_max_w() - # decoupled mode weight stream - weightstream = self.get_weightstream_width() - # single PE weight entry - weight_bits = self.get_weight_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - single_pe_w = simd * weight_bits - return max([weightstream, max_of_io, single_pe_w]) - def get_folded_input_shape(self, ind=0): mw = self.get_nodeattr("MW") mh = self.get_nodeattr("MH") @@ -505,82 +604,6 @@ def get_number_output_values(self): nf = np.prod(self.get_folded_output_shape()[:-1]) return nf - def get_template_param_values(self): - """Returns the template parameter values according to input, output and weight - data types.""" - ret = dict() - inp_hls_str = self.get_input_datatype().get_hls_datatype_str() - out_hls_str = self.get_output_datatype().get_hls_datatype_str() - inp_is_binary = self.get_input_datatype() == DataType["BINARY"] - # out_is_binary = self.get_output_datatype() == DataType["BINARY"] - wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] - bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 - if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): - raise Exception("True 
binary (non-bipolar) inputs not yet supported") - inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] - # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] - wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] - # reinterpret inp/wt as bipolar if bin_xnor_mode is iset - inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) - wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - # fill in TSrcI and TWeightI - # TODO check these with Giulio - # TODO handle non-bipolar binary inputs - if inp_is_bipolar and wt_is_bipolar: - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and wt_is_bipolar: - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Recast" - elif inp_is_bipolar and (not wt_is_bipolar): - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and (not wt_is_bipolar): - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Identity" - - # fill in TDstI - ret["TDstI"] = "Slice<%s>" % out_hls_str - - return ret - - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 and MW % SIMD == 0 - * for bipolar {-1,+1} weights, convert to binary {0, 1} - * interleave rows between PEs - * reshape into (1, PE, WMEM, SIMD) and return - """ - mw = self.get_nodeattr("MW") - mh = self.get_nodeattr("MH") - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - wmem = self.calc_wmem() - assert orig_weight_matrix.shape == ( - mw, - mh, - ), """Weights matrix doesn't - have expected shape (mw, mh)""" - assert mw % simd == 0, "Requirement MH divisable by SIMD is violated." - assert mh % pe == 0, "Requirement MH divisable by PE is violated." 
-        # start by transposing the original weight matrix, since ONNX and
-        # finn-hlslib use different assumptions
-        # ONNX uses (in_features, out_features) and matmul(x, W)
-        # finn-hlslib uses (out_features, in_features) and matmul(W, x)
-        ret = orig_weight_matrix.T
-        if self.get_weight_datatype() == DataType["BIPOLAR"]:
-            # convert bipolar to binary
-            ret = (ret + 1) / 2
-        # interleave rows between PEs and reshape
-        # distribute rows between PEs
-        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
-        # create SIMD as innermost dimension and add a dummy outer dim
-        ret = ret.reshape(1, pe, wmem, simd)
-        # reverse the SIMD dimension
-        ret = np.flip(ret, axis=-1)
-        return ret
-
     def minimize_accumulator_width(self, model):
         """Minimize the accumulator bit width according to the weight values, input
         data types, and size of dot product"""
@@ -728,6 +751,43 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
         rows between PEs is not as expected (n_thres_steps)"""
         return ret.reshape(1, pe, tmem, n_thres_steps)
 
+    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
+        """Convert the original numpy weight matrix orig_weight_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure MH % PE == 0 and MW % SIMD == 0
+        * for bipolar {-1,+1} weights, convert to binary {0, 1}
+        * interleave rows between PEs
+        * reshape into (1, PE, WMEM, SIMD) and return
+        """
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            mw,
+            mh,
+        ), """Weights matrix doesn't
+        have expected shape (mw, mh)"""
+        assert mw % simd == 0, "Requirement MW divisible by SIMD is violated."
+        assert mh % pe == 0, "Requirement MH divisible by PE is violated."
+        # start by transposing the original weight matrix, since ONNX and
+        # finn-hlslib use different assumptions
+        # ONNX uses (in_features, out_features) and matmul(x, W)
+        # finn-hlslib uses (out_features, in_features) and matmul(W, x)
+        ret = orig_weight_matrix.T
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
+            # convert bipolar to binary
+            ret = (ret + 1) / 2
+        # interleave rows between PEs and reshape
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        # create SIMD as innermost dimension and add a dummy outer dim
+        ret = ret.reshape(1, pe, wmem, simd)
+        # reverse the SIMD dimension
+        ret = np.flip(ret, axis=-1)
+        return ret
+
     def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         """Produce a file containing given weights in appropriate format for this
         layer. This file can be used for either synthesis or run-time reconfig
@@ -905,402 +965,68 @@
             f_thresh.write(thresholds_hls_code)
             f_thresh.close()
 
-    def execute_node(self, context, graph):
-        mode = self.get_nodeattr("exec_mode")
-        mem_mode = self.get_nodeattr("mem_mode")
-        node = self.onnx_node
-
-        # TODO ensure codegen dir exists
-        if mode == "cppsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        elif mode == "rtlsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        else:
-            raise Exception(
-                """Invalid value for attribute exec_mode!
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - # the third input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for MatrixVectorActivation") - in_ind += 1 - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - # reinterpret binary output as bipolar where needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == self.get_normal_output_shape() - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - if mem_mode == "external" or mem_mode == "decoupled": - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - # we have converted bipolar weights to binary for export, - # so use it as such for weight generation - if self.get_weight_datatype() == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict = { - "inputs": {"in0": inp, "weights": wei * num_w_reps}, - "outputs": {"out": []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] - else: - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] - self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode not in ["const", "decoupled", "external"]: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"'] - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] - - def defines(self, var): - # Only ipgen mode: Make sure that SIMD parameter satisfies minimum requirements. - if var == "ipgen": - SIMD = self.get_nodeattr("SIMD") - MW = self.get_nodeattr("MW") - condition = SIMD >= (MW / 1024) - msg = ( - f"HLS synthesis of MatrixVectorActivation requires: " - f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} " - f"and MW={MW} for node: {self.onnx_node.name}." - ) - assert condition, msg - mem_mode = self.get_nodeattr("mem_mode") - numInputVectors = list(self.get_nodeattr("numInputVectors")) - numReps = np.prod(numInputVectors) - self.code_gen_dict["$DEFINES$"] = [ - """#define MW1 {}\n #define MH1 {}\n - #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n - #define TMEM1 {}\n #define numReps {}""".format( - self.get_nodeattr("MW"), - self.get_nodeattr("MH"), - self.get_nodeattr("SIMD"), - self.get_nodeattr("PE"), - self.calc_wmem(), - self.calc_tmem(), - numReps, - ) - ] - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - # note: the innermost dim is reversed for the input - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - elem_bits = wdt.bitwidth() - packed_bits = self.get_weightstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = wdt.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/weights.npy" % code_gen_dir - - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 
'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
-                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
-            )
-        )
-
-        if mem_mode == "decoupled" or mem_mode == "external":
-            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-                'hls::stream<ap_uint<{}>> weights_{} ("weights_{}");'.format(
-                    self.get_weightstream_width(), self.hls_sname(), self.hls_sname()
-                )
-            )
+    def get_op_and_param_counts(self):
+        in_features = self.get_nodeattr("MW")
+        out_features = self.get_nodeattr("MH")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        inp_bits = self.get_input_datatype().bitwidth()
+        num_inp_vec = self.get_nodeattr("numInputVectors")
+        num_repetitions = int(np.prod(num_inp_vec))
+        mac_count = in_features * out_features * num_repetitions
+        # canonicalize op type: highest bitwidth operand first s.t.
+        # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
+        bw1 = min(inp_bits, weight_bits)
+        bw2 = max(inp_bits, weight_bits)
+        mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
+        weight_param_type = "param_weight_%db" % (weight_bits)
+        weight_count = in_features * out_features
+        ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
+        if self.get_nodeattr("noActivation") == 0:
+            tdt = DataType[self.get_nodeattr("accDataType")]
+            thres_bits = tdt.bitwidth()
+            thres_param_type = "param_threshold_%db" % (thres_bits)
+            thres_count = out_features
+            ret_dict[thres_param_type] = thres_count
+        return ret_dict
 
-    def docompute(self):
-        mem_mode = self.get_nodeattr("mem_mode")
-        map_to_hls_mult_style = {
-            "auto": "ap_resource_dflt()",
-            "lut": "ap_resource_lut()",
-            "dsp": "ap_resource_dsp()",
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
         }
-        tmpl_args = self.get_template_param_values()
-        if self.calc_tmem() == 0:
-            odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
-            threshs = "PassThroughActivation<%s>()" % odtype_hls_str
-        else:
-            threshs = "threshs"
-        if mem_mode == "const":
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """Matrix_Vector_Activate_Batch<MW1, MH1, SIMD1, PE1, 1, {}, {}, {}>
-                (in0_{}, out_{}, weights, {}, numReps, {});""".format(
-                    tmpl_args["TSrcI"],
-                    tmpl_args["TDstI"],
-                    tmpl_args["TWeightI"],
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    threshs,
-                    map_to_hls_mult_style[self.get_nodeattr("resType")],
-                )
-            ]
-        elif mem_mode == "decoupled" or mem_mode == "external":
-            wdt = self.get_weight_datatype()
-            if wdt == DataType["BIPOLAR"]:
-                export_wdt = DataType["BINARY"]
-            else:
-                export_wdt = wdt
-            wdtype_hls_str = export_wdt.get_hls_datatype_str()
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """Matrix_Vector_Activate_Stream_Batch<MW1, MH1, SIMD1, PE1, {}, {}, {}, {}>
-                (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format(
-                    tmpl_args["TSrcI"],
-                    tmpl_args["TDstI"],
-                    tmpl_args["TWeightI"],
-                    wdtype_hls_str,
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    threshs,
-                    map_to_hls_mult_style[self.get_nodeattr("resType")],
-                )
-            ]
-
-        else:
-            raise Exception(
-                """Please set mem_mode to "const", "decoupled", or "external",
-                currently no other parameter value is supported!"""
-            )
-
-    def dataoutstrm(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_output_datatype()
-        if dtype == DataType["BIPOLAR"]:
-            # use binary for bipolar storage
-            dtype = DataType["BINARY"]
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_outstream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type =
"float" - npy_out = "%s/output.npy" % code_gen_dir - shape = self.get_folded_output_shape() - shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") - - # note: the innermost dim is not reversed for the output - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - shape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}( - hls::stream> &in0_{}, - hls::stream> &weights_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_weightstream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - - else: - raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" - ) - - def pragmas(self): mem_mode = self.get_nodeattr("mem_mode") - ram_style_thresholds = self.get_nodeattr("ram_style_thresholds") - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - if mem_mode == "const": - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") - ) - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() - ) + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + def execute_node(self, context, graph): + node = self.onnx_node + in_act = context[node.input[0]] + mvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] + mvau_w = np_helper.to_array(mvau_w_init) + # Matrix multiplication + if self.get_nodeattr("binaryXnorMode"): + # Note: activation/weights are expected to be binary (by design coming from the transformation inferring this operation mode) + result = xp.xnorpopcountmatmul(in_act, mvau_w) + elif (self.get_nodeattr("inputDataType") == "BIPOLAR" and self.get_nodeattr("weightDataType") == "BIPOLAR"): + result = xp.xnorpopcountmatmul((in_act+1)/2, (mvau_w+1)/2) else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or external, - currently no other parameter value is supported!""" - ) - - # the threshold tensor is acc_type [PE][TMEM][N_THRES] - # partition for parallel access along PE and N_THRES - # dimensions (dims 1 and 3) - if 
+        # Matrix multiplication
+        if self.get_nodeattr("binaryXnorMode"):
+            # activations/weights are guaranteed to be binary here by the
+            # transformation that infers this operation mode
+            result = xp.xnorpopcountmatmul(in_act, mvau_w)
+        elif (
+            self.get_nodeattr("inputDataType") == "BIPOLAR"
+            and self.get_nodeattr("weightDataType") == "BIPOLAR"
+        ):
+            result = xp.xnorpopcountmatmul((in_act + 1) / 2, (mvau_w + 1) / 2)
         else:
-            raise Exception(
-                """Please set mem_mode to "const", "decoupled", or external,
-                currently no other parameter value is supported!"""
-            )
-
-        # the threshold tensor is acc_type [PE][TMEM][N_THRES]
-        # partition for parallel access along PE and N_THRES
-        # dimensions (dims 1 and 3)
-        if self.calc_tmem() != 0:
-            # TODO find a better way of checking for no pregenerated thresholds
-            self.code_gen_dict["$PRAGMAS$"].append(
-                ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1")
-            )
-            self.code_gen_dict["$PRAGMAS$"].append(
-                ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3")
-            )
-            # add resource pragma for thresholds if set
-            if ram_style_thresholds == "distributed":
-                self.code_gen_dict["$PRAGMAS$"].append(
-                    ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM")
-                )
-            elif ram_style_thresholds == "block":
-                self.code_gen_dict["$PRAGMAS$"].append(
-                    ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM")
-                )
-            elif ram_style_thresholds == "auto":
-                # no pragma needed
-                pass
-            else:
-                raise Exception("Unrecognized ram_style_thresholds value:" + ram_style_thresholds)
+            result = np.matmul(in_act, mvau_w)
+        # thresholding if noActivation == 0
+        if self.get_nodeattr("noActivation") == 0:
+            mvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0]
+            mvau_thr = np_helper.to_array(mvau_thr_init)
+            odt_is_bipolar = DataType[self.get_nodeattr("outputDataType")] == DataType["BIPOLAR"]
+            out_scale = 2 if odt_is_bipolar else 1
+            out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal")
+            result = multithreshold(result, mvau_thr, out_scale, out_bias)
+
+        context[node.output[0]] = result

     def code_generation_ipi(self):
         cmd = []
@@ -1324,22 +1050,51 @@ def code_generation_ipi(self):
         cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
         cmd.append(
             "create_bd_intf_pin -mode Master "
-            "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name)
+            "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+            % (node_name, dout_name)
         )
         cmd.append(
             "create_bd_intf_pin -mode Slave "
             "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
         )
-        # instantiate the hls ip
-        cmd.append(
-            "create_bd_cell -type ip -vlnv %s /%s/%s"
-            % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
-        )
+        is_rtl_op = self.__class__.__name__ == "MatrixVectorActivation_rtl"
+        if is_rtl_op:
+            # instantiate the RTL block
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+            sourcefiles = [
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                ),
+                rtllib_dir + "mvu_vvu_axi.sv",
+                rtllib_dir + "replay_buffer.sv",
+                rtllib_dir + "mvu_4sx4u.sv",
+                rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
+                rtllib_dir + "mvu_8sx8u_dsp48.sv",
+            ]
+            for f in sourcefiles:
+                cmd.append("add_files -norecurse %s" % (f))
+            cmd.append(
+                "create_bd_cell -type hier -reference %s /%s/%s"
+                % (
+                    self.get_nodeattr("gen_top_module"),
+                    self.onnx_node.name,
+                    self.onnx_node.name,
+                )
+            )
+        else:
+            # instantiate the hls ip
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
+            )
+
         # instantiate a streamer and connect it to the HLS IP
         strm_vlnv = "amd.com:finn:memstream:1.0"
         strm_inst = node_name + "_wstrm"
         cmd.append(
-            "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst)
+            "create_bd_cell -type ip -vlnv %s /%s/%s"
+            % (strm_vlnv, node_name, strm_inst)
         )
         cmd.append(
             "set_property -dict [list "
@@ -1393,7 +1148,8 @@ def code_generation_ipi(self):
         axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0]
         cmd.append(
             "create_bd_intf_pin -mode Slave "
-            "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % 
(node_name, axilite_name) + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" + % (node_name, axilite_name) ) cmd.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " @@ -1404,60 +1160,32 @@ def code_generation_ipi(self): cmd.append("assign_bd_address") cmd.append("save_bd_design") elif mem_mode == "const" or mem_mode == "external": - # base class impl sufficient for const/external modes - return super().code_generation_ipi() + if is_rtl_op and mem_mode == "external": + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type module -reference %s %s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + ) + ) + else: + # base class impl sufficient for const/external modes + return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") - return cmd - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - mem_mode = self.get_nodeattr("mem_mode") - sname = self.hls_sname() - if mem_mode == "external": - intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "decoupled": - # only expose axilite interface if attribute is set - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - if runtime_writable: - intf_names["axilite"] = ["s_axilite"] - return intf_names - - def get_op_and_param_counts(self): - in_features = self.get_nodeattr("MW") - out_features = self.get_nodeattr("MH") - weight_bits = self.get_weight_datatype().bitwidth() - inp_bits = self.get_input_datatype().bitwidth() - num_inp_vec = self.get_nodeattr("numInputVectors") - num_repetitions = int(np.prod(num_inp_vec)) - mac_count = in_features * out_features * num_repetitions - # cannonicalize op type: highest bitwidth operand first s.t. - # e.g. 
mac_8bx4b and mac_4bx8b don't appear as two different op types - bw1 = min(inp_bits, weight_bits) - bw2 = max(inp_bits, weight_bits) - mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) - weight_param_type = "param_weight_%db" % (weight_bits) - weight_count = in_features * out_features - ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} - if self.get_nodeattr("noActivation") == 0: - tdt = DataType[self.get_nodeattr("accDataType")] - thres_bits = tdt.bitwidth() - thres_param_type = "param_threshold_%db" % (thres_bits) - thres_count = out_features - ret_dict[thres_param_type] = thres_count - return ret_dict - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", "external"]: - n_weight_inps = self.calc_wmem() - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + return cmd \ No newline at end of file diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index bd5bb75f1d..2168474298 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -38,17 +38,21 @@ roundup_to_integer_multiple, ) -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) +import onnx.numpy_helper as np_helper +import qonnx.custom_op.general.xnorpopcount as xp +from qonnx.custom_op.general.multithreshold import multithreshold -class VectorVectorActivation(HLSCustomOp): - """Class that corresponds to finn-hlslib Vector_Vector_Activate_Batch function""" + +class VectorVectorActivation(HWCustomOp): + """Abstraction layer for HW implementation of VectorVectorActivation layers.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -100,128 +104,66 @@ def get_nodeattr_types(self): # use xnor-popcount for binary weights/inputs, thus treating them # as bipolar "binaryXnorMode": ("i", False, 0, {0, 1}), + # Backend implementation for layer + # hls -- Vivado HLS + # rtl -- (System)Verilog + "preferred_impl_style": ("s", False, "hls", {"hls", "rtl"}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs - def minimize_accumulator_width(self, model): - """Minimize the accumulator bit width according to the weight values, - input data types, and size of dot product""" - weights = model.get_initializer(self.onnx_node.input[1]) - k_h, k_w = self.get_nodeattr("Kernel") - fm = self.get_nodeattr("Channels") - # put weights into the shape expected by calculate_matvec_accumulator_range - weights = weights.reshape(fm, k_h * k_w).transpose() - # since in the calculation the values of the weight matrix are used, - # for the bipolar case they need to be converted to bipolar - if self.get_nodeattr("binaryXnorMode"): - weights = 2 * weights - 1 - if len(self.onnx_node.input) > 2: - thresholds = model.get_initializer(self.onnx_node.input[2]) - else: - thresholds = None - idt = self.get_input_datatype() - - (acc_min, acc_max) = 
calculate_matvec_accumulator_range(weights, idt)
-        # if runtime-writeable weights, then the values of the weights can
-        # change and we need to use the worst-case values from the datatypes
-        if self.get_nodeattr("runtime_writeable_weights"):
-            wdt = self.get_weight_datatype()
-            lower_worst = wdt.min() * np.ones_like(weights)
-            lower_range = calculate_matvec_accumulator_range(lower_worst, idt)
-            upper_worst = wdt.max() * np.ones_like(weights)
-            upper_range = calculate_matvec_accumulator_range(upper_worst, idt)
-            acc_min = min(min(lower_range), min(upper_range))
-            acc_max = max(max(upper_range), max(upper_range))
+    def base_op_type(self):
+        return "VectorVectorActivation"
-        # if the thresholds can be used to determine range, then adjust the range
-        # according to the known values of the thresholds
-        if thresholds is not None:
-            threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
-            # set threshold datatype (and accumulator datatype implicitly)
-            min_threshold = thresholds.min()
-            max_threshold = thresholds.max()
-            # clip threshold values
-            if max_threshold > acc_max or min_threshold < acc_min:
-                warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name)
-                thresholds = np.clip(thresholds, acc_min, acc_max)
-                model.set_initializer(self.onnx_node.input[2], thresholds)
-                threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
-                min_threshold = thresholds.min()
-                max_threshold = thresholds.max()
-            acc_min = min(min_threshold, acc_min)
-            acc_max = max(max_threshold, acc_max)
+    def _infer_sparse_weight_tensor(self, W_conv, k_h, k_w, channels):
+        W_sparse = np.zeros((channels, channels, k_h, k_w), dtype=np.float32)
+        for ch in range(channels):
+            W_sparse[ch][ch] = W_conv[ch][0]
+        W_conv = W_sparse.astype(np.float32)
+        W_matmul = W_conv.transpose(0, 2, 3, 1)
+        W_matmul = W_matmul.reshape(channels, channels * k_h * k_w)
+        W_matmul = W_matmul.T
+        return W_matmul
-        # if the acc_range is always greater than 0, then acc_max <= 2^P - 1
-        if acc_min >= 0:
-            acc_bit_width = np.log2(acc_max + 1)
-            acc_bit_width = math.ceil(acc_bit_width)
-            adt = DataType[f"UINT{acc_bit_width}"]
-        # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <=
-        # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max)
+    def execute_node(self, context, graph):
+        node = self.onnx_node
+        in_act = context[node.input[0]]
+        (_, dim_h, dim_w, _) = in_act.shape
+        (k_h, k_w) = self.get_nodeattr("Kernel")
+        channels = self.get_nodeattr("Channels")
+        # reorder input activations so the innermost dims become
+        # (k_h * k_w, channels) before flattening back to NHWC layout
+        in_act = in_act.reshape(1, dim_h, dim_w, channels, k_h * k_w)
+        in_act = in_act.transpose(0, 1, 2, 4, 3)
+        in_act = in_act.reshape(1, dim_h, dim_w, channels * k_h * k_w)
+        # expand the sparse depthwise weights into a dense matmul matrix
+        vvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0]
+        vvau_w = np_helper.to_array(vvau_w_init)
+        vvau_w_onnx = self._infer_sparse_weight_tensor(vvau_w, k_h, k_w, channels)
+
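+        # np.matmul on bipolar {-1, +1} operands yields the true dot product
+        # d over K = k_h * k_w elements, while the hardware accumulates the
+        # XNOR-popcount value p; the two are related by d = 2 * p - K, so the
+        # branch below recovers p as (d + K) / 2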
+        if (
+            self.get_nodeattr("inputDataType") == "BIPOLAR"
+            and self.get_nodeattr("weightDataType") == "BIPOLAR"
+        ):
+            result = np.matmul(in_act, vvau_w_onnx)
+            result = (result + k_h * k_w) / 2
         else:
-            _acc_max = max(-acc_min, 1 + acc_max)
-            acc_bit_width = np.log2(_acc_max) + 1
-            acc_bit_width = math.ceil(acc_bit_width)
-            adt = DataType[f"INT{acc_bit_width}"]
-
-        # if activation, assert that the thresholds can be expressed with adt
-        if thresholds is not None:
-            assert np.vectorize(adt.allowed)(
-                threshold_tensor
-            ).all(), "Thresholds in %s can't be expressed with type %s" % (
-                self.onnx_node.name,
-                str(adt),
-            )
-
-        # if no activation, output and accumulator datatypes are the same
-        if self.get_nodeattr("noActivation"):
-            # if this is the last node in the graph, then ensure the datatype is
-            # divisibly by 8 bits
-            if model.find_direct_successors(self.onnx_node) is None:
-                bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
-                new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
-                adt = DataType[new_adt_name]
-            # for no-activation nodes, output dt = acc dt
-            self.set_nodeattr("outputDataType", adt.name)
-            self.set_nodeattr("accDataType", adt.name)
-
-        return DataType[self.get_nodeattr("accDataType")]
-
-    def minimize_weight_bit_width(self, model):
-        """Minimize the bit width based on the values of the weights"""
-        if not self.get_nodeattr("runtime_writeable_weights"):
-            weights = model.get_initializer(self.onnx_node.input[1])
-            w_min = weights.min()
-            w_max = weights.max()
-            if w_min < 0:
-                if abs(w_min) > w_max:
-                    wdt = DataType.get_smallest_possible(w_min)
-                else:
-                    wdt = DataType.get_smallest_possible(-w_max - 1)
-            else:
-                wdt = DataType.get_smallest_possible(w_max)
-            self.set_nodeattr("weightDataType", wdt.name)
-        return DataType[self.get_nodeattr("weightDataType")]
-
-    def calc_wmem(self):
-        """Calculates and returns WMEM."""
-        ch = self.get_nodeattr("Channels")
-        k_h, k_w = self.get_nodeattr("Kernel")
-        pe = self.get_nodeattr("PE")
-        simd = self.get_nodeattr("SIMD")
-        wmem = (k_h * k_w * ch // pe) // simd
-        return wmem
+            result = np.matmul(in_act, vvau_w_onnx)  # result is in [N, H, W, C] format
-    def calc_tmem(self):
-        """Calculates and returns TMEM."""
-        if self.get_nodeattr("noActivation") == 1:
-            return 0
-        else:
-            ch = self.get_nodeattr("Channels")
-            pe = self.get_nodeattr("PE")
-            return ch // pe
+        if self.get_nodeattr("noActivation") == 0:
+            vvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0]
+            vvau_thr = np_helper.to_array(vvau_thr_init)
+            odt_is_bipolar = DataType[self.get_nodeattr("outputDataType")] == DataType["BIPOLAR"]
+            out_scale = 2 if odt_is_bipolar else 1
+            out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal")
+            # NHWC to NCHW for multithreshold node
+            result = result.transpose((0, 3, 1, 2))
+            result = multithreshold(result, vvau_thr, out_scale, out_bias)
+            # NCHW to NHWC
+            result = result.transpose((0, 2, 3, 1))
+
+        context[node.output[0]] = result
+
+    def verify_node(self):
+        pass
+
     def make_shape_compatible_op(self, model):
         oshape = self.get_normal_output_shape()
         return super().make_const_shape_op(oshape)
@@ -241,9 +183,6 @@ def infer_node_datatype(self, model):
         odt = self.get_output_datatype()
         model.set_tensor_datatype(node.output[0], odt)

-    def verify_node(self):
-        pass
-
     def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
@@ -266,12 +205,32 @@ def get_instream_width(self, ind=0):
         pe = self.get_nodeattr("PE")
         in_width = i_bits * simd * pe
         return in_width
+
+    def get_weightstream_width(self):
+        """Returns weight stream width. 
Used only in decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + w_width = simd * pe * wp + return w_width + else: + return 0 def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() out_width = o_bits * self.get_nodeattr("PE") return out_width + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + def get_folded_input_shape(self, ind=0): k_h, k_w = self.get_nodeattr("Kernel") dim_h, dim_w = self.get_nodeattr("Dim") @@ -320,89 +279,303 @@ def get_number_output_values(self): nf = np.prod(self.get_folded_output_shape()[:-1]) return nf - def get_exp_cycles(self): - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") + def calc_wmem(self): + """Calculates and returns WMEM.""" ch = self.get_nodeattr("Channels") - dim_h, dim_w = self.get_nodeattr("Dim") k_h, k_w = self.get_nodeattr("Kernel") - # currently FINN supports for vvau a batch size of 1 - batch_size = 1 - # since mmv != 1 is not supported yet, we set mmv for now to 1 - mmv = 1 - exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv - return int(exp_cycles) + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wmem = (k_h * k_w * ch // pe) // simd + return wmem - def get_template_param_values(self): - """Returns the template parameter values according to input, output and weight - data types.""" - ret = dict() - inp_hls_str = self.get_input_datatype().get_hls_datatype_str() - out_hls_str = self.get_output_datatype().get_hls_datatype_str() - inp_is_binary = self.get_input_datatype() == DataType["BINARY"] - # out_is_binary = self.get_output_datatype() == DataType["BINARY"] - wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] - bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 - if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): - raise Exception("True binary (non-bipolar) inputs not yet supported") - inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] - # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] - wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] - # reinterpret inp/wt as bipolar if bin_xnor_mode is iset - inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) - wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - # fill in TSrcI and TWeightI - # TODO check these with Giulio - # TODO handle non-bipolar binary inputs - if inp_is_bipolar and wt_is_bipolar: - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and wt_is_bipolar: - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Recast" - elif inp_is_bipolar and (not wt_is_bipolar): - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and (not wt_is_bipolar): - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Identity" + def calc_tmem(self): + """Calculates and returns TMEM.""" + if self.get_nodeattr("noActivation") == 1: + return 0 + else: + ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + return ch // pe - # fill in TDstI - ret["TDstI"] = "Slice<%s>" % out_hls_str + def uram_estimation(self): 
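+        # estimate sketch: the weights form omega (= wmem) words of
+        # mem_width = Q * W * P bits each (Q = SIMD, W = weight bits, P = PE
+        # below); one UltraRAM block is 72 bits wide and 4096 entries deep,
+        # hence the ceil(mem_width / 72) * ceil(omega / 4096) count below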
+ P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle != "ultra") + or (mmode == "const") + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier - return ret + def bram_estimation(self): + """Calculates resource estimation for BRAM""" + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + # assuming SDP mode RAMB18s (see UG573 Table 1-10) + # since this is HLS memory, not using the full width of a BRAM + # assuming memories up to 128 deep get implemented in LUTs + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) + or (mstyle == "auto" and self.calc_wmem() <= 128) + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - ch = self.get_nodeattr("Channels") - k_h, k_w = self.get_nodeattr("Kernel") - wmem = self.calc_wmem() - assert orig_weight_matrix.shape == ( - ch, - 1, - k_h, - k_w, - ), """Weights matrix doesn't - have expected shape (channels, 1, kernel_size, kernel_size)""" - ret = orig_weight_matrix - if self.get_weight_datatype() == DataType["BIPOLAR"]: - # convert bipolar to binary - ret = (ret + 1) / 2 - ret = ret.reshape(ch, k_h * k_w) - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - ret = ret.reshape(1, pe, wmem, simd) - return ret + if mem_width == 1: + return math.ceil(omega / 16384) + elif mem_width == 2: + return math.ceil(omega / 8192) + elif mem_width <= 4: + return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) + elif mem_width <= 9: + return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8)) + elif mem_width <= 18 or omega > 512: + return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16)) + else: + return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32)) - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 - * for bipolar weights&inputs, ensure thresholds are positive - * interleave rows between PEs - * reshape into (PE, TMEM, n_thres_steps) and return - """ + def bram_efficiency_estimation(self): + P = self.get_nodeattr("PE") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + bram16_est = self.bram_estimation() + if bram16_est == 0: + return 1 + wbits = W * P * omega + bram16_est_capacity = bram16_est * 36 * 512 + return wbits / bram16_est_capacity + + def uram_efficiency_estimation(self): + """Function for URAM efficiency estimation: actual parameter storage + needed divided by the allocated URAM storage (from estimation)""" + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = int(np.prod(self.get_nodeattr("Kernel"))) + D_out = self.get_nodeattr("Channels") + uram_est = self.uram_estimation() + if uram_est == 0: + return 1 + 
wbits = W * D_in * D_out + uram_est_capacity = uram_est * 72 * 4096 + return wbits / uram_est_capacity + + def lut_estimation(self): + """Calculates resource estimations for LUTs based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + # determine tdt with input and weight data types + idt = self.get_input_datatype() + A = idt.bitwidth() + # parameters from experiments in paper mentioned above + c0 = 300 + c1 = 1.1 + c2 = 0 + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) + + # multiplication + res_type = self.get_nodeattr("resType") + if res_type == "dsp": + mult_luts = 0 + else: + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) + # accumulator + acc_datatype = self.get_accumulator_datatype() + acc_bits = acc_datatype.bitwidth() + k_h, k_w = self.get_nodeattr("Kernel") + # if accDataType is not set, then it will default to INT32, which would + # be a large overestimate in most (if not all) cases. In this scenario, + # we would use the minimum accumulator as determined by the data types + # bound, derived in https://arxiv.org/abs/2301.13376 + alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed()) + acc_bits = min( + acc_datatype.bitwidth(), + np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), + ) + acc_luts = acc_bits + # thresholds and threshold comparators + thr_luts = 0 + comp_luts = 0 + noact = self.get_nodeattr("noActivation") + # TODO - add 'ram_style_threshold' node attribute + if noact == 0: + odt = self.get_output_datatype() + B = odt.bitwidth() + thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64 + comp_luts = (2**B - 1) * acc_bits + + return int( + c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 + ) + + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + res_type = self.get_nodeattr("resType") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + idt = self.get_input_datatype() + A = idt.bitwidth() + if res_type == "dsp": + mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling + else: + mult_dsp = 0 + return int(mult_dsp) + + def get_exp_cycles(self): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + ch = self.get_nodeattr("Channels") + dim_h, dim_w = self.get_nodeattr("Dim") + k_h, k_w = self.get_nodeattr("Kernel") + # currently FINN supports for vvau a batch size of 1 + batch_size = 1 + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv + return int(exp_cycles) + + def minimize_accumulator_width(self, model): + """Minimize the accumulator bit width according to the weight values, + input data types, and size of dot product""" + weights = model.get_initializer(self.onnx_node.input[1]) + k_h, k_w = self.get_nodeattr("Kernel") + fm = self.get_nodeattr("Channels") + # put weights into the shape expected by calculate_matvec_accumulator_range + weights = 
weights.reshape(fm, k_h * k_w).transpose()
+        # since in the calculation the values of the weight matrix are used,
+        # for the bipolar case they need to be converted to bipolar
+        if self.get_nodeattr("binaryXnorMode"):
+            weights = 2 * weights - 1
+        if len(self.onnx_node.input) > 2:
+            thresholds = model.get_initializer(self.onnx_node.input[2])
+        else:
+            thresholds = None
+        idt = self.get_input_datatype()
+
+        (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt)
+        # if runtime-writeable weights, then the values of the weights can
+        # change and we need to use the worst-case values from the datatypes
+        if self.get_nodeattr("runtime_writeable_weights"):
+            wdt = self.get_weight_datatype()
+            lower_worst = wdt.min() * np.ones_like(weights)
+            lower_range = calculate_matvec_accumulator_range(lower_worst, idt)
+            upper_worst = wdt.max() * np.ones_like(weights)
+            upper_range = calculate_matvec_accumulator_range(upper_worst, idt)
+            acc_min = min(min(lower_range), min(upper_range))
+            acc_max = max(max(lower_range), max(upper_range))
+
+        # if the thresholds can be used to determine range, then adjust the range
+        # according to the known values of the thresholds
+        if thresholds is not None:
+            threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+            # set threshold datatype (and accumulator datatype implicitly)
+            min_threshold = thresholds.min()
+            max_threshold = thresholds.max()
+            # clip threshold values
+            if max_threshold > acc_max or min_threshold < acc_min:
+                warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name)
+                thresholds = np.clip(thresholds, acc_min, acc_max)
+                model.set_initializer(self.onnx_node.input[2], thresholds)
+                threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+                min_threshold = thresholds.min()
+                max_threshold = thresholds.max()
+            acc_min = min(min_threshold, acc_min)
+            acc_max = max(max_threshold, acc_max)
+
+        # if the acc_range is always greater than 0, then acc_max <= 2^P - 1
+        if acc_min >= 0:
+            acc_bit_width = np.log2(acc_max + 1)
+            acc_bit_width = math.ceil(acc_bit_width)
+            adt = DataType[f"UINT{acc_bit_width}"]
+        # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <=
+        # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max)
+        else:
+            _acc_max = max(-acc_min, 1 + acc_max)
+            acc_bit_width = np.log2(_acc_max) + 1
+            acc_bit_width = math.ceil(acc_bit_width)
+            adt = DataType[f"INT{acc_bit_width}"]
+
+        # if activation, assert that the thresholds can be expressed with adt
+        if thresholds is not None:
+            assert np.vectorize(adt.allowed)(
+                threshold_tensor
+            ).all(), "Thresholds in %s can't be expressed with type %s" % (
+                self.onnx_node.name,
+                str(adt),
+            )
+
+        # if no activation, output and accumulator datatypes are the same
+        if self.get_nodeattr("noActivation"):
+            # if this is the last node in the graph, then ensure the datatype is
+            # divisible by 8 bits
+            if model.find_direct_successors(self.onnx_node) is None:
+                bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
+                new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
+                adt = DataType[new_adt_name]
+            # for no-activation nodes, output dt = acc dt
+            self.set_nodeattr("outputDataType", adt.name)
+            self.set_nodeattr("accDataType", adt.name)
+
+        return DataType[self.get_nodeattr("accDataType")]
+
+    def minimize_weight_bit_width(self, model):
+        """Minimize the bit width based on the values of the weights"""
+        if not self.get_nodeattr("runtime_writeable_weights"):
+            weights = model.get_initializer(self.onnx_node.input[1])
+            w_min = weights.min()
+            w_max = weights.max()
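+            # pick the smallest datatype covering both extremes: when w_min is
+            # negative but |w_min| <= w_max, querying get_smallest_possible with
+            # -w_max - 1 forces a signed type wide enough to also hold w_max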
+            if w_min < 0:
+                if abs(w_min) > w_max:
+                    wdt = DataType.get_smallest_possible(w_min)
+                else:
+                    wdt = DataType.get_smallest_possible(-w_max - 1)
+            else:
+                wdt = DataType.get_smallest_possible(w_max)
+            self.set_nodeattr("weightDataType", wdt.name)
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
+        """Convert the original numpy threshold matrix orig_thres_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure Channels % PE == 0
+        * for bipolar weights & inputs, ensure thresholds are positive
+        * interleave rows between PEs
+        * reshape into (PE, TMEM, n_thres_steps) and return
+        """
         ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
         tmem = self.calc_tmem()
@@ -446,6 +619,29 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
         rows between PEs is not as expected (n_thres_steps)"""
         return ret.reshape(1, pe, tmem, n_thres_steps)

+    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        ch = self.get_nodeattr("Channels")
+        k_h, k_w = self.get_nodeattr("Kernel")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            ch,
+            1,
+            k_h,
+            k_w,
+        ), """Weights matrix doesn't
+        have expected shape (channels, 1, kernel_size, kernel_size)"""
+        ret = orig_weight_matrix
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
+            # convert bipolar to binary
+            ret = (ret + 1) / 2
+        ret = ret.reshape(ch, k_h * k_w)
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        ret = ret.reshape(1, pe, wmem, simd)
+        return ret
+
     def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         """Produce a file containing given weights in appropriate format for this
         layer. This file can be used for either synthesis or run-time reconfig
@@ -623,384 +819,44 @@ def generate_params(self, model, path):
             f_thresh.write(thresholds_hls_code)
             f_thresh.close()

-    def execute_node(self, context, graph):
-        mode = self.get_nodeattr("exec_mode")
-        mem_mode = self.get_nodeattr("mem_mode")
-        node = self.onnx_node
-
-        # TODO ensure codegen dir exists
-        if mode == "cppsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        elif mode == "rtlsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        else:
-            raise Exception(
-                """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - # the third input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for VectorVectorActivation") - in_ind += 1 - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - # reinterpret binary output as bipolar where needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == self.get_normal_output_shape() - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - - if mem_mode == "external" or mem_mode == "decoupled": - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - # we have converted bipolar weights to binary for export, - # so use it as such for weight generation - if self.get_weight_datatype() == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) - dim_h, dim_w = self.get_nodeattr("Dim") - num_w_reps = dim_h * dim_w - - io_dict = { - "inputs": {"in0": inp, "weights": wei * num_w_reps}, - "outputs": {"out": []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] - else: - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] - self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode not in ["const", "decoupled", "external"]: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - if self.calc_tmem() != 0: - self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] - - def defines(self, var): - dim_h, dim_w = self.get_nodeattr("Dim") - numReps = 1 * dim_h * dim_w + def get_op_and_param_counts(self): k_h, k_w = self.get_nodeattr("Kernel") - innerProdDim = k_h * k_w - mem_mode = self.get_nodeattr("mem_mode") - - self.code_gen_dict["$DEFINES$"] = [ - """#define Channels1 {}\n #define InnerProdDim {}\n - #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format( - self.get_nodeattr("Channels"), - innerProdDim, - self.get_nodeattr("SIMD"), - self.get_nodeattr("PE"), - numReps, - ) - ] - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - # note: the innermost dim is reversed for the input - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - elem_bits = wdt.bitwidth() - packed_bits = self.get_weightstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = wdt.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/weights.npy" % code_gen_dir - - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - if mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> weights_{} ("weights_{}");'.format( - self.get_weightstream_width(), self.hls_sname(), self.hls_sname() - ) - ) + fm = self.get_nodeattr("Channels") + dim_h, dim_w = self.get_nodeattr("Dim") + weight_bits = self.get_weight_datatype().bitwidth() + inp_bits = self.get_input_datatype().bitwidth() + 
num_repetitions = int(dim_h * dim_w)
+        mac_count = k_h * k_w * fm * num_repetitions
+        # canonicalize op type: highest bitwidth operand first s.t.
+        # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
+        bw1 = min(inp_bits, weight_bits)
+        bw2 = max(inp_bits, weight_bits)
+        mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
+        weight_param_type = "param_weight_%db" % (weight_bits)
+        weight_count = k_h * k_w * fm
+        ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
+        if self.get_nodeattr("noActivation") == 0:
+            tdt = DataType[self.get_nodeattr("accDataType")]
+            thres_bits = tdt.bitwidth()
+            thres_param_type = "param_threshold_%db" % (thres_bits)
+            thres_count = fm
+            ret_dict[thres_param_type] = thres_count
+        return ret_dict
-    def docompute(self):
-        mem_mode = self.get_nodeattr("mem_mode")
-        map_to_hls_mult_style = {
-            "auto": "ap_resource_dflt()",
-            "lut": "ap_resource_lut()",
-            "dsp": "ap_resource_dsp()",
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
         }
-        tmpl_args = self.get_template_param_values()
-        if self.calc_tmem() == 0:
-            odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
-            threshs = "PassThroughActivation<%s>()" % odtype_hls_str
-        else:
-            threshs = "threshs"
-
-        if mem_mode == "const":
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}>
-                (in0_{}, out_{}, weights, {}, numReps, {});""".format(
-                    tmpl_args["TSrcI"],
-                    tmpl_args["TDstI"],
-                    tmpl_args["TWeightI"],
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    threshs,
-                    map_to_hls_mult_style[self.get_nodeattr("resType")],
-                )
-            ]
-        elif mem_mode == "decoupled" or mem_mode == "external":
-            wdt = self.get_weight_datatype()
-            if wdt == DataType["BIPOLAR"]:
-                export_wdt = DataType["BINARY"]
-            else:
-                export_wdt = wdt
-            wdtype_hls_str = export_wdt.get_hls_datatype_str()
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<Channels1, InnerProdDim, SIMD1, PE1, {}, {}, {}, {}>
-                (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format(
-                    "Vector_Vector_Activate_Stream_Batch",
-                    tmpl_args["TSrcI"],
-                    tmpl_args["TDstI"],
-                    tmpl_args["TWeightI"],
-                    wdtype_hls_str,
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    threshs,
-                    map_to_hls_mult_style[self.get_nodeattr("resType")],
-                )
-            ]
-        else:
-            raise Exception(
-                """Please set mem_mode to "const", "decoupled", or "external",
-                currently no other parameter value is supported!"""
-            )
-
-    def dataoutstrm(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_output_datatype()
-        if dtype == DataType["BIPOLAR"]:
-            # use binary for bipolar storage
-            dtype = DataType["BINARY"]
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_outstream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "float"
-        npy_out = "%s/output.npy" % code_gen_dir
-        shape = self.get_folded_output_shape()
-        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
-
-        # note: the innermost dim is not reversed for the output
-        self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                self.hls_sname(),
-                shape_cpp_str,
-                npy_out,
-            )
-        ]
-
-    def save_as_npy(self):
-        self.code_gen_dict["$SAVEASCNPY$"] = []
-
-    def blackboxfunction(self):
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "const":
-            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- 
"""void {}(hls::stream> &in0_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}( - hls::stream> &in0_{}, - hls::stream> &weights_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_weightstream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - else: - raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" - ) - - def pragmas(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - if mem_mode == "const": - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") - ) - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() - ) - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or external, - currently no other parameter value is supported!""" - ) - - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") - ) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") - ) - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") - sname = self.hls_sname() - if mem_mode == "external": - intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "decoupled": - # only expose axilite interface if attribute is set - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - if runtime_writable: - intf_names["axilite"] = ["s_axilite"] - return intf_names + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) def code_generation_ipi(self): cmd = [] @@ -1108,207 +964,4 @@ def code_generation_ipi(self): return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for VectorVectorActivation") - return cmd - - def uram_estimation(self): - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - mem_width = Q * W * P - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle != "ultra") - or (mmode == "const") - or (mmode == "external") - ): - return 0 - 
width_multiplier = math.ceil(mem_width / 72) - depth_multiplier = math.ceil(omega / 4096) - return width_multiplier * depth_multiplier - - def bram_estimation(self): - """Calculates resource estimation for BRAM""" - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - mem_width = Q * W * P - # assuming SDP mode RAMB18s (see UG573 Table 1-10) - # since this is HLS memory, not using the full width of a BRAM - # assuming memories up to 128 deep get implemented in LUTs - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) - or (mstyle == "auto" and self.calc_wmem() <= 128) - or (mmode == "const" and self.calc_wmem() <= 128) - or (mmode == "external") - ): - return 0 - - if mem_width == 1: - return math.ceil(omega / 16384) - elif mem_width == 2: - return math.ceil(omega / 8192) - elif mem_width <= 4: - return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) - elif mem_width <= 9: - return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8)) - elif mem_width <= 18 or omega > 512: - return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16)) - else: - return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32)) - - def bram_efficiency_estimation(self): - P = self.get_nodeattr("PE") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - bram16_est = self.bram_estimation() - if bram16_est == 0: - return 1 - wbits = W * P * omega - bram16_est_capacity = bram16_est * 36 * 512 - return wbits / bram16_est_capacity - - def lut_estimation(self): - """Calculates resource estimations for LUTs based on: - - FINN-R: An End-to-End Deep-Learning Framework for Fast - Exploration of Quantized Neural Networks - - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, - Y. Umuroglu, M. Leeser and K. Vissers - - 12. Sep 2018 - """ - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - # determine tdt with input and weight data types - idt = self.get_input_datatype() - A = idt.bitwidth() - # parameters from experiments in paper mentioned above - c0 = 300 - c1 = 1.1 - c2 = 0 - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if (mmode == "decoupled" and mstyle == "distributed") or ( - mmode == "const" and self.calc_wmem() <= 128 - ): - c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) - - # multiplication - res_type = self.get_nodeattr("resType") - if res_type == "dsp": - mult_luts = 0 - else: - mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) - # adder tree - addertree_luts = (W + A) * (2 * Q - 1) - # accumulator - acc_datatype = self.get_accumulator_datatype() - acc_bits = acc_datatype.bitwidth() - k_h, k_w = self.get_nodeattr("Kernel") - # if accDataType is not set, then it will default to INT32, which would - # be a large overestimate in most (if not all) cases. 
In this scenario, - # we would use the minimum accumulator as determined by the data types - # bound, derived in https://arxiv.org/abs/2301.13376 - alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed()) - acc_bits = min( - acc_datatype.bitwidth(), - np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), - ) - acc_luts = acc_bits - # thresholds and threshold comparators - thr_luts = 0 - comp_luts = 0 - noact = self.get_nodeattr("noActivation") - # TODO - add 'ram_style_threshold' node attribute - if noact == 0: - odt = self.get_output_datatype() - B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64 - comp_luts = (2**B - 1) * acc_bits - - return int( - c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 - ) - - def dsp_estimation(self): - # multiplication - P = self.get_nodeattr("PE") - res_type = self.get_nodeattr("resType") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - idt = self.get_input_datatype() - A = idt.bitwidth() - if res_type == "dsp": - mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling - else: - mult_dsp = 0 - return int(mult_dsp) - - def get_weightstream_width(self): - """Returns weight stream width. Used only in decoupled mode.""" - if ( - self.get_nodeattr("mem_mode") == "decoupled" - or self.get_nodeattr("mem_mode") == "external" - ): - simd = self.get_nodeattr("SIMD") - pe = self.get_nodeattr("PE") - wp = self.get_weight_datatype().bitwidth() - w_width = simd * pe * wp - return w_width - else: - return 0 - - def get_weightstream_width_padded(self): - """Returns weight stream width padded to a multiple of 8. This is required - by the AXI Stream spec. Used in decoupled mode.""" - weight_width = self.get_weightstream_width() - return roundup_to_integer_multiple(weight_width, 8) - - def get_op_and_param_counts(self): - k_h, k_w = self.get_nodeattr("Kernel") - fm = self.get_nodeattr("Channels") - dim_h, dim_w = self.get_nodeattr("Dim") - weight_bits = self.get_weight_datatype().bitwidth() - inp_bits = self.get_input_datatype().bitwidth() - num_repetitions = int(dim_h * dim_w) - mac_count = k_h * k_w * fm * num_repetitions - # cannonicalize op type: highest bitwidth operand first s.t. - # e.g. 
mac_8bx4b and mac_4bx8b don't appear as two different op types
-        bw1 = min(inp_bits, weight_bits)
-        bw2 = max(inp_bits, weight_bits)
-        mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
-        weight_param_type = "param_weight_%db" % (weight_bits)
-        weight_count = k_h * k_w * fm
-        ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
-        if self.get_nodeattr("noActivation") == 0:
-            tdt = DataType[self.get_nodeattr("accDataType")]
-            thres_bits = tdt.bitwidth()
-            thres_param_type = "param_threshold_%db" % (thres_bits)
-            thres_count = fm
-            ret_dict[thres_param_type] = thres_count
-        return ret_dict
-
-    def derive_characteristic_fxns(self, period):
-        n_inps = np.prod(self.get_folded_input_shape()[:-1])
-        io_dict = {
-            "inputs": {
-                "in0": [0 for i in range(n_inps)],
-            },
-            "outputs": {"out": []},
-        }
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode in ["decoupled", "external"]:
-            n_weight_inps = self.calc_wmem()
-            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
-            io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)]
-        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
+        return cmd
\ No newline at end of file
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index d1d61f0ed5..26cd0b74ad 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -1280,3 +1280,418 @@ def apply(self, model):
                 graph_modified = True
         return (model, graph_modified)
+
+class InferBinaryMatrixVectorActivation(Transformation):
+    """Convert XnorPopcountMatMul layers to
+    MatrixVectorActivation layers. Any immediately following MultiThreshold
+    layers will also be absorbed into the MVTU."""
+
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "XnorPopcountMatMul":
+                mm_input = n.input[0]
+                mm_weight = n.input[1]
+                mm_output = n.output[0]
+                mm_in_shape = model.get_tensor_shape(mm_input)
+                mm_out_shape = model.get_tensor_shape(mm_output)
+                assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], (
+                    n.name
+                    + """: First
+                input for xnorpopcount is not set to FINN DataType BINARY."""
+                )
+                assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], (
+                    n.name
+                    + """: Second
+                input (weights) for xnorpopcount is not set to FINN DataType BINARY."""
+                )
+                idt = DataType["BINARY"]
+                wdt = DataType["BINARY"]
+                mm_output = n.output[0]
+                W = model.get_initializer(mm_weight)
+                # extract weight shape, note that ONNX and finn-hlslib
+                # make different assumptions about dim order here
+                # ONNX assumes W has (in, out) shape
+                # finn-hlslib assumes W has (out, in) shape
+                mh = int(W.shape[1])
+                mw = int(W.shape[0])
+                # create node with no parallelization first
+                pe = 1
+                simd = 1
+                wmem = mw * mh // (pe * simd)
+                assert mw * mh == wmem * pe * simd, (
+                    n.name
+                    + """: Requirement (MW * MH) divisible by
+                (WMEM * PE * SIMD) is violated."""
+                )
+                # see if we have any following thresholds
+                consumer = model.find_consumer(mm_output)
+                if consumer is not None and consumer.op_type == "MultiThreshold":
+                    # TODO ensure integer thresholds?
+                    # create MVTU (i.e. 
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index d1d61f0ed5..26cd0b74ad 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -1280,3 +1280,418 @@ def apply(self, model):
                 graph_modified = True
         return (model, graph_modified)
 
+
+class InferBinaryMatrixVectorActivation(Transformation):
+    """Convert XnorPopcountMatMul layers to
+    MatrixVectorActivation layers. Any immediately following MultiThreshold
+    layers will also be absorbed into the MVTU."""
+
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "XnorPopcountMatMul":
+                mm_input = n.input[0]
+                mm_weight = n.input[1]
+                mm_output = n.output[0]
+                mm_in_shape = model.get_tensor_shape(mm_input)
+                mm_out_shape = model.get_tensor_shape(mm_output)
+                assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], (
+                    n.name
+                    + """: First
+                input for xnorpopcount is not set to FINN DataType BINARY."""
+                )
+                assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], (
+                    n.name
+                    + """: Second
+                input (weights) for xnorpopcount is not set to FINN DataType BINARY."""
+                )
+                idt = DataType["BINARY"]
+                wdt = DataType["BINARY"]
+                mm_output = n.output[0]
+                W = model.get_initializer(mm_weight)
+                # extract weight shape, note that ONNX and finn-hlslib
+                # make different assumptions about dim order here
+                # ONNX assumes W has (in, out) shape
+                # finn-hlslib assumes W has (out, in) shape
+                mh = int(W.shape[1])
+                mw = int(W.shape[0])
+                # create node with no parallelization first
+                pe = 1
+                simd = 1
+                wmem = mw * mh // (pe * simd)
+                assert mw * mh == wmem * pe * simd, (
+                    n.name
+                    + """: Requirement (MW * MH) divisible by
+                (WMEM * PE * SIMD) is violated."""
+                )
+                # see if we have any following thresholds
+                consumer = model.find_consumer(mm_output)
+                if consumer is not None and consumer.op_type == "MultiThreshold":
+                    # TODO ensure integer thresholds?
+                    # create MVTU (i.e. including activation)
+                    mt_output = consumer.output[0]
+                    mt_out_shape = model.get_tensor_shape(mt_output)
+                    mt_thres = consumer.input[1]
+                    T = model.get_initializer(mt_thres)
+                    assert T.shape[0] == 1 or T.shape[0] == mh, (
+                        consumer.name
+                        + """: First dimension of
+                    thresholds neither 1 nor MH."""
+                    )
+                    odt = model.get_tensor_datatype(mt_output)
+                    if odt.bitwidth() == 1:
+                        # covers both bipolar and binary
+                        actval = 0
+                    else:
+                        actval = odt.min()
+                    model.set_tensor_shape(mm_input, mm_in_shape)
+                    model.set_tensor_shape(mt_output, mt_out_shape)
+                    # create and insert new MatrixVectorActivation node
+                    new_node = helper.make_node(
+                        "MatrixVectorActivation",
+                        [mm_input, mm_weight, mt_thres],
+                        [mt_output],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        MW=mw,
+                        MH=mh,
+                        SIMD=simd,
+                        PE=pe,
+                        inputDataType=idt.name,
+                        weightDataType=wdt.name,
+                        outputDataType=odt.name,
+                        ActVal=actval,
+                        binaryXnorMode=1,
+                        noActivation=0,
+                        numInputVectors=list(mm_in_shape[:-1]),
+                        mem_mode=self.mem_mode,
+                        name=n.name,
+                    )
+                    graph.node.insert(node_ind, new_node)
+                    # remove old nodes
+                    graph.node.remove(n)
+                    graph.node.remove(consumer)
+                    graph_modified = True
+                else:
+                    # no activation, matmul only
+                    odt = model.get_tensor_datatype(mm_output)
+                    model.set_tensor_shape(mm_input, mm_in_shape)
+                    model.set_tensor_shape(mm_output, mm_out_shape)
+                    # create and insert new MatrixVectorActivation node
+                    new_node = helper.make_node(
+                        "MatrixVectorActivation",
+                        [mm_input, mm_weight],
+                        [mm_output],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        MW=mw,
+                        MH=mh,
+                        SIMD=simd,
+                        PE=pe,
+                        inputDataType=idt.name,
+                        weightDataType=wdt.name,
+                        outputDataType=odt.name,
+                        ActVal=0,
+                        binaryXnorMode=1,
+                        noActivation=1,
+                        numInputVectors=list(mm_in_shape[:-1]),
+                        mem_mode=self.mem_mode,
+                        name=n.name,
+                    )
+                    graph.node.insert(node_ind, new_node)
+                    # remove old node
+                    graph.node.remove(n)
+                    graph_modified = True
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
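One detail of the transform above that is easy to miss: ActVal records the bias applied to the threshold count, i.e. the smallest value the thresholding unit can emit. A condensed restatement of that branch (a sketch over a qonnx-style DataType object):

    def choose_actval(odt):
        # 1-bit outputs (covers both BINARY and BIPOLAR) need no offset;
        # wider outputs start counting thresholds from the datatype minimum
        return 0 if odt.bitwidth() == 1 else odt.min()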
+
+class InferQuantizedMatrixVectorActivation(Transformation):
+    """Convert MatMul layers with quantized inputs and weights to
+    MatrixVectorActivation layers."""
+
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None:
+                mm_input = n.input[0]
+                mm_weight = n.input[1]
+                mm_output = n.output[0]
+                mm_in_shape = model.get_tensor_shape(mm_input)
+                mm_out_shape = model.get_tensor_shape(mm_output)
+                idt = model.get_tensor_datatype(mm_input)
+                wdt = model.get_tensor_datatype(mm_weight)
+                if idt.is_integer() and wdt.is_integer():
+                    mm_output = n.output[0]
+                    W = model.get_initializer(mm_weight)
+                    # extract weight shape, note that ONNX and finn-hlslib
+                    # make different assumptions about dim order here
+                    # ONNX assumes W has (in, out) shape
+                    # finn-hlslib assumes W has (out, in) shape
+                    mh = int(W.shape[1])
+                    mw = int(W.shape[0])
+                    # create node with no parallelization first
+                    pe = 1
+                    simd = 1
+                    wmem = mw * mh // (pe * simd)
+                    assert mw * mh == wmem * pe * simd, (
+                        n.name
+                        + """: Requirement (MW * MH) divisible by
+                    (WMEM * PE * SIMD) is violated."""
+                    )
+                    # see if we have any following thresholds
+                    consumer = model.find_consumer(mm_output)
+                    if consumer is not None and consumer.op_type == "MultiThreshold":
+                        # TODO ensure integer thresholds?
+                        # create MVTU (i.e. including activation)
+                        mt_output = consumer.output[0]
+                        mt_out_shape = model.get_tensor_shape(mt_output)
+                        mt_thres = consumer.input[1]
+                        T = model.get_initializer(mt_thres)
+                        assert T.shape[0] == 1 or T.shape[0] == mh, (
+                            consumer.name
+                            + """: First dimension of
+                        thresholds neither 1 nor MH."""
+                        )
+                        odt = model.get_tensor_datatype(mt_output)
+                        scale = getCustomOp(consumer).get_nodeattr("out_scale")
+                        actval = getCustomOp(consumer).get_nodeattr("out_bias")
+                        assert int(actval) == actval, (
+                            consumer.name + ": out_bias must be integer for HLS conversion."
+                        )
+                        actval = int(actval)
+                        odt_is_bipolar = odt == DataType["BIPOLAR"]
+                        bipolar_ok = odt_is_bipolar and (scale == 2.0) and (actval == -1)
+                        assert scale == 1.0 or bipolar_ok, (
+                            consumer.name + ": out_scale=1 or bipolar output needed for conversion."
+                        )
+                        assert (not odt.signed()) or (actval < 0), (
+                            consumer.name + ": Signed output requires actval < 0"
+                        )
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mt_output, mt_out_shape)
+                        if bipolar_ok:
+                            # remove bias for bipolar, since
+                            # binary->bipolar is achieved by reinterpretation
+                            actval = 0
+                        # create and insert new MatrixVectorActivation node
+                        new_node = helper.make_node(
+                            "MatrixVectorActivation",
+                            [mm_input, mm_weight, mt_thres],
+                            [mt_output],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            MW=mw,
+                            MH=mh,
+                            SIMD=simd,
+                            PE=pe,
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=actval,
+                            binaryXnorMode=0,
+                            noActivation=0,
+                            numInputVectors=list(mm_in_shape[:-1]),
+                            mem_mode=self.mem_mode,
+                            name="MatrixVectorActivation_" + n.name,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old nodes
+                        graph.node.remove(n)
+                        graph.node.remove(consumer)
+                        graph_modified = True
+                    else:
+                        # no activation, matmul only
+                        odt = model.get_tensor_datatype(mm_output)
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mm_output, mm_out_shape)
+                        # create and insert new MatrixVectorActivation node
+                        new_node = helper.make_node(
+                            "MatrixVectorActivation",
+                            [mm_input, mm_weight],
+                            [mm_output],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            MW=mw,
+                            MH=mh,
+                            SIMD=simd,
+                            PE=pe,
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=0,
+                            binaryXnorMode=0,
+                            noActivation=1,
+                            numInputVectors=list(mm_in_shape[:-1]),
+                            mem_mode=self.mem_mode,
+                            name="MatrixVectorActivation_" + n.name,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old node
+                        graph.node.remove(n)
+                        graph_modified = True
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
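Typical usage of the two MatMul conversions (a sketch: the input model path and the mem_mode choice are assumptions; the import path matches the file header above):

    from qonnx.core.modelwrapper import ModelWrapper
    import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw

    model = ModelWrapper("streamlined.onnx")  # hypothetical streamlined model
    model = model.transform(to_hw.InferBinaryMatrixVectorActivation(mem_mode="decoupled"))
    model = model.transform(to_hw.InferQuantizedMatrixVectorActivation(mem_mode="decoupled"))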
+
+class InferVectorVectorActivation(Transformation):
+    """Convert MatMul layers with quantized inputs and weights to
+    VectorVectorActivation layers, if the sparsity annotation
+    of the weight matrix indicates that the MatMul layer belongs to
+    a depthwise convolution. Any immediately following MultiThreshold
+    layers will also be absorbed into the VVAU."""
+
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is not None:
+                sparsity = model.get_tensor_sparsity(n.input[1])
+                try:
+                    k_h, k_w = sparsity["dw"]["kernel_shape"]
+                except KeyError:
+                    raise Exception(
+                        n.name
+                        + """: sparsity annotation doesn't indicate that MatMul
+                        belongs to a depthwise convolution."""
+                    )
+
+                mm_input = n.input[0]
+                mm_weight = n.input[1]
+                mm_output = n.output[0]
+                mm_in_shape = model.get_tensor_shape(mm_input)
+                mm_out_shape = model.get_tensor_shape(mm_output)
+                idt = model.get_tensor_datatype(mm_input)
+                wdt = model.get_tensor_datatype(mm_weight)
+                if idt.is_integer() and wdt.is_integer():
+                    mm_output = n.output[0]
+                    W = model.get_initializer(mm_weight)
+                    # infer dense weight tensor from sparse weight matrix, using
+                    # the kernel size (k_h, k_w) extracted above and the number
+                    # of channels.
+                    # the weight matrix has a shape of (k_h * k_w * Channels, Channels)
+                    # we need to reverse the creation of the sparse weight matrix
+                    # to achieve a weight tensor of shape (Channels, 1, k_h, k_w)
+                    channels = int(W.shape[1])
+                    # transpose to achieve a shape of (Channels, k_h * k_w * Channels)
+                    W = W.T
+                    # reshape to (Channels, k_h, k_w, Channels) to transpose afterwards
+                    # to (Channels, Channels, k_h, k_w)
+                    W = W.reshape(channels, k_h, k_w, channels)
+                    W = W.transpose(0, 3, 1, 2)
+                    # now we can extract the values using a for loop over the channels
+                    # and fill a zero numpy array in the correct shape
+                    w_tensor = np.zeros((channels, 1, k_h, k_w), dtype=np.float32)
+                    for ch in range(channels):
+                        w_tensor[ch][0] = W[ch][ch]
+                    model.set_initializer(mm_weight, w_tensor)
+                    model.set_tensor_shape(mm_weight, (channels, 1, k_h, k_w))
+                    # create node with pe=channels as default
+                    pe = channels
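The slicing above is easiest to validate by running the lowering forwards: in the sparse (k_h * k_w * Channels, Channels) matrix implied by the inversion code, channel ch only populates column ch, at rows idx * Channels + ch. A self-contained round-trip check (layout derived from the inversion code above, not quoted from FINN):

    import numpy as np

    channels, k_h, k_w = 3, 2, 2
    w_dw = np.random.rand(channels, 1, k_h, k_w).astype(np.float32)

    # forward lowering: depthwise kernel -> sparse matmul weight matrix
    W = np.zeros((k_h * k_w * channels, channels), dtype=np.float32)
    for ch in range(channels):
        W[ch::channels, ch] = w_dw[ch, 0].ravel()

    # inversion, exactly as in InferVectorVectorActivation above
    Wt = W.T.reshape(channels, k_h, k_w, channels).transpose(0, 3, 1, 2)
    w_back = np.stack([Wt[ch, ch] for ch in range(channels)])[:, None]
    assert np.allclose(w_back, w_dw)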
+                    # see if we have any following thresholds
+                    consumer = model.find_consumer(mm_output)
+                    if consumer is not None and consumer.op_type == "MultiThreshold":
+                        # create VVAU (i.e. including activation)
+                        mt_output = consumer.output[0]
+                        mt_out_shape = model.get_tensor_shape(mt_output)
+                        mt_thres = consumer.input[1]
+                        T = model.get_initializer(mt_thres)
+                        assert T.shape[0] == 1 or T.shape[0] == channels, (
+                            consumer.name
+                            + """: First dimension of
+                        thresholds neither 1 nor Channels."""
+                        )
+                        odt = model.get_tensor_datatype(mt_output)
+                        scale = getCustomOp(consumer).get_nodeattr("out_scale")
+                        assert scale == 1.0, (
+                            consumer.name + ": out_scale must be equal to 1.0 for HLS conversion."
+                        )
+                        actval = getCustomOp(consumer).get_nodeattr("out_bias")
+                        assert int(actval) == actval, (
+                            consumer.name + ": out_bias must be integer for HLS conversion."
+                        )
+                        actval = int(actval)
+                        assert (not odt.signed()) or (actval < 0), (
+                            consumer.name + ": Signed output requires actval < 0"
+                        )
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mt_output, mt_out_shape)
+                        # create and insert new VectorVectorActivation node
+                        new_node = helper.make_node(
+                            "VectorVectorActivation",
+                            [mm_input, mm_weight, mt_thres],
+                            [mt_output],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            resType="lut",
+                            PE=pe,
+                            Dim=[mm_in_shape[1], mm_in_shape[2]],
+                            Channels=channels,
+                            Kernel=[k_h, k_w],
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=actval,
+                            noActivation=0,
+                            name="VectorVectorActivation_" + n.name,
+                            mem_mode=self.mem_mode,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old nodes
+                        graph.node.remove(n)
+                        graph.node.remove(consumer)
+                        graph_modified = True
+                    else:
+                        # no activation, matmul only
+                        odt = model.get_tensor_datatype(mm_output)
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mm_output, mm_out_shape)
+                        # create and insert new VVAU node
+                        new_node = helper.make_node(
+                            "VectorVectorActivation",
+                            [mm_input, mm_weight],
+                            [mm_output],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            resType="lut",
+                            PE=pe,
+                            Dim=[mm_in_shape[1], mm_in_shape[2]],
+                            Channels=channels,
+                            Kernel=[k_h, k_w],
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=0,
+                            noActivation=1,
+                            name="VectorVectorActivation_" + n.name,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old node
+                        graph.node.remove(n)
+                        graph_modified = True
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
\ No newline at end of file
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 1a182c7f4f..81c5848d57 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -48,12 +48,13 @@ def is_external_input(model, node, i):
     # True only if input is unconnected and has no initializer
     # Only exception is second input of FC layers when mem_mode is external
     node_inst = getCustomOp(node)
+    op_type = node_inst.base_op_type()
     producer = model.find_producer(node.input[i])
     if producer is None:
         if model.get_initializer(node.input[i]) is None:
             return True
         else:
-            if node.op_type == "MatrixVectorActivation":
+            if op_type == "MatrixVectorActivation":
                 if node_inst.get_nodeattr("mem_mode") == "external":
                     return True
     return False
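From here on, nearly every hunk swaps a node.op_type string comparison for node_inst.base_op_type(), so the checks keep matching once layers are specialized into _hls or _rtl variants. The helper itself is defined outside this diff; a minimal sketch of the idea, assuming the <Base>_hls / <Base>_rtl naming used elsewhere in this PR:

    class HWCustomOpSketch:
        # hypothetical stand-in for the FINN custom-op base class
        def __init__(self, onnx_node):
            self.onnx_node = onnx_node

        def base_op_type(self):
            # strip a backend suffix: MatrixVectorActivation_hls,
            # MatrixVectorActivation_rtl and the plain hw-level op
            # all report "MatrixVectorActivation"
            op_type = self.onnx_node.op_type
            for suffix in ("_hls", "_rtl"):
                if op_type.endswith(suffix):
                    return op_type[: -len(suffix)]
            return op_type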
diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py
index ceb2bdb5c9..56e644f2b8 100644
--- a/src/finn/transformation/fpgadataflow/floorplan.py
+++ b/src/finn/transformation/fpgadataflow/floorplan.py
@@ -150,7 +150,7 @@ def apply(self, model):
                 continue
 
             elif not (
-                node.op_type == "MatrixVectorActivation"
+                node_inst.base_op_type() == "MatrixVectorActivation"
                 and node_inst.get_nodeattr("mem_mode") is not None
                 and node_inst.get_nodeattr("mem_mode") == "external"
             ):
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 81cee8dae4..d0029cb630 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -88,7 +88,7 @@ def apply(self, model):
                         # - if FC and external mem, it could be connected to input 1
                         # - if concat, could be connected to any input
                         if (
-                            consumer.op_type == "MatrixVectorActivation"
+                            n1.base_op_type() == "MatrixVectorActivation"
                             and n1.get_nodeattr("mem_mode") == "external"
                         ) or (consumer.op_type == "StreamingConcat"):
                             # get input idx
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index 93e3226b2a..fd546459fa 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -199,7 +199,7 @@ def apply(self, model):
         # attached IODMA
         fc_extw_nodes = list(
             filter(
-                lambda x: x.op_type in ["MatrixVectorActivation", "VectorVectorActivation"]
+                lambda x: getCustomOp(x).base_op_type() in ["MatrixVectorActivation", "VectorVectorActivation"]
                 and getCustomOp(x).get_nodeattr("mem_mode") == "external"
                 and model.find_producer(x.input[1]) is None,
                 all_nodes,
diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
index 157df46d71..ab5142e4d8 100644
--- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
+++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
@@ -103,7 +103,7 @@ def apply(self, model):
             # the input is in the list of graph inputs because it has an
             # initializer (TODO: fix this with a clean-up transform)
             if (
-                first_node.op_type == "MatrixVectorActivation"
+                getCustomOp(first_node).base_op_type() == "MatrixVectorActivation"
                 and get_by_name(first_node.attribute, "mem_mode").s.decode("UTF-8")
                 != "external"
             ):
@@ -117,7 +117,7 @@ def apply(self, model):
             num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1])
             inp_idx = list(first_node.input).index(graph_in_name)
             if inp_idx > 0:
-                if first_node.op_type == "MatrixVectorActivation" and inp_idx == 1:
+                if getCustomOp(first_node).base_op_type() == "MatrixVectorActivation" and inp_idx == 1:
                     stream_width = int(custom_op.get_weightstream_width())
                 elif first_node.op_type == "AddStreams_Batch" and inp_idx == 1:
                     stream_width = int(custom_op.get_instream_width())
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index d5c2d8f2b5..e66236bf39 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -282,7 +282,7 @@ def apply(self, model):
     dataflow_model = ModelWrapper(dataflow_model_filename)
     rt_layer_ind = 0
     for node in dataflow_model.graph.node:
-        if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]:
+        if getCustomOp(node).base_op_type() == "MatrixVectorActivation" or node.op_type == "Thresholding_Batch":
             node_inst = getCustomOp(node)
             is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights")
             if is_rt_weights == 1:
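The long conditions above and below mix two dispatch styles: base_op_type() for the MVAU and plain op_type for the not-yet-migrated Thresholding_Batch. A hypothetical helper that would keep those call sites short (a sketch, not part of the diff):

    from qonnx.custom_op.registry import getCustomOp

    def is_runtime_weight_candidate(node):
        # MVAU in any specialization, plus the legacy Thresholding_Batch op
        return (
            getCustomOp(node).base_op_type() == "MatrixVectorActivation"
            or node.op_type == "Thresholding_Batch"
        )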
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index 989eb62a88..193e6e8b42 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -62,7 +62,7 @@ def collect_ip_dirs(model, ipstitch_path):
         ), """The directory that should contain the generated ip blocks
         doesn't exist."""
         ip_dirs += [ip_dir_value]
-        if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]:
+        if getCustomOp(node).base_op_type() == "MatrixVectorActivation" or node.op_type == "Thresholding_Batch":
             if node_inst.get_nodeattr("mem_mode") == "decoupled":
                 need_memstreamer = True
     ip_dirs += [ipstitch_path + "/ip"]
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 11ffc965b6..84a8084832 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -174,7 +174,7 @@ def apply(self, model):
                 continue
             if fifo_cons is None:
                 continue
-            if fifo_cons.op_type != "MatrixVectorActivation":
+            if getCustomOp(fifo_cons).base_op_type() != "MatrixVectorActivation":
                 continue
             op_inst = getCustomOp(node)
             depth = op_inst.get_nodeattr("depth")
@@ -281,7 +281,7 @@ def apply(self, model):
                 node.set_nodeattr("inFIFODepths", ifd)
                 node.set_nodeattr("outFIFODepths", ofd)
 
-                if node.onnx_node.op_type in extw_optypes:
+                if node.base_op_type() in extw_optypes:
                     mmode = node.get_nodeattr("mem_mode")
                     if mmode == "external":
                         modified_fc_nodes.append(node.onnx_node.name)
@@ -422,7 +422,7 @@ def apply(self, model):
             # (removed setting of node FIFO size attributes to 0 here)
             # for every extw node we changed from external to decoupled,
             # change back and reset implementation
-            if node.op_type in extw_optypes:
+            if getCustomOp(node).base_op_type() in extw_optypes:
                if node.name in modified_fc_nodes:
                     node_inst = getCustomOp(node)
                     node_inst.set_nodeattr("mem_mode", "external")
diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index 4045a28e16..7b65023abc 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -125,7 +125,7 @@ def apply(self, model):
                 continue
             op_type = node.op_type
             node_inst = getCustomOp(node)
-            if op_type == "MatrixVectorActivation":
+            if node_inst.base_op_type() == "MatrixVectorActivation":
                 max_simd = node_inst.get_nodeattr("MW")
                 max_pe = node_inst.get_nodeattr("MH")
                 node_inst.set_nodeattr("PE", 1)
diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py
index b80ef76a19..bd283855e3 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py
@@ -52,6 +52,9 @@
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames, GiveReadableTensorNames
+from qonnx.transformation.infer_shapes import InferShapes
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 
 
 def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None):
@@ -135,6 +138,87 @@ def prepare_inputs(input_tensor, idt, wdt):
     return {"inp": input_tensor}
 
 
+# activation: None or DataType
+@pytest.mark.parametrize("act", [None, DataType["BIPOLAR"], DataType["INT4"]])
+# weight datatype
+@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["INT4"]])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT4"]])
+# neuron folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [-1, 2, 1])
+# synapse folding, -1 is maximum possible
+@pytest.mark.parametrize("sf", [-1, 2, 1])
+# HLS matrix width (input features)
+@pytest.mark.parametrize("mw", [16])
+# HLS matrix height (output features)
+@pytest.mark.parametrize("mh", [16])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_fclayer_hwop(idt, wdt, act, nf, sf, mw, mh):
+    if nf == -1:
+        nf = mh
+    if sf == -1:
+        sf = mw
+    pe = mh // nf
+    simd = mw // sf
+    assert mh % pe == 0
+    assert mw % sf == 0
+    # generate weights
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+    # generate input data
+    x = gen_finn_dt_tensor(idt, (1, mw))
+    if act is None:
+        # no activation, produce accumulators
+        T = None
+        tdt = None
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            odt = DataType["UINT32"]
+        else:
+            odt = DataType["INT32"]
+    else:
+        odt = act
+        (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw)
+        n_steps = act.get_num_possible_values() - 1
+        T = np.random.randint(min, max - 1, (mh, n_steps)).astype(np.float32)
+        # provide non-decreasing thresholds
+        T = np.sort(T, axis=1)
+        # generate thresholds for activation
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            tdt = DataType["UINT32"]
+            # bias thresholds to be positive
+            T = np.ceil((T + mw) / 2)
+            assert (T >= 0).all()
+        else:
+            tdt = DataType["INT32"]
+    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
+    # prepare input data
+    input_dict = prepare_inputs(x, idt, wdt)
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+        # convert inputs to binary and use xnorpopcountmatmul
+        y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2)
+    else:
+        y = np.matmul(x, W)
+    if T is not None:
+        # y = multithreshold(y, T)
+        if act == DataType["BIPOLAR"]:
+            # binary to bipolar
+            # y = 2 * y - 1
+            y = multithreshold(y, T, 2, -1)
+        else:
+            # signed offset
+            # y += act.min()
+            y = multithreshold(y, T, 1, act.min())
+    oshape = model.get_tensor_shape("outp")
+    y_expected = y.reshape(oshape)
+    # execute model
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+
+    y_produced = y_produced.reshape(y_expected.shape)
+
+    assert (y_produced == y_expected).all(), "hw-op execution failed"
+
+
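Why T = np.ceil((T + mw) / 2) in the bipolar/bipolar path of the test above: with p the popcount accumulator over mw inputs, the bipolar accumulator is a = 2p - mw, so

    a >= t  <=>  p >= (t + mw) / 2  <=>  p >= ceil((t + mw) / 2)

where the last step uses that p is an integer. Since thresholds are drawn from the signed dot-product range, t + mw >= 0, which is exactly what the assert (T >= 0).all() and the UINT32 threshold datatype rely on.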
["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, backend): + if backend == "rtl" and act is not None: + pytest.skip("RTL MVU doesn't support embedded thresholding functionality.") if nf == -1: nf = mh if sf == -1: @@ -283,6 +375,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("preferred_impl_style", backend) # prepare input data input_dict = prepare_inputs(x, idt, wdt) @@ -303,6 +396,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... + model = model.transform(SpecializeLayers()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -312,7 +406,10 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "MatrixVectorActivation_0" in hls_synt_res_est + if backend == "hls": + assert "MatrixVectorActivation_hls_0" in hls_synt_res_est + else: + assert "MatrixVectorActivation_rtl_0" in hls_synt_res_est node = model.get_nodes_by_op_type("MatrixVectorActivation")[0] inst = getCustomOp(node) @@ -339,10 +436,12 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.parametrize("mw", [128]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [128]) +# Backend +@pytest.mark.parametrize("backend", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( - mem_mode, idt, wdt, act, nf, sf, mw, mh + mem_mode, idt, wdt, act, nf, sf, mw, mh, backend ): if nf == -1: nf = mh @@ -404,6 +503,7 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... 
+    model = model.transform(SpecializeLayers())
     model = model.transform(SetExecMode("rtlsim"))
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(PrepareIP("xc7z020clg400-1", 5))
@@ -413,7 +513,10 @@
     assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed"
 
     hls_synt_res_est = model.analysis(hls_synth_res_estimation)
-    assert "MatrixVectorActivation_0" in hls_synt_res_est
+    if backend == "hls":
+        assert "MatrixVectorActivation_hls_0" in hls_synt_res_est
+    else:
+        assert "MatrixVectorActivation_rtl_0" in hls_synt_res_est
 
     node = model.get_nodes_by_op_type("MatrixVectorActivation")[0]
     inst = getCustomOp(node)
@@ -440,9 +543,11 @@
 @pytest.mark.parametrize("mw", [32])
 # HLS matrix height (output features)
 @pytest.mark.parametrize("mh", [32])
+# Backend
+@pytest.mark.parametrize("backend", ["rtl", "hls"])
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
-def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
+def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, backend):
     if nf == -1:
         nf = mh
     if sf == -1:
@@ -469,6 +574,7 @@ def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh
     inst.set_nodeattr("mem_mode", mem_mode)
     total_fold = nf * sf
     exp_total_cycles = total_fold + 10
+    model = model.transform(SpecializeLayers())
     model = model.transform(SetExecMode("rtlsim"))
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(PrepareIP("xc7z020clg400-1", 5))
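Taken together, the test updates encode the new two-step flow: graphs are first built from backend-neutral hw ops, then SpecializeLayers picks an _hls or _rtl implementation from each node's preferred_impl_style before simulation or IP generation. A condensed sketch of that preparation (part string and clock period copied from the tests above; the sketch assumes an all-fpgadataflow graph, since preferred_impl_style only exists on those nodes):

    from qonnx.custom_op.registry import getCustomOp
    from qonnx.transformation.general import GiveUniqueNodeNames
    from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
    from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
    from finn.transformation.fpgadataflow.prepare_ip import PrepareIP

    def specialize_and_prepare(model, backend="hls"):
        # select an implementation style for every hw-level node, then specialize
        for node in model.graph.node:
            getCustomOp(node).set_nodeattr("preferred_impl_style", backend)
        model = model.transform(SpecializeLayers())
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
        return model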