diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py
index be4cf417bc..a7f220daa9 100644
--- a/src/finn/analysis/fpgadataflow/res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/res_estimation.py
@@ -60,8 +60,8 @@ def res_estimation_complete(model):
     res_dict = {}
     for node in model.graph.node:
         if is_fpgadataflow_node(node) is True:
-            op_type = node.op_type
             inst = registry.getCustomOp(node)
+            op_type = inst.base_op_type()
             if op_type == "MatrixVectorActivation" or op_type == "VectorVectorActivation":
                 orig_restype = inst.get_nodeattr("resType")
                 res_dict[node.name] = []
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 54ba7e4ea1..11107ccb64 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -569,7 +569,12 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(GiveReadableTensorNames())
     if cfg.folding_config_file is not None:
-        model = model.transform(ApplyConfig(cfg.folding_config_file))
+        model = model.transform(
+            ApplyConfig(
+                cfg.folding_config_file,
+                node_filter=lambda x: x.op_type == "StreamingFIFO",
+            )
+        )
 
     # extract the final configuration and save it as json
     hw_attrs = [
diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py
index 188f45273c..ebb5ce98da 100644
--- a/src/finn/custom_op/fpgadataflow/hls/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -50,6 +50,8 @@
 from finn.custom_op.fpgadataflow.hls.thresholding_hls import Thresholding_hls
 from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls
 from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls
+from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MatrixVectorActivation_hls
+from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VectorVectorActivation_hls
 
 custom_op = dict()
 
@@ -75,3 +77,5 @@
 custom_op["Thresholding_hls"] = Thresholding_hls
 custom_op["TLastMarker_hls"] = TLastMarker_hls
 custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls
+custom_op["MatrixVectorActivation_hls"] = MatrixVectorActivation_hls
+custom_op["VectorVectorActivation_hls"] = VectorVectorActivation_hls
\ No newline at end of file
diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
new file mode 100644
index 0000000000..2ad9fefc07
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
@@ -0,0 +1,522 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+import textwrap
+import warnings
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import (
+    calculate_matvec_accumulator_range,
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
+
+from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    pack_innermost_dim_as_hex_string,
+    rtlsim_output_to_npy,
+)
+
+# ONNX i/o tensor shape assumptions for MatrixVectorActivation:
+# input 0 is the input tensor, shape (.., i_size) = (..., MW)
+# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH)
+# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres)
+# output 0 is the output tensor, shape (.., o_size) = (..., MH)
+# the ... here can be any shape (representing groups of vectors)
+
+
+class MatrixVectorActivation_hls(MatrixVectorActivation, HLSBackend):
+    """Corresponds to finn-hlslib MatrixVectorActivation_Batch function."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(MatrixVectorActivation.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def get_template_param_values(self):
+        """Returns the template parameter values according to input, output and weight
+        data types."""
+        ret = dict()
+        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+        if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
+            raise Exception("True binary (non-bipolar) inputs not yet supported")
+        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
+        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+        # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
+        # fill in TSrcI and TWeightI
+        # TODO check these with Giulio
+        # TODO handle non-bipolar binary inputs
+        if inp_is_bipolar and wt_is_bipolar:
+            ret["TSrcI"] = "Recast<XnorMul>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and wt_is_bipolar:
+            ret["TSrcI"] = "Slice<%s>" %
inp_hls_str + ret["TWeightI"] = "Recast" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Identity" + + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["const", "decoupled", "external"]: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"'] + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] + + def defines(self, var): + # Only ipgen mode: Make sure that SIMD parameter satisfies minimum requirements. + if var == "ipgen": + SIMD = self.get_nodeattr("SIMD") + MW = self.get_nodeattr("MW") + condition = SIMD >= (MW / 1024) + msg = ( + f"HLS synthesis of MatrixVectorActivation requires: " + f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} " + f"and MW={MW} for node: {self.onnx_node.name}." 
+ ) + assert condition, msg + mem_mode = self.get_nodeattr("mem_mode") + numInputVectors = list(self.get_nodeattr("numInputVectors")) + numReps = np.prod(numInputVectors) + self.code_gen_dict["$DEFINES$"] = [ + """#define MW1 {}\n #define MH1 {}\n + #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n + #define TMEM1 {}\n #define numReps {}""".format( + self.get_nodeattr("MW"), + self.get_nodeattr("MH"), + self.get_nodeattr("SIMD"), + self.get_nodeattr("PE"), + self.calc_wmem(), + self.calc_tmem(), + numReps, + ) + ] + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + elem_bits = wdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + if mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") + map_to_hls_mult_style = { + "auto": "ap_resource_dflt()", + "lut": "ap_resource_lut()", + "dsp": "ap_resource_dsp()", + } + tmpl_args = self.get_template_param_values() + if self.calc_tmem() == 0: + odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() + threshs = "PassThroughActivation<%s>()" % odtype_hls_str + else: + threshs = "threshs" + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Matrix_Vector_Activate_Batch + (in0_{}, out_{}, weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "decoupled" or 
mem_mode == "external": + wdt = self.get_weight_datatype() + if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Matrix_Vector_Activate_Stream_Batch + (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + wdtype_hls_str, + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_weightstream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + + else: + raise Exception( + """Please set mem_mode to "const" or "decoupled", currently no other + parameter value is supported!""" + ) + + def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") + ram_style_thresholds = self.get_nodeattr("ram_style_thresholds") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + if mem_mode == "const": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") + ) + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() + ) + + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or external, + currently no 
other parameter value is supported!""" + ) + + # the threshold tensor is acc_type [PE][TMEM][N_THRES] + # partition for parallel access along PE and N_THRES + # dimensions (dims 1 and 3) + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") + ) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") + ) + # add resource pragma for thresholds if set + if ram_style_thresholds == "distributed": + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM") + ) + elif ram_style_thresholds == "block": + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM") + ) + elif ram_style_thresholds == "auto": + # no pragma needed + pass + else: + raise Exception("Unrecognized ram_style_thresholds value:" + ram_style_thresholds) + + def get_ap_int_max_w(self): + # base class impl (max of inp/out stream widths) + max_of_io = super().get_ap_int_max_w() + # decoupled mode weight stream + weightstream = self.get_weightstream_width() + # single PE weight entry + weight_bits = self.get_weight_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + single_pe_w = simd * weight_bits + return max([weightstream, max_of_io, single_pe_w]) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        # create a npy file for each input of the node (in_ind is input index)
+        in_ind = 0
+        for inputs in node.input:
+            # it is assumed that the first input of the node is the data input
+            # the second input is the weights
+            if in_ind == 0:
+                assert (
+                    str(context[inputs].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                reshaped_input = context[inputs].reshape(expected_inp_shape)
+                export_idt = self.get_input_datatype()
+                # make copy before saving the array
+                reshaped_input = reshaped_input.copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+            elif in_ind > 2:
+                raise Exception("Unexpected input found for MatrixVectorActivation")
+            in_ind += 1
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            # reinterpret binary output as bipolar where needed
+            if self.get_output_datatype() == DataType["BIPOLAR"]:
+                out = context[node.output[0]]
+                out = 2 * out - 1
+                context[node.output[0]] = out
+            assert (
+                context[node.output[0]].shape == self.get_normal_output_shape()
+            ), "cppsim did not produce expected output shape"
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            self.reset_rtlsim(sim)
+            self.toggle_clk(sim)
+            if mem_mode in ["external", "decoupled"]:
+                wnbits = self.get_weightstream_width()
+                export_wdt = self.get_weight_datatype()
+                wei = npy_to_rtlsim_input(
+                    "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
+                )
+                num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+                io_dict = {
+                    "inputs": {"in0": inp, "weights": wei * num_w_reps},
+                    "outputs": {"out": []},
+                }
+                self.rtlsim_multi_io(sim, io_dict)
+                output = io_dict["outputs"]["out"]
+            else:
+                output = self.rtlsim(sim, inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
\ No newline at end of file
diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py
new file mode 100644
index 0000000000..51de49f1c7
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py
@@ -0,0 +1,372 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+import textwrap
+import warnings
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import (
+    calculate_matvec_accumulator_range,
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
+
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    pack_innermost_dim_as_hex_string,
+    rtlsim_output_to_npy,
+)
+from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+
+class VectorVectorActivation_hls(VectorVectorActivation, HLSBackend):
+    """Corresponds to finn-hlslib Vector_Vector_Activate_Batch function."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(VectorVectorActivation.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def get_template_param_values(self):
+        """Returns the template parameter values according to input, output and weight
+        data types."""
+        ret = dict()
+        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+        if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
+            raise Exception("True binary (non-bipolar) inputs not yet supported")
+        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
+        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+        # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
+        # fill in TSrcI and TWeightI
+        # TODO check these with Giulio
+        # TODO handle non-bipolar binary inputs
+        if inp_is_bipolar and wt_is_bipolar:
+            ret["TSrcI"] = "Recast<XnorMul>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and wt_is_bipolar:
+            ret["TSrcI"] = "Slice<%s>" %
inp_hls_str + ret["TWeightI"] = "Recast" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Identity" + + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["const", "decoupled", "external"]: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + if self.calc_tmem() != 0: + self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] + + def defines(self, var): + dim_h, dim_w = self.get_nodeattr("Dim") + numReps = 1 * dim_h * dim_w + k_h, k_w = self.get_nodeattr("Kernel") + innerProdDim = k_h * k_w + mem_mode = self.get_nodeattr("mem_mode") + + self.code_gen_dict["$DEFINES$"] = [ + """#define Channels1 {}\n #define InnerProdDim {}\n + #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format( + self.get_nodeattr("Channels"), + innerProdDim, + self.get_nodeattr("SIMD"), + self.get_nodeattr("PE"), + numReps, + ) + ] + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + elem_bits = wdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + if mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() + 
) + ) + + def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") + map_to_hls_mult_style = { + "auto": "ap_resource_dflt()", + "lut": "ap_resource_lut()", + "dsp": "ap_resource_dsp()", + } + tmpl_args = self.get_template_param_values() + if self.calc_tmem() == 0: + odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() + threshs = "PassThroughActivation<%s>()" % odtype_hls_str + else: + threshs = "threshs" + + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Vector_Vector_Activate_Batch + (in0_{}, out_{}, weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} + (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( + "Vector_Vector_Activate_Stream_Batch", + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + wdtype_hls_str, + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_weightstream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + else: + raise Exception( + """Please set mem_mode to "const" or "decoupled", currently no other + parameter value is supported!""" + ) + + def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma 
HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + if mem_mode == "const": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") + ) + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() + ) + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or external, + currently no other parameter value is supported!""" + ) + + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") + ) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") + ) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names \ No newline at end of file diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index 846894d85c..d8210fd684 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -472,5 +472,5 @@ def get_ap_int_max_w(self): instream = self.get_instream_width() outstream = self.get_outstream_width() ret = max([instream, outstream]) - assert ret <= 32768, "AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret + assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret return ret diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 4fed8ed4b5..bc59c69192 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -31,7 +31,7 @@ import subprocess import warnings from abc import abstractmethod -from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io +from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io, toggle_clk from qonnx.core.datatype import DataType from qonnx.custom_op.base import CustomOp from qonnx.util.basic import roundup_to_integer_multiple @@ -491,15 +491,11 @@ def exec_precompiled_singlenode_model(self): def reset_rtlsim(self, sim): """Sets reset input in pyverilator to zero, toggles the clock and set it back to one""" - sim.io.ap_rst_n = 0 - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 - sim.io.ap_rst_n = 1 + reset_rtlsim(sim) def toggle_clk(self, sim): """Toggles the clock input in pyverilator once.""" - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 + toggle_clk(sim) def hls_sname(self): """Get the naming convention used by Vitis HLS for stream signals @@ -604,6 +600,7 @@ def rtlsim_multi_io(self, sim, io_dict): trace_file=trace_file, 
sname=sname, liveness_threshold=pyverilate_get_liveness_threshold_cycles(), + do_reset=True, ) self.set_nodeattr("cycles_rtlsim", total_cycle_count) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 6699340cac..fd5751ef7d 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -31,20 +31,32 @@ import os import textwrap import warnings +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +import qonnx.custom_op.general.xnorpopcount as xp +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp from qonnx.util.basic import ( calculate_matvec_accumulator_range, interleave_matrix_outer_dim_from_partitions, roundup_to_integer_multiple, + qonnx_make_model ) -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) +import qonnx.core.data_layout as DataLayout +import finn.core.onnx_exec as oxe +from qonnx.transformation.infer_shapes import InferShapes +import onnx.numpy_helper as np_helper +from qonnx.transformation.general import GiveUniqueNodeNames + # ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) @@ -54,9 +66,8 @@ # the ... here can be any shape (representing groups of vectors) -class MatrixVectorActivation(HLSCustomOp): - """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch - function.""" +class MatrixVectorActivation(HWCustomOp): + """Abstraction layer for HW implementation of MatrixVectorActivation layers.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -67,7 +78,7 @@ def get_nodeattr_types(self): "SIMD": ("i", True, 0), "MW": ("i", True, 0), "MH": ("i", True, 0), - "resType": ("s", False, "lut", {"auto", "lut", "dsp"}), + "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}), "ActVal": ("i", False, 0), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), @@ -122,10 +133,14 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. 
"runtime_writeable_weights": ("i", False, 0, {0, 1}), - } + "preferred_impl_style" : ("s", False, "hls", {"hls", "rtl"}), + } my_attrs.update(super().get_nodeattr_types()) return my_attrs + def base_op_type(self): + return "MatrixVectorActivation" + def calc_wmem(self): """Calculates and returns WMEM.""" mw = self.get_nodeattr("MW") @@ -165,6 +180,61 @@ def infer_node_datatype(self, model): odt = self.get_output_datatype() model.set_tensor_datatype(node.output[0], odt) + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + # when performing FIFO insertion on an FC layer with ext weights, the ind + # parameter can be > 0 (referring to the weights) so handle that here + if ind == 0: + return DataType[self.get_nodeattr("inputDataType")] + elif ind == 1: + return DataType[self.get_nodeattr("weightDataType")] + else: + raise Exception("Undefined input ind for this layer type") + + def get_weight_datatype(self): + """Returns FINN DataType of weights.""" + return DataType[self.get_nodeattr("weightDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self, ind=0): + i_bits = self.get_input_datatype().bitwidth() + assert ( + i_bits <= 9 + ), "RTL-based MVAU only supports activations with bit-width up to 9-bits" + in_width = i_bits * self.get_nodeattr("SIMD") + return in_width + + def get_weightstream_width(self): + """Returns weight stream width. Used only in decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wp = self.get_weight_datatype().bitwidth() + assert ( + wp <= 8 + ), "RTL-based MVAU only supports weights with bit-width up to 8-bits" + w_width = pe * simd * wp + return w_width + else: + return 0 + + def get_outstream_width(self, ind=0): + o_bits = self.get_output_datatype().bitwidth() + out_width = o_bits * self.get_nodeattr("PE") + return out_width + + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. 
Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + def verify_node(self): info_messages = [] # verify that "backend" is set to "fpgadataflow" @@ -385,6 +455,25 @@ def dsp_estimation(self): else: mult_dsp = 0 return int(mult_dsp) +# # TODO: fix DSP estimations --> depends on fpga_part +# def dsp_estimation(self): +# # multiplication +# # mvu_8sx9 (DSP58): ceil(SIMD/3) +# # mvu_4sx4u (DSP48/DSP58): ceil(PE/4) +# # mvu_8sx8u (DSP48): ceil(PE/2) +# # mvu_lut: 0 +# P = self.get_nodeattr("PE") +# res_type = self.get_nodeattr("resType") +# Q = self.get_nodeattr("SIMD") +# wdt = self.get_weight_datatype() +# W = wdt.bitwidth() +# idt = self.get_input_datatype() +# A = idt.bitwidth() +# if res_type == "dsp": +# mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling +# else: +# mult_dsp = 0 +# return int(mult_dsp) def get_exp_cycles(self): pe = self.get_nodeattr("PE") @@ -397,6 +486,27 @@ def get_exp_cycles(self): exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) +# # TODO: fix exp_cycles estimations --> depends on fpga_part and clk +# def get_exp_cycles(self): +# # mvu_8sx9 (DSP58): +# # 2 (replay_buffer) + ceil(chainlen/seglen) + 2 (MREG, PREG) + 2 (output reg slice) +# # + MW/SIMD * MH/PE +# # mvu_4sx4u (DSP48/DSP58) / mvu_8sx8u (DSP48): +# # 3 (IN_REG, MREG, PREG) + 2 (replay_buffer) + 2 (output reg slice) + 1 (adder tree SIMD) + 1 (output lane) +# # + MW/SIMD * MH/PE +# # mvu_lut: +# # 2 (replay_buffer) + 1 OR 2 (no MREG OR MREG) + 2 (output reg slice) +# # + MW/SIMD * MH/PE +# pe = self.get_nodeattr("PE") +# simd = self.get_nodeattr("SIMD") +# num_inp_vec = self.get_nodeattr("numInputVectors") +# mh = self.get_nodeattr("MH") +# mw = self.get_nodeattr("MW") +# # since mmv != 1 is not supported yet, we set mmv for now to 1 +# mmv = 1 +# exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv +# return int(exp_cycles) + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" # when performing FIFO insertion on an FC layer with ext weights, the ind @@ -450,17 +560,6 @@ def get_weightstream_width_padded(self): weight_width = self.get_weightstream_width() return roundup_to_integer_multiple(weight_width, 8) - def get_ap_int_max_w(self): - # base class impl (max of inp/out stream widths) - max_of_io = super().get_ap_int_max_w() - # decoupled mode weight stream - weightstream = self.get_weightstream_width() - # single PE weight entry - weight_bits = self.get_weight_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - single_pe_w = simd * weight_bits - return max([weightstream, max_of_io, single_pe_w]) - def get_folded_input_shape(self, ind=0): mw = self.get_nodeattr("MW") mh = self.get_nodeattr("MH") @@ -505,82 +604,6 @@ def get_number_output_values(self): nf = np.prod(self.get_folded_output_shape()[:-1]) return nf - def get_template_param_values(self): - """Returns the template parameter values according to input, output and weight - data types.""" - ret = dict() - inp_hls_str = self.get_input_datatype().get_hls_datatype_str() - out_hls_str = self.get_output_datatype().get_hls_datatype_str() - inp_is_binary = self.get_input_datatype() == DataType["BINARY"] - # out_is_binary = self.get_output_datatype() == DataType["BINARY"] - wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] - bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 - if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): - raise Exception("True 
binary (non-bipolar) inputs not yet supported") - inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] - # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] - wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] - # reinterpret inp/wt as bipolar if bin_xnor_mode is iset - inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) - wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - # fill in TSrcI and TWeightI - # TODO check these with Giulio - # TODO handle non-bipolar binary inputs - if inp_is_bipolar and wt_is_bipolar: - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and wt_is_bipolar: - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Recast" - elif inp_is_bipolar and (not wt_is_bipolar): - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and (not wt_is_bipolar): - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Identity" - - # fill in TDstI - ret["TDstI"] = "Slice<%s>" % out_hls_str - - return ret - - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 and MW % SIMD == 0 - * for bipolar {-1,+1} weights, convert to binary {0, 1} - * interleave rows between PEs - * reshape into (1, PE, WMEM, SIMD) and return - """ - mw = self.get_nodeattr("MW") - mh = self.get_nodeattr("MH") - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - wmem = self.calc_wmem() - assert orig_weight_matrix.shape == ( - mw, - mh, - ), """Weights matrix doesn't - have expected shape (mw, mh)""" - assert mw % simd == 0, "Requirement MH divisable by SIMD is violated." - assert mh % pe == 0, "Requirement MH divisable by PE is violated." 
-        # start by transposing the original weight matrix, since ONNX and
-        # finn-hlslib use different assumptions
-        # ONNX uses (in_features, out_features) and matmul(x, W)
-        # finn-hlslib uses (out_features, in_features) and matmul(W, x)
-        ret = orig_weight_matrix.T
-        if self.get_weight_datatype() == DataType["BIPOLAR"]:
-            # convert bipolar to binary
-            ret = (ret + 1) / 2
-        # interleave rows between PEs and reshape
-        # distribute rows between PEs
-        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
-        # create SIMD as innermost dimension and add a dummy outer dim
-        ret = ret.reshape(1, pe, wmem, simd)
-        # reverse the SIMD dimension
-        ret = np.flip(ret, axis=-1)
-        return ret
-
     def minimize_accumulator_width(self, model):
         """Minimize the accumulator bit width according to the weight values, input
         data types, and size of dot product"""
@@ -728,6 +751,43 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
         rows between PEs is not as expected (n_thres_steps)"""
         return ret.reshape(1, pe, tmem, n_thres_steps)
 
+    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
+        """Convert the original numpy weight matrix orig_weight_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure MH % PE == 0 and MW % SIMD == 0
+        * for bipolar {-1,+1} weights, convert to binary {0, 1}
+        * interleave rows between PEs
+        * reshape into (1, PE, WMEM, SIMD) and return
+        """
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            mw,
+            mh,
+        ), """Weights matrix doesn't
+        have expected shape (mw, mh)"""
+        assert mw % simd == 0, "Requirement MW divisible by SIMD is violated."
+        assert mh % pe == 0, "Requirement MH divisible by PE is violated."
+        # start by transposing the original weight matrix, since ONNX and
+        # finn-hlslib use different assumptions
+        # ONNX uses (in_features, out_features) and matmul(x, W)
+        # finn-hlslib uses (out_features, in_features) and matmul(W, x)
+        ret = orig_weight_matrix.T
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
+            # convert bipolar to binary
+            ret = (ret + 1) / 2
+        # interleave rows between PEs and reshape
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        # create SIMD as innermost dimension and add a dummy outer dim
+        ret = ret.reshape(1, pe, wmem, simd)
+        # reverse the SIMD dimension
+        ret = np.flip(ret, axis=-1)
+        return ret
+
     def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         """Produce a file containing given weights in appropriate format for this
         layer. This file can be used for either synthesis or run-time reconfig
@@ -905,402 +965,68 @@
             f_thresh.write(thresholds_hls_code)
             f_thresh.close()
 
-    def execute_node(self, context, graph):
-        mode = self.get_nodeattr("exec_mode")
-        mem_mode = self.get_nodeattr("mem_mode")
-        node = self.onnx_node
-
-        # TODO ensure codegen dir exists
-        if mode == "cppsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        elif mode == "rtlsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        else:
-            raise Exception(
-                """Invalid value for attribute exec_mode!
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - # the third input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for MatrixVectorActivation") - in_ind += 1 - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - # reinterpret binary output as bipolar where needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == self.get_normal_output_shape() - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - if mem_mode == "external" or mem_mode == "decoupled": - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - # we have converted bipolar weights to binary for export, - # so use it as such for weight generation - if self.get_weight_datatype() == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict = { - "inputs": {"in0": inp, "weights": wei * num_w_reps}, - "outputs": {"out": []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] - else: - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] - self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode not in ["const", "decoupled", "external"]: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"'] - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] - - def defines(self, var): - # Only ipgen mode: Make sure that SIMD parameter satisfies minimum requirements. - if var == "ipgen": - SIMD = self.get_nodeattr("SIMD") - MW = self.get_nodeattr("MW") - condition = SIMD >= (MW / 1024) - msg = ( - f"HLS synthesis of MatrixVectorActivation requires: " - f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} " - f"and MW={MW} for node: {self.onnx_node.name}." - ) - assert condition, msg - mem_mode = self.get_nodeattr("mem_mode") - numInputVectors = list(self.get_nodeattr("numInputVectors")) - numReps = np.prod(numInputVectors) - self.code_gen_dict["$DEFINES$"] = [ - """#define MW1 {}\n #define MH1 {}\n - #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n - #define TMEM1 {}\n #define numReps {}""".format( - self.get_nodeattr("MW"), - self.get_nodeattr("MH"), - self.get_nodeattr("SIMD"), - self.get_nodeattr("PE"), - self.calc_wmem(), - self.calc_tmem(), - numReps, - ) - ] - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - # note: the innermost dim is reversed for the input - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - elem_bits = wdt.bitwidth() - packed_bits = self.get_weightstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = wdt.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/weights.npy" % code_gen_dir - - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 
'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
-                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
-            )
-        )
-
-        if mem_mode == "decoupled" or mem_mode == "external":
-            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-                'hls::stream<ap_uint<{}>> weights_{} ("weights_{}");'.format(
-                    self.get_weightstream_width(), self.hls_sname(), self.hls_sname()
-                )
-            )
+    def get_op_and_param_counts(self):
+        in_features = self.get_nodeattr("MW")
+        out_features = self.get_nodeattr("MH")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        inp_bits = self.get_input_datatype().bitwidth()
+        num_inp_vec = self.get_nodeattr("numInputVectors")
+        num_repetitions = int(np.prod(num_inp_vec))
+        mac_count = in_features * out_features * num_repetitions
+        # canonicalize op type: highest bitwidth operand first s.t.
+        # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
+        bw1 = min(inp_bits, weight_bits)
+        bw2 = max(inp_bits, weight_bits)
+        mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
+        weight_param_type = "param_weight_%db" % (weight_bits)
+        weight_count = in_features * out_features
+        ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
+        if self.get_nodeattr("noActivation") == 0:
+            tdt = DataType[self.get_nodeattr("accDataType")]
+            thres_bits = tdt.bitwidth()
+            thres_param_type = "param_threshold_%db" % (thres_bits)
+            thres_count = out_features
+            ret_dict[thres_param_type] = thres_count
+        return ret_dict
 
-    def docompute(self):
-        mem_mode = self.get_nodeattr("mem_mode")
-        map_to_hls_mult_style = {
-            "auto": "ap_resource_dflt()",
-            "lut": "ap_resource_lut()",
-            "dsp": "ap_resource_dsp()",
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
         }
-        tmpl_args = self.get_template_param_values()
-        if self.calc_tmem() == 0:
-            odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
-            threshs = "PassThroughActivation<%s>()" % odtype_hls_str
-        else:
-            threshs = "threshs"
-        if mem_mode == "const":
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """Matrix_Vector_Activate_Batch<MW1, MH1, SIMD1, PE1, 1, {}, {}, {}>
-                (in0_{}, out_{}, weights, {}, numReps, {});""".format(
-                    tmpl_args["TSrcI"],
-                    tmpl_args["TDstI"],
-                    tmpl_args["TWeightI"],
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    threshs,
-                    map_to_hls_mult_style[self.get_nodeattr("resType")],
-                )
-            ]
-        elif mem_mode == "decoupled" or mem_mode == "external":
-            wdt = self.get_weight_datatype()
-            if wdt == DataType["BIPOLAR"]:
-                export_wdt = DataType["BINARY"]
-            else:
-                export_wdt = wdt
-            wdtype_hls_str = export_wdt.get_hls_datatype_str()
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """Matrix_Vector_Activate_Stream_Batch<MW1, MH1, SIMD1, PE1, {}, {}, {}, {}>
-                (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format(
-                    tmpl_args["TSrcI"],
-                    tmpl_args["TDstI"],
-                    tmpl_args["TWeightI"],
-                    wdtype_hls_str,
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    threshs,
-                    map_to_hls_mult_style[self.get_nodeattr("resType")],
-                )
-            ]
-
-        else:
-            raise Exception(
-                """Please set mem_mode to "const", "decoupled", or "external",
-                currently no other parameter value is supported!"""
-            )
-
-    def dataoutstrm(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_output_datatype()
-        if dtype == DataType["BIPOLAR"]:
-            # use binary for bipolar storage
-            dtype = DataType["BINARY"]
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_outstream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type =
"float" - npy_out = "%s/output.npy" % code_gen_dir - shape = self.get_folded_output_shape() - shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") - - # note: the innermost dim is not reversed for the output - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - shape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}( - hls::stream> &in0_{}, - hls::stream> &weights_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_weightstream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - - else: - raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" - ) - - def pragmas(self): mem_mode = self.get_nodeattr("mem_mode") - ram_style_thresholds = self.get_nodeattr("ram_style_thresholds") - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - if mem_mode == "const": - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") - ) - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() - ) + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + def execute_node(self, context, graph): + node = self.onnx_node + in_act = context[node.input[0]] + mvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] + mvau_w = np_helper.to_array(mvau_w_init) + # Matrix multiplication + if self.get_nodeattr("binaryXnorMode"): + # Note: activation/weights are expected to be binary (by design coming from the transformation inferring this operation mode) + result = xp.xnorpopcountmatmul(in_act, mvau_w) + elif (self.get_nodeattr("inputDataType") == "BIPOLAR" and self.get_nodeattr("weightDataType") == "BIPOLAR"): + result = xp.xnorpopcountmatmul((in_act+1)/2, (mvau_w+1)/2) else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or external, - currently no other parameter value is supported!""" - ) - - # the threshold tensor is acc_type [PE][TMEM][N_THRES] - # partition for parallel access along PE and N_THRES - # dimensions (dims 1 and 3) - if 
+        # Matrix multiplication
+        if self.get_nodeattr("binaryXnorMode"):
+            # activations/weights are guaranteed to be binary here by the
+            # transformation that infers this operation mode
+            result = xp.xnorpopcountmatmul(in_act, mvau_w)
+        elif (
+            self.get_nodeattr("inputDataType") == "BIPOLAR"
+            and self.get_nodeattr("weightDataType") == "BIPOLAR"
+        ):
+            result = xp.xnorpopcountmatmul((in_act + 1) / 2, (mvau_w + 1) / 2)
         else:
-            raise Exception(
-                """Please set mem_mode to "const", "decoupled", or external,
-                currently no other parameter value is supported!"""
-            )
-
-        # the threshold tensor is acc_type [PE][TMEM][N_THRES]
-        # partition for parallel access along PE and N_THRES
-        # dimensions (dims 1 and 3)
-        if self.calc_tmem() != 0:
-            # TODO find a better way of checking for no pregenerated thresholds
-            self.code_gen_dict["$PRAGMAS$"].append(
-                ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1")
-            )
-            self.code_gen_dict["$PRAGMAS$"].append(
-                ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3")
-            )
-            # add resource pragma for thresholds if set
-            if ram_style_thresholds == "distributed":
-                self.code_gen_dict["$PRAGMAS$"].append(
-                    ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM")
-                )
-            elif ram_style_thresholds == "block":
-                self.code_gen_dict["$PRAGMAS$"].append(
-                    ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM")
-                )
-            elif ram_style_thresholds == "auto":
-                # no pragma needed
-                pass
-            else:
-                raise Exception("Unrecognized ram_style_thresholds value:" + ram_style_thresholds)
+            result = np.matmul(in_act, mvau_w)
+        # thresholding if noActivation == 0
+        if self.get_nodeattr("noActivation") == 0:
+            mvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0]
+            mvau_thr = np_helper.to_array(mvau_thr_init)
+            odt_is_bipolar = DataType[self.get_nodeattr("outputDataType")] == DataType["BIPOLAR"]
+            out_scale = 2 if odt_is_bipolar else 1
+            out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal")
+            result = multithreshold(result, mvau_thr, out_scale, out_bias)
+
+        context[node.output[0]] = result

     def code_generation_ipi(self):
         cmd = []
@@ -1324,22 +1050,51 @@ def code_generation_ipi(self):
         cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
         cmd.append(
             "create_bd_intf_pin -mode Master "
-            "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name)
+            "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+            % (node_name, dout_name)
         )
         cmd.append(
             "create_bd_intf_pin -mode Slave "
             "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
         )
-        # instantiate the hls ip
-        cmd.append(
-            "create_bd_cell -type ip -vlnv %s /%s/%s"
-            % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
-        )
+        is_rtl_op = self.__class__.__name__ == "MatrixVectorActivation_rtl"
+        if is_rtl_op:
+            # instantiate the RTL block
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+            sourcefiles = [
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                ),
+                rtllib_dir + "mvu_vvu_axi.sv",
+                rtllib_dir + "replay_buffer.sv",
+                rtllib_dir + "mvu_4sx4u.sv",
+                rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
+                rtllib_dir + "mvu_8sx8u_dsp48.sv",
+            ]
+            for f in sourcefiles:
+                cmd.append("add_files -norecurse %s" % (f))
+            cmd.append(
+                "create_bd_cell -type hier -reference %s /%s/%s"
+                % (
+                    self.get_nodeattr("gen_top_module"),
+                    self.onnx_node.name,
+                    self.onnx_node.name,
+                )
+            )
+        else:
+            # instantiate the hls ip
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
+            )
+
         # instantiate a streamer and connect it to the HLS IP
         strm_vlnv = "amd.com:finn:memstream:1.0"
         strm_inst = node_name + "_wstrm"
         cmd.append(
-            "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst)
+            "create_bd_cell -type ip -vlnv %s /%s/%s"
+            % (strm_vlnv, node_name, strm_inst)
         )
         cmd.append(
             "set_property -dict [list "
@@ -1393,7 +1148,8 @@ def code_generation_ipi(self):
         axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0]
         cmd.append(
             "create_bd_intf_pin -mode Slave "
-            "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % 
(node_name, axilite_name) + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" + % (node_name, axilite_name) ) cmd.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " @@ -1404,60 +1160,32 @@ def code_generation_ipi(self): cmd.append("assign_bd_address") cmd.append("save_bd_design") elif mem_mode == "const" or mem_mode == "external": - # base class impl sufficient for const/external modes - return super().code_generation_ipi() + if is_rtl_op and mem_mode == "external": + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type module -reference %s %s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + ) + ) + else: + # base class impl sufficient for const/external modes + return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") - return cmd - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - mem_mode = self.get_nodeattr("mem_mode") - sname = self.hls_sname() - if mem_mode == "external": - intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "decoupled": - # only expose axilite interface if attribute is set - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - if runtime_writable: - intf_names["axilite"] = ["s_axilite"] - return intf_names - - def get_op_and_param_counts(self): - in_features = self.get_nodeattr("MW") - out_features = self.get_nodeattr("MH") - weight_bits = self.get_weight_datatype().bitwidth() - inp_bits = self.get_input_datatype().bitwidth() - num_inp_vec = self.get_nodeattr("numInputVectors") - num_repetitions = int(np.prod(num_inp_vec)) - mac_count = in_features * out_features * num_repetitions - # cannonicalize op type: highest bitwidth operand first s.t. - # e.g. 
mac_8bx4b and mac_4bx8b don't appear as two different op types - bw1 = min(inp_bits, weight_bits) - bw2 = max(inp_bits, weight_bits) - mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) - weight_param_type = "param_weight_%db" % (weight_bits) - weight_count = in_features * out_features - ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} - if self.get_nodeattr("noActivation") == 0: - tdt = DataType[self.get_nodeattr("accDataType")] - thres_bits = tdt.bitwidth() - thres_param_type = "param_threshold_%db" % (thres_bits) - thres_count = out_features - ret_dict[thres_param_type] = thres_count - return ret_dict - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", "external"]: - n_weight_inps = self.calc_wmem() - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + return cmd \ No newline at end of file diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index bd5bb75f1d..2168474298 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -38,17 +38,21 @@ roundup_to_integer_multiple, ) -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) +import onnx.numpy_helper as np_helper +import qonnx.custom_op.general.xnorpopcount as xp +from qonnx.custom_op.general.multithreshold import multithreshold -class VectorVectorActivation(HLSCustomOp): - """Class that corresponds to finn-hlslib Vector_Vector_Activate_Batch function""" + +class VectorVectorActivation(HWCustomOp): + """Abstraction layer for HW implementation of VectorVectorActivation layers.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -100,128 +104,66 @@ def get_nodeattr_types(self): # use xnor-popcount for binary weights/inputs, thus treating them # as bipolar "binaryXnorMode": ("i", False, 0, {0, 1}), + # Backend implementation for layer + # hls -- Vivado HLS + # rtl -- (System)Verilog + "preferred_impl_style": ("s", False, "hls", {"hls", "rtl"}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs - def minimize_accumulator_width(self, model): - """Minimize the accumulator bit width according to the weight values, - input data types, and size of dot product""" - weights = model.get_initializer(self.onnx_node.input[1]) - k_h, k_w = self.get_nodeattr("Kernel") - fm = self.get_nodeattr("Channels") - # put weights into the shape expected by calculate_matvec_accumulator_range - weights = weights.reshape(fm, k_h * k_w).transpose() - # since in the calculation the values of the weight matrix are used, - # for the bipolar case they need to be converted to bipolar - if self.get_nodeattr("binaryXnorMode"): - weights = 2 * weights - 1 - if len(self.onnx_node.input) > 2: - thresholds = model.get_initializer(self.onnx_node.input[2]) - else: - thresholds = None - idt = self.get_input_datatype() - - (acc_min, acc_max) = 
calculate_matvec_accumulator_range(weights, idt)
-        # if runtime-writeable weights, then the values of the weights can
-        # change and we need to use the worst-case values from the datatypes
-        if self.get_nodeattr("runtime_writeable_weights"):
-            wdt = self.get_weight_datatype()
-            lower_worst = wdt.min() * np.ones_like(weights)
-            lower_range = calculate_matvec_accumulator_range(lower_worst, idt)
-            upper_worst = wdt.max() * np.ones_like(weights)
-            upper_range = calculate_matvec_accumulator_range(upper_worst, idt)
-            acc_min = min(min(lower_range), min(upper_range))
-            acc_max = max(max(upper_range), max(upper_range))
+    def base_op_type(self):
+        return "VectorVectorActivation"
-        # if the thresholds can be used to determine range, then adjust the range
-        # according to the known values of the thresholds
-        if thresholds is not None:
-            threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
-            # set threshold datatype (and accumulator datatype implicitly)
-            min_threshold = thresholds.min()
-            max_threshold = thresholds.max()
-            # clip threshold values
-            if max_threshold > acc_max or min_threshold < acc_min:
-                warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name)
-                thresholds = np.clip(thresholds, acc_min, acc_max)
-                model.set_initializer(self.onnx_node.input[2], thresholds)
-                threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
-                min_threshold = thresholds.min()
-                max_threshold = thresholds.max()
-            acc_min = min(min_threshold, acc_min)
-            acc_max = max(max_threshold, acc_max)
+    def _infer_sparse_weight_tensor(self, W_conv, k_h, k_w, channels):
+        W_sparse = np.zeros((channels, channels, k_h, k_w), dtype=np.float32)
+        for ch in range(channels):
+            W_sparse[ch][ch] = W_conv[ch][0]
+        W_conv = W_sparse.astype(np.float32)
+        W_matmul = W_conv.transpose(0, 2, 3, 1)
+        W_matmul = W_matmul.reshape(channels, channels * k_h * k_w)
+        W_matmul = W_matmul.T
+        return W_matmul
-        # if the acc_range is always greater than 0, then acc_max <= 2^P - 1
-        if acc_min >= 0:
-            acc_bit_width = np.log2(acc_max + 1)
-            acc_bit_width = math.ceil(acc_bit_width)
-            adt = DataType[f"UINT{acc_bit_width}"]
-        # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <=
-        # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max)
+    def execute_node(self, context, graph):
+        node = self.onnx_node
+        in_act = context[node.input[0]]
+        (_, dim_h, dim_w, _) = in_act.shape
+        (k_h, k_w) = self.get_nodeattr("Kernel")
+        channels = self.get_nodeattr("Channels")
+        # reorder input activations so the innermost dims become
+        # (k_h * k_w, channels) before flattening back to NHWC layout
+        in_act = in_act.reshape(1, dim_h, dim_w, channels, k_h * k_w)
+        in_act = in_act.transpose(0, 1, 2, 4, 3)
+        in_act = in_act.reshape(1, dim_h, dim_w, channels * k_h * k_w)
+        # expand the sparse depthwise weights into a dense matmul matrix
+        vvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0]
+        vvau_w = np_helper.to_array(vvau_w_init)
+        vvau_w_onnx = self._infer_sparse_weight_tensor(vvau_w, k_h, k_w, channels)
+
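+        # np.matmul on bipolar {-1, +1} operands yields the true dot product
+        # d over K = k_h * k_w elements, while the hardware accumulates the
+        # XNOR-popcount value p; the two are related by d = 2 * p - K, so the
+        # branch below recovers p as (d + K) / 2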
+        if (
+            self.get_nodeattr("inputDataType") == "BIPOLAR"
+            and self.get_nodeattr("weightDataType") == "BIPOLAR"
+        ):
+            result = np.matmul(in_act, vvau_w_onnx)
+            result = (result + k_h * k_w) / 2
         else:
-            _acc_max = max(-acc_min, 1 + acc_max)
-            acc_bit_width = np.log2(_acc_max) + 1
-            acc_bit_width = math.ceil(acc_bit_width)
-            adt = DataType[f"INT{acc_bit_width}"]
-
-        # if activation, assert that the thresholds can be expressed with adt
-        if thresholds is not None:
-            assert np.vectorize(adt.allowed)(
-                threshold_tensor
-            ).all(), "Thresholds in %s can't be expressed with type %s" % (
-                self.onnx_node.name,
-                str(adt),
-            )
-
-        # if no activation, output and accumulator datatypes are the same
-        if self.get_nodeattr("noActivation"):
-            # if this is the last node in the graph, then ensure the datatype is
-            # divisibly by 8 bits
-            if model.find_direct_successors(self.onnx_node) is None:
-                bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
-                new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
-                adt = DataType[new_adt_name]
-            # for no-activation nodes, output dt = acc dt
-            self.set_nodeattr("outputDataType", adt.name)
-            self.set_nodeattr("accDataType", adt.name)
-
-        return DataType[self.get_nodeattr("accDataType")]
-
-    def minimize_weight_bit_width(self, model):
-        """Minimize the bit width based on the values of the weights"""
-        if not self.get_nodeattr("runtime_writeable_weights"):
-            weights = model.get_initializer(self.onnx_node.input[1])
-            w_min = weights.min()
-            w_max = weights.max()
-            if w_min < 0:
-                if abs(w_min) > w_max:
-                    wdt = DataType.get_smallest_possible(w_min)
-                else:
-                    wdt = DataType.get_smallest_possible(-w_max - 1)
-            else:
-                wdt = DataType.get_smallest_possible(w_max)
-            self.set_nodeattr("weightDataType", wdt.name)
-        return DataType[self.get_nodeattr("weightDataType")]
-
-    def calc_wmem(self):
-        """Calculates and returns WMEM."""
-        ch = self.get_nodeattr("Channels")
-        k_h, k_w = self.get_nodeattr("Kernel")
-        pe = self.get_nodeattr("PE")
-        simd = self.get_nodeattr("SIMD")
-        wmem = (k_h * k_w * ch // pe) // simd
-        return wmem
+            result = np.matmul(in_act, vvau_w_onnx)  # result is in [N, H, W, C] format
-    def calc_tmem(self):
-        """Calculates and returns TMEM."""
-        if self.get_nodeattr("noActivation") == 1:
-            return 0
-        else:
-            ch = self.get_nodeattr("Channels")
-            pe = self.get_nodeattr("PE")
-            return ch // pe
+        if self.get_nodeattr("noActivation") == 0:
+            vvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0]
+            vvau_thr = np_helper.to_array(vvau_thr_init)
+            odt_is_bipolar = DataType[self.get_nodeattr("outputDataType")] == DataType["BIPOLAR"]
+            out_scale = 2 if odt_is_bipolar else 1
+            out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal")
+            # NHWC to NCHW for multithreshold node
+            result = result.transpose((0, 3, 1, 2))
+            result = multithreshold(result, vvau_thr, out_scale, out_bias)
+            # NCHW to NHWC
+            result = result.transpose((0, 2, 3, 1))
+
+        context[node.output[0]] = result
+
+    def verify_node(self):
+        pass
+
     def make_shape_compatible_op(self, model):
         oshape = self.get_normal_output_shape()
         return super().make_const_shape_op(oshape)
@@ -241,9 +183,6 @@ def infer_node_datatype(self, model):
         odt = self.get_output_datatype()
         model.set_tensor_datatype(node.output[0], odt)

-    def verify_node(self):
-        pass
-
     def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
@@ -266,12 +205,32 @@ def get_instream_width(self, ind=0):
         pe = self.get_nodeattr("PE")
         in_width = i_bits * simd * pe
         return in_width
+
+    def get_weightstream_width(self):
+        """Returns weight stream width. 
Used only in decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + w_width = simd * pe * wp + return w_width + else: + return 0 def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() out_width = o_bits * self.get_nodeattr("PE") return out_width + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + def get_folded_input_shape(self, ind=0): k_h, k_w = self.get_nodeattr("Kernel") dim_h, dim_w = self.get_nodeattr("Dim") @@ -320,89 +279,303 @@ def get_number_output_values(self): nf = np.prod(self.get_folded_output_shape()[:-1]) return nf - def get_exp_cycles(self): - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") + def calc_wmem(self): + """Calculates and returns WMEM.""" ch = self.get_nodeattr("Channels") - dim_h, dim_w = self.get_nodeattr("Dim") k_h, k_w = self.get_nodeattr("Kernel") - # currently FINN supports for vvau a batch size of 1 - batch_size = 1 - # since mmv != 1 is not supported yet, we set mmv for now to 1 - mmv = 1 - exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv - return int(exp_cycles) + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wmem = (k_h * k_w * ch // pe) // simd + return wmem - def get_template_param_values(self): - """Returns the template parameter values according to input, output and weight - data types.""" - ret = dict() - inp_hls_str = self.get_input_datatype().get_hls_datatype_str() - out_hls_str = self.get_output_datatype().get_hls_datatype_str() - inp_is_binary = self.get_input_datatype() == DataType["BINARY"] - # out_is_binary = self.get_output_datatype() == DataType["BINARY"] - wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] - bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 - if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): - raise Exception("True binary (non-bipolar) inputs not yet supported") - inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] - # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] - wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] - # reinterpret inp/wt as bipolar if bin_xnor_mode is iset - inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) - wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - # fill in TSrcI and TWeightI - # TODO check these with Giulio - # TODO handle non-bipolar binary inputs - if inp_is_bipolar and wt_is_bipolar: - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and wt_is_bipolar: - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Recast" - elif inp_is_bipolar and (not wt_is_bipolar): - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and (not wt_is_bipolar): - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Identity" + def calc_tmem(self): + """Calculates and returns TMEM.""" + if self.get_nodeattr("noActivation") == 1: + return 0 + else: + ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + return ch // pe - # fill in TDstI - ret["TDstI"] = "Slice<%s>" % out_hls_str + def uram_estimation(self): 
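+        # estimate sketch: the weights form omega (= wmem) words of
+        # mem_width = Q * W * P bits each (Q = SIMD, W = weight bits, P = PE
+        # below); one UltraRAM block is 72 bits wide and 4096 entries deep,
+        # hence the ceil(mem_width / 72) * ceil(omega / 4096) count below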
+ P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle != "ultra") + or (mmode == "const") + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier - return ret + def bram_estimation(self): + """Calculates resource estimation for BRAM""" + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + # assuming SDP mode RAMB18s (see UG573 Table 1-10) + # since this is HLS memory, not using the full width of a BRAM + # assuming memories up to 128 deep get implemented in LUTs + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) + or (mstyle == "auto" and self.calc_wmem() <= 128) + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - ch = self.get_nodeattr("Channels") - k_h, k_w = self.get_nodeattr("Kernel") - wmem = self.calc_wmem() - assert orig_weight_matrix.shape == ( - ch, - 1, - k_h, - k_w, - ), """Weights matrix doesn't - have expected shape (channels, 1, kernel_size, kernel_size)""" - ret = orig_weight_matrix - if self.get_weight_datatype() == DataType["BIPOLAR"]: - # convert bipolar to binary - ret = (ret + 1) / 2 - ret = ret.reshape(ch, k_h * k_w) - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - ret = ret.reshape(1, pe, wmem, simd) - return ret + if mem_width == 1: + return math.ceil(omega / 16384) + elif mem_width == 2: + return math.ceil(omega / 8192) + elif mem_width <= 4: + return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) + elif mem_width <= 9: + return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8)) + elif mem_width <= 18 or omega > 512: + return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16)) + else: + return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32)) - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 - * for bipolar weights&inputs, ensure thresholds are positive - * interleave rows between PEs - * reshape into (PE, TMEM, n_thres_steps) and return - """ + def bram_efficiency_estimation(self): + P = self.get_nodeattr("PE") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + bram16_est = self.bram_estimation() + if bram16_est == 0: + return 1 + wbits = W * P * omega + bram16_est_capacity = bram16_est * 36 * 512 + return wbits / bram16_est_capacity + + def uram_efficiency_estimation(self): + """Function for URAM efficiency estimation: actual parameter storage + needed divided by the allocated URAM storage (from estimation)""" + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = int(np.prod(self.get_nodeattr("Kernel"))) + D_out = self.get_nodeattr("Channels") + uram_est = self.uram_estimation() + if uram_est == 0: + return 1 + 
wbits = W * D_in * D_out + uram_est_capacity = uram_est * 72 * 4096 + return wbits / uram_est_capacity + + def lut_estimation(self): + """Calculates resource estimations for LUTs based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + # determine tdt with input and weight data types + idt = self.get_input_datatype() + A = idt.bitwidth() + # parameters from experiments in paper mentioned above + c0 = 300 + c1 = 1.1 + c2 = 0 + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) + + # multiplication + res_type = self.get_nodeattr("resType") + if res_type == "dsp": + mult_luts = 0 + else: + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) + # accumulator + acc_datatype = self.get_accumulator_datatype() + acc_bits = acc_datatype.bitwidth() + k_h, k_w = self.get_nodeattr("Kernel") + # if accDataType is not set, then it will default to INT32, which would + # be a large overestimate in most (if not all) cases. In this scenario, + # we would use the minimum accumulator as determined by the data types + # bound, derived in https://arxiv.org/abs/2301.13376 + alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed()) + acc_bits = min( + acc_datatype.bitwidth(), + np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), + ) + acc_luts = acc_bits + # thresholds and threshold comparators + thr_luts = 0 + comp_luts = 0 + noact = self.get_nodeattr("noActivation") + # TODO - add 'ram_style_threshold' node attribute + if noact == 0: + odt = self.get_output_datatype() + B = odt.bitwidth() + thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64 + comp_luts = (2**B - 1) * acc_bits + + return int( + c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 + ) + + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + res_type = self.get_nodeattr("resType") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + idt = self.get_input_datatype() + A = idt.bitwidth() + if res_type == "dsp": + mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling + else: + mult_dsp = 0 + return int(mult_dsp) + + def get_exp_cycles(self): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + ch = self.get_nodeattr("Channels") + dim_h, dim_w = self.get_nodeattr("Dim") + k_h, k_w = self.get_nodeattr("Kernel") + # currently FINN supports for vvau a batch size of 1 + batch_size = 1 + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv + return int(exp_cycles) + + def minimize_accumulator_width(self, model): + """Minimize the accumulator bit width according to the weight values, + input data types, and size of dot product""" + weights = model.get_initializer(self.onnx_node.input[1]) + k_h, k_w = self.get_nodeattr("Kernel") + fm = self.get_nodeattr("Channels") + # put weights into the shape expected by calculate_matvec_accumulator_range + weights = 
weights.reshape(fm, k_h * k_w).transpose()
+        # since in the calculation the values of the weight matrix are used,
+        # for the bipolar case they need to be converted to bipolar
+        if self.get_nodeattr("binaryXnorMode"):
+            weights = 2 * weights - 1
+        if len(self.onnx_node.input) > 2:
+            thresholds = model.get_initializer(self.onnx_node.input[2])
+        else:
+            thresholds = None
+        idt = self.get_input_datatype()
+
+        (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt)
+        # if runtime-writeable weights, then the values of the weights can
+        # change and we need to use the worst-case values from the datatypes
+        if self.get_nodeattr("runtime_writeable_weights"):
+            wdt = self.get_weight_datatype()
+            lower_worst = wdt.min() * np.ones_like(weights)
+            lower_range = calculate_matvec_accumulator_range(lower_worst, idt)
+            upper_worst = wdt.max() * np.ones_like(weights)
+            upper_range = calculate_matvec_accumulator_range(upper_worst, idt)
+            acc_min = min(min(lower_range), min(upper_range))
+            acc_max = max(max(lower_range), max(upper_range))
+
+        # if the thresholds can be used to determine range, then adjust the range
+        # according to the known values of the thresholds
+        if thresholds is not None:
+            threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+            # set threshold datatype (and accumulator datatype implicitly)
+            min_threshold = thresholds.min()
+            max_threshold = thresholds.max()
+            # clip threshold values
+            if max_threshold > acc_max or min_threshold < acc_min:
+                warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name)
+                thresholds = np.clip(thresholds, acc_min, acc_max)
+                model.set_initializer(self.onnx_node.input[2], thresholds)
+                threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+                min_threshold = thresholds.min()
+                max_threshold = thresholds.max()
+            acc_min = min(min_threshold, acc_min)
+            acc_max = max(max_threshold, acc_max)
+
+        # if the acc_range is always greater than 0, then acc_max <= 2^P - 1
+        if acc_min >= 0:
+            acc_bit_width = np.log2(acc_max + 1)
+            acc_bit_width = math.ceil(acc_bit_width)
+            adt = DataType[f"UINT{acc_bit_width}"]
+        # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <=
+        # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max)
+        else:
+            _acc_max = max(-acc_min, 1 + acc_max)
+            acc_bit_width = np.log2(_acc_max) + 1
+            acc_bit_width = math.ceil(acc_bit_width)
+            adt = DataType[f"INT{acc_bit_width}"]
+
+        # if activation, assert that the thresholds can be expressed with adt
+        if thresholds is not None:
+            assert np.vectorize(adt.allowed)(
+                threshold_tensor
+            ).all(), "Thresholds in %s can't be expressed with type %s" % (
+                self.onnx_node.name,
+                str(adt),
+            )
+
+        # if no activation, output and accumulator datatypes are the same
+        if self.get_nodeattr("noActivation"):
+            # if this is the last node in the graph, then ensure the datatype is
+            # divisible by 8 bits
+            if model.find_direct_successors(self.onnx_node) is None:
+                bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
+                new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
+                adt = DataType[new_adt_name]
+            # for no-activation nodes, output dt = acc dt
+            self.set_nodeattr("outputDataType", adt.name)
+            self.set_nodeattr("accDataType", adt.name)
+
+        return DataType[self.get_nodeattr("accDataType")]
+
+    def minimize_weight_bit_width(self, model):
+        """Minimize the bit width based on the values of the weights"""
+        if not self.get_nodeattr("runtime_writeable_weights"):
+            weights = model.get_initializer(self.onnx_node.input[1])
+            w_min = weights.min()
+            w_max = weights.max()
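+            # pick the smallest datatype covering both extremes: when w_min is
+            # negative but |w_min| <= w_max, querying get_smallest_possible with
+            # -w_max - 1 forces a signed type wide enough to also hold w_max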
+            if w_min < 0:
+                if abs(w_min) > w_max:
+                    wdt = DataType.get_smallest_possible(w_min)
+                else:
+                    wdt = DataType.get_smallest_possible(-w_max - 1)
+            else:
+                wdt = DataType.get_smallest_possible(w_max)
+            self.set_nodeattr("weightDataType", wdt.name)
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
+        """Convert the original numpy threshold matrix orig_thres_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure Channels % PE == 0
+        * for bipolar weights & inputs, ensure thresholds are positive
+        * interleave rows between PEs
+        * reshape into (PE, TMEM, n_thres_steps) and return
+        """
         ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
         tmem = self.calc_tmem()
@@ -446,6 +619,29 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
         rows between PEs is not as expected (n_thres_steps)"""
         return ret.reshape(1, pe, tmem, n_thres_steps)

+    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        ch = self.get_nodeattr("Channels")
+        k_h, k_w = self.get_nodeattr("Kernel")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            ch,
+            1,
+            k_h,
+            k_w,
+        ), """Weights matrix doesn't
+        have expected shape (channels, 1, kernel_size, kernel_size)"""
+        ret = orig_weight_matrix
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
+            # convert bipolar to binary
+            ret = (ret + 1) / 2
+        ret = ret.reshape(ch, k_h * k_w)
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        ret = ret.reshape(1, pe, wmem, simd)
+        return ret
+
     def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         """Produce a file containing given weights in appropriate format for this
         layer. This file can be used for either synthesis or run-time reconfig
@@ -623,384 +819,44 @@ def generate_params(self, model, path):
             f_thresh.write(thresholds_hls_code)
             f_thresh.close()

-    def execute_node(self, context, graph):
-        mode = self.get_nodeattr("exec_mode")
-        mem_mode = self.get_nodeattr("mem_mode")
-        node = self.onnx_node
-
-        # TODO ensure codegen dir exists
-        if mode == "cppsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        elif mode == "rtlsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        else:
-            raise Exception(
-                """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - # the third input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for VectorVectorActivation") - in_ind += 1 - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - # reinterpret binary output as bipolar where needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == self.get_normal_output_shape() - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - - if mem_mode == "external" or mem_mode == "decoupled": - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - # we have converted bipolar weights to binary for export, - # so use it as such for weight generation - if self.get_weight_datatype() == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) - dim_h, dim_w = self.get_nodeattr("Dim") - num_w_reps = dim_h * dim_w - - io_dict = { - "inputs": {"in0": inp, "weights": wei * num_w_reps}, - "outputs": {"out": []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] - else: - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] - self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode not in ["const", "decoupled", "external"]: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - if self.calc_tmem() != 0: - self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] - - def defines(self, var): - dim_h, dim_w = self.get_nodeattr("Dim") - numReps = 1 * dim_h * dim_w + def get_op_and_param_counts(self): k_h, k_w = self.get_nodeattr("Kernel") - innerProdDim = k_h * k_w - mem_mode = self.get_nodeattr("mem_mode") - - self.code_gen_dict["$DEFINES$"] = [ - """#define Channels1 {}\n #define InnerProdDim {}\n - #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format( - self.get_nodeattr("Channels"), - innerProdDim, - self.get_nodeattr("SIMD"), - self.get_nodeattr("PE"), - numReps, - ) - ] - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - # note: the innermost dim is reversed for the input - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - elem_bits = wdt.bitwidth() - packed_bits = self.get_weightstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = wdt.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/weights.npy" % code_gen_dir - - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - if mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> weights_{} ("weights_{}");'.format( - self.get_weightstream_width(), self.hls_sname(), self.hls_sname() - ) - ) + fm = self.get_nodeattr("Channels") + dim_h, dim_w = self.get_nodeattr("Dim") + weight_bits = self.get_weight_datatype().bitwidth() + inp_bits = self.get_input_datatype().bitwidth() + 
num_repetitions = int(dim_h * dim_w)
+        mac_count = k_h * k_w * fm * num_repetitions
+        # canonicalize op type: highest bitwidth operand first s.t.
+        # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
+        bw1 = min(inp_bits, weight_bits)
+        bw2 = max(inp_bits, weight_bits)
+        mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
+        weight_param_type = "param_weight_%db" % (weight_bits)
+        weight_count = k_h * k_w * fm
+        ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
+        if self.get_nodeattr("noActivation") == 0:
+            tdt = DataType[self.get_nodeattr("accDataType")]
+            thres_bits = tdt.bitwidth()
+            thres_param_type = "param_threshold_%db" % (thres_bits)
+            thres_count = fm
+            ret_dict[thres_param_type] = thres_count
+        return ret_dict
-    def docompute(self):
-        mem_mode = self.get_nodeattr("mem_mode")
-        map_to_hls_mult_style = {
-            "auto": "ap_resource_dflt()",
-            "lut": "ap_resource_lut()",
-            "dsp": "ap_resource_dsp()",
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
         }
-        tmpl_args = self.get_template_param_values()
-        if self.calc_tmem() == 0:
-            odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
-            threshs = "PassThroughActivation<%s>()" % odtype_hls_str
-        else:
-            threshs = "threshs"
-
-        if mem_mode == "const":
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}>
-                (in0_{}, out_{}, weights, {}, numReps, {});""".format(
-                    tmpl_args["TSrcI"],
-                    tmpl_args["TDstI"],
-                    tmpl_args["TWeightI"],
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    threshs,
-                    map_to_hls_mult_style[self.get_nodeattr("resType")],
-                )
-            ]
-        elif mem_mode == "decoupled" or mem_mode == "external":
-            wdt = self.get_weight_datatype()
-            if wdt == DataType["BIPOLAR"]:
-                export_wdt = DataType["BINARY"]
-            else:
-                export_wdt = wdt
-            wdtype_hls_str = export_wdt.get_hls_datatype_str()
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<Channels1, InnerProdDim, SIMD1, PE1, {}, {}, {}, {}>
-                (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format(
-                    "Vector_Vector_Activate_Stream_Batch",
-                    tmpl_args["TSrcI"],
-                    tmpl_args["TDstI"],
-                    tmpl_args["TWeightI"],
-                    wdtype_hls_str,
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    threshs,
-                    map_to_hls_mult_style[self.get_nodeattr("resType")],
-                )
-            ]
-        else:
-            raise Exception(
-                """Please set mem_mode to "const", "decoupled", or "external",
-                currently no other parameter value is supported!"""
-            )
-
-    def dataoutstrm(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_output_datatype()
-        if dtype == DataType["BIPOLAR"]:
-            # use binary for bipolar storage
-            dtype = DataType["BINARY"]
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_outstream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "float"
-        npy_out = "%s/output.npy" % code_gen_dir
-        shape = self.get_folded_output_shape()
-        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
-
-        # note: the innermost dim is not reversed for the output
-        self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                self.hls_sname(),
-                shape_cpp_str,
-                npy_out,
-            )
-        ]
-
-    def save_as_npy(self):
-        self.code_gen_dict["$SAVEASCNPY$"] = []
-
-    def blackboxfunction(self):
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "const":
-            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- 
"""void {}(hls::stream> &in0_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}( - hls::stream> &in0_{}, - hls::stream> &weights_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_weightstream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - else: - raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" - ) - - def pragmas(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - if mem_mode == "const": - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") - ) - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() - ) - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or external, - currently no other parameter value is supported!""" - ) - - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") - ) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") - ) - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") - sname = self.hls_sname() - if mem_mode == "external": - intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "decoupled": - # only expose axilite interface if attribute is set - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - if runtime_writable: - intf_names["axilite"] = ["s_axilite"] - return intf_names + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) def code_generation_ipi(self): cmd = [] @@ -1108,207 +964,4 @@ def code_generation_ipi(self): return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for VectorVectorActivation") - return cmd - - def uram_estimation(self): - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - mem_width = Q * W * P - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle != "ultra") - or (mmode == "const") - or (mmode == "external") - ): - return 0 - 
width_multiplier = math.ceil(mem_width / 72) - depth_multiplier = math.ceil(omega / 4096) - return width_multiplier * depth_multiplier - - def bram_estimation(self): - """Calculates resource estimation for BRAM""" - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - mem_width = Q * W * P - # assuming SDP mode RAMB18s (see UG573 Table 1-10) - # since this is HLS memory, not using the full width of a BRAM - # assuming memories up to 128 deep get implemented in LUTs - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) - or (mstyle == "auto" and self.calc_wmem() <= 128) - or (mmode == "const" and self.calc_wmem() <= 128) - or (mmode == "external") - ): - return 0 - - if mem_width == 1: - return math.ceil(omega / 16384) - elif mem_width == 2: - return math.ceil(omega / 8192) - elif mem_width <= 4: - return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) - elif mem_width <= 9: - return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8)) - elif mem_width <= 18 or omega > 512: - return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16)) - else: - return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32)) - - def bram_efficiency_estimation(self): - P = self.get_nodeattr("PE") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - bram16_est = self.bram_estimation() - if bram16_est == 0: - return 1 - wbits = W * P * omega - bram16_est_capacity = bram16_est * 36 * 512 - return wbits / bram16_est_capacity - - def lut_estimation(self): - """Calculates resource estimations for LUTs based on: - - FINN-R: An End-to-End Deep-Learning Framework for Fast - Exploration of Quantized Neural Networks - - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, - Y. Umuroglu, M. Leeser and K. Vissers - - 12. Sep 2018 - """ - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - # determine tdt with input and weight data types - idt = self.get_input_datatype() - A = idt.bitwidth() - # parameters from experiments in paper mentioned above - c0 = 300 - c1 = 1.1 - c2 = 0 - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if (mmode == "decoupled" and mstyle == "distributed") or ( - mmode == "const" and self.calc_wmem() <= 128 - ): - c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) - - # multiplication - res_type = self.get_nodeattr("resType") - if res_type == "dsp": - mult_luts = 0 - else: - mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) - # adder tree - addertree_luts = (W + A) * (2 * Q - 1) - # accumulator - acc_datatype = self.get_accumulator_datatype() - acc_bits = acc_datatype.bitwidth() - k_h, k_w = self.get_nodeattr("Kernel") - # if accDataType is not set, then it will default to INT32, which would - # be a large overestimate in most (if not all) cases. 
In this scenario, - # we would use the minimum accumulator as determined by the data types - # bound, derived in https://arxiv.org/abs/2301.13376 - alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed()) - acc_bits = min( - acc_datatype.bitwidth(), - np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), - ) - acc_luts = acc_bits - # thresholds and threshold comparators - thr_luts = 0 - comp_luts = 0 - noact = self.get_nodeattr("noActivation") - # TODO - add 'ram_style_threshold' node attribute - if noact == 0: - odt = self.get_output_datatype() - B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64 - comp_luts = (2**B - 1) * acc_bits - - return int( - c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 - ) - - def dsp_estimation(self): - # multiplication - P = self.get_nodeattr("PE") - res_type = self.get_nodeattr("resType") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - idt = self.get_input_datatype() - A = idt.bitwidth() - if res_type == "dsp": - mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling - else: - mult_dsp = 0 - return int(mult_dsp) - - def get_weightstream_width(self): - """Returns weight stream width. Used only in decoupled mode.""" - if ( - self.get_nodeattr("mem_mode") == "decoupled" - or self.get_nodeattr("mem_mode") == "external" - ): - simd = self.get_nodeattr("SIMD") - pe = self.get_nodeattr("PE") - wp = self.get_weight_datatype().bitwidth() - w_width = simd * pe * wp - return w_width - else: - return 0 - - def get_weightstream_width_padded(self): - """Returns weight stream width padded to a multiple of 8. This is required - by the AXI Stream spec. Used in decoupled mode.""" - weight_width = self.get_weightstream_width() - return roundup_to_integer_multiple(weight_width, 8) - - def get_op_and_param_counts(self): - k_h, k_w = self.get_nodeattr("Kernel") - fm = self.get_nodeattr("Channels") - dim_h, dim_w = self.get_nodeattr("Dim") - weight_bits = self.get_weight_datatype().bitwidth() - inp_bits = self.get_input_datatype().bitwidth() - num_repetitions = int(dim_h * dim_w) - mac_count = k_h * k_w * fm * num_repetitions - # cannonicalize op type: highest bitwidth operand first s.t. - # e.g. 
mac_8bx4b and mac_4bx8b don't appear as two different op types
-        bw1 = min(inp_bits, weight_bits)
-        bw2 = max(inp_bits, weight_bits)
-        mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
-        weight_param_type = "param_weight_%db" % (weight_bits)
-        weight_count = k_h * k_w * fm
-        ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
-        if self.get_nodeattr("noActivation") == 0:
-            tdt = DataType[self.get_nodeattr("accDataType")]
-            thres_bits = tdt.bitwidth()
-            thres_param_type = "param_threshold_%db" % (thres_bits)
-            thres_count = fm
-            ret_dict[thres_param_type] = thres_count
-        return ret_dict
-
-    def derive_characteristic_fxns(self, period):
-        n_inps = np.prod(self.get_folded_input_shape()[:-1])
-        io_dict = {
-            "inputs": {
-                "in0": [0 for i in range(n_inps)],
-            },
-            "outputs": {"out": []},
-        }
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode in ["decoupled", "external"]:
-            n_weight_inps = self.calc_wmem()
-            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
-            io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)]
-        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
+        return cmd
\ No newline at end of file
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index d1d61f0ed5..26cd0b74ad 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -1280,3 +1280,418 @@ def apply(self, model):
                 graph_modified = True
         return (model, graph_modified)
+
+class InferBinaryMatrixVectorActivation(Transformation):
+    """Convert XnorPopcountMatMul layers to
+    MatrixVectorActivation layers. Any immediately following MultiThreshold
+    layers will also be absorbed into the MVTU."""
+
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "XnorPopcountMatMul":
+                mm_input = n.input[0]
+                mm_weight = n.input[1]
+                mm_output = n.output[0]
+                mm_in_shape = model.get_tensor_shape(mm_input)
+                mm_out_shape = model.get_tensor_shape(mm_output)
+                assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], (
+                    n.name
+                    + """: First
+                input for xnorpopcount is not set to FINN DataType BINARY."""
+                )
+                assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], (
+                    n.name
+                    + """: Second
+                input (weights) for xnorpopcount is not set to FINN DataType BINARY."""
+                )
+                idt = DataType["BINARY"]
+                wdt = DataType["BINARY"]
+                mm_output = n.output[0]
+                W = model.get_initializer(mm_weight)
+                # extract weight shape, note that ONNX and finn-hlslib
+                # make different assumptions about dim order here
+                # ONNX assumes W has (in, out) shape
+                # finn-hlslib assumes W has (out, in) shape
+                mh = int(W.shape[1])
+                mw = int(W.shape[0])
+                # create node with no parallelization first
+                pe = 1
+                simd = 1
+                wmem = mw * mh // (pe * simd)
+                assert mw * mh == wmem * pe * simd, (
+                    n.name
+                    + """: Requirement (MW * MH) divisible by
+                (WMEM * PE * SIMD) is violated."""
+                )
+                # see if we have any following thresholds
+                consumer = model.find_consumer(mm_output)
+                if consumer is not None and consumer.op_type == "MultiThreshold":
+                    # TODO ensure integer thresholds?
+                    # create MVTU (i.e. 
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index d1d61f0ed5..26cd0b74ad 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -1280,3 +1280,418 @@ def apply(self, model):
                 graph_modified = True
         return (model, graph_modified)
 
+
+class InferBinaryMatrixVectorActivation(Transformation):
+    """Convert XnorPopcountMatMul layers to
+    MatrixVectorActivation layers. Any immediately following MultiThreshold
+    layers will also be absorbed into the MVTU."""
+
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "XnorPopcountMatMul":
+                mm_input = n.input[0]
+                mm_weight = n.input[1]
+                mm_output = n.output[0]
+                mm_in_shape = model.get_tensor_shape(mm_input)
+                mm_out_shape = model.get_tensor_shape(mm_output)
+                assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], (
+                    n.name
+                    + """: First
+                input for xnorpopcount is not set to FINN DataType BINARY."""
+                )
+                assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], (
+                    n.name
+                    + """: Second
+                input (weights) for xnorpopcount is not set to FINN DataType BINARY."""
+                )
+                idt = DataType["BINARY"]
+                wdt = DataType["BINARY"]
+                mm_output = n.output[0]
+                W = model.get_initializer(mm_weight)
+                # extract weight shape, note that ONNX and finn-hlslib
+                # make different assumptions about dim order here
+                # ONNX assumes W has (in, out) shape
+                # finn-hlslib assumes W has (out, in) shape
+                mh = int(W.shape[1])
+                mw = int(W.shape[0])
+                # create node with no parallelization first
+                pe = 1
+                simd = 1
+                wmem = mw * mh // (pe * simd)
+                assert mw * mh == wmem * pe * simd, (
+                    n.name
+                    + """: Requirement (MW * MH) divisible by
+                (WMEM * PE * SIMD) is violated."""
+                )
+                # see if we have any following thresholds
+                consumer = model.find_consumer(mm_output)
+                if consumer is not None and consumer.op_type == "MultiThreshold":
+                    # TODO ensure integer thresholds?
+                    # create MVTU (i.e. including activation)
+                    mt_output = consumer.output[0]
+                    mt_out_shape = model.get_tensor_shape(mt_output)
+                    mt_thres = consumer.input[1]
+                    T = model.get_initializer(mt_thres)
+                    assert T.shape[0] == 1 or T.shape[0] == mh, (
+                        consumer.name
+                        + """: First dimension of
+                    thresholds neither 1 nor MH."""
+                    )
+                    odt = model.get_tensor_datatype(mt_output)
+                    if odt.bitwidth() == 1:
+                        # covers both bipolar and binary
+                        actval = 0
+                    else:
+                        actval = odt.min()
+                    model.set_tensor_shape(mm_input, mm_in_shape)
+                    model.set_tensor_shape(mt_output, mt_out_shape)
+                    # create and insert new MatrixVectorActivation node
+                    new_node = helper.make_node(
+                        "MatrixVectorActivation",
+                        [mm_input, mm_weight, mt_thres],
+                        [mt_output],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        MW=mw,
+                        MH=mh,
+                        SIMD=simd,
+                        PE=pe,
+                        inputDataType=idt.name,
+                        weightDataType=wdt.name,
+                        outputDataType=odt.name,
+                        ActVal=actval,
+                        binaryXnorMode=1,
+                        noActivation=0,
+                        numInputVectors=list(mm_in_shape[:-1]),
+                        mem_mode=self.mem_mode,
+                        name=n.name,
+                    )
+                    graph.node.insert(node_ind, new_node)
+                    # remove old nodes
+                    graph.node.remove(n)
+                    graph.node.remove(consumer)
+                    graph_modified = True
+                else:
+                    # no activation, matmul only
+                    odt = model.get_tensor_datatype(mm_output)
+                    model.set_tensor_shape(mm_input, mm_in_shape)
+                    model.set_tensor_shape(mm_output, mm_out_shape)
+                    # create and insert new MatrixVectorActivation node
+                    new_node = helper.make_node(
+                        "MatrixVectorActivation",
+                        [mm_input, mm_weight],
+                        [mm_output],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        MW=mw,
+                        MH=mh,
+                        SIMD=simd,
+                        PE=pe,
+                        inputDataType=idt.name,
+                        weightDataType=wdt.name,
+                        outputDataType=odt.name,
+                        ActVal=0,
+                        binaryXnorMode=1,
+                        noActivation=1,
+                        numInputVectors=list(mm_in_shape[:-1]),
+                        mem_mode=self.mem_mode,
+                        name=n.name,
+                    )
+                    graph.node.insert(node_ind, new_node)
+                    # remove old node
+                    graph.node.remove(n)
+                    graph_modified = True
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
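One detail of the transform above that is easy to miss: ActVal records the bias applied to the threshold count, i.e. the smallest value the thresholding unit can emit. A condensed restatement of that branch (a sketch over a qonnx-style DataType object):

    def choose_actval(odt):
        # 1-bit outputs (covers both BINARY and BIPOLAR) need no offset;
        # wider outputs start counting thresholds from the datatype minimum
        return 0 if odt.bitwidth() == 1 else odt.min()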
+
+class InferQuantizedMatrixVectorActivation(Transformation):
+    """Convert MatMul layers with quantized inputs and weights to
+    MatrixVectorActivation layers."""
+
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None:
+                mm_input = n.input[0]
+                mm_weight = n.input[1]
+                mm_output = n.output[0]
+                mm_in_shape = model.get_tensor_shape(mm_input)
+                mm_out_shape = model.get_tensor_shape(mm_output)
+                idt = model.get_tensor_datatype(mm_input)
+                wdt = model.get_tensor_datatype(mm_weight)
+                if idt.is_integer() and wdt.is_integer():
+                    mm_output = n.output[0]
+                    W = model.get_initializer(mm_weight)
+                    # extract weight shape, note that ONNX and finn-hlslib
+                    # make different assumptions about dim order here
+                    # ONNX assumes W has (in, out) shape
+                    # finn-hlslib assumes W has (out, in) shape
+                    mh = int(W.shape[1])
+                    mw = int(W.shape[0])
+                    # create node with no parallelization first
+                    pe = 1
+                    simd = 1
+                    wmem = mw * mh // (pe * simd)
+                    assert mw * mh == wmem * pe * simd, (
+                        n.name
+                        + """: Requirement (MW * MH) divisible by
+                    (WMEM * PE * SIMD) is violated."""
+                    )
+                    # see if we have any following thresholds
+                    consumer = model.find_consumer(mm_output)
+                    if consumer is not None and consumer.op_type == "MultiThreshold":
+                        # TODO ensure integer thresholds?
+                        # create MVTU (i.e. including activation)
+                        mt_output = consumer.output[0]
+                        mt_out_shape = model.get_tensor_shape(mt_output)
+                        mt_thres = consumer.input[1]
+                        T = model.get_initializer(mt_thres)
+                        assert T.shape[0] == 1 or T.shape[0] == mh, (
+                            consumer.name
+                            + """: First dimension of
+                        thresholds neither 1 nor MH."""
+                        )
+                        odt = model.get_tensor_datatype(mt_output)
+                        scale = getCustomOp(consumer).get_nodeattr("out_scale")
+                        actval = getCustomOp(consumer).get_nodeattr("out_bias")
+                        assert int(actval) == actval, (
+                            consumer.name + ": out_bias must be integer for HLS conversion."
+                        )
+                        actval = int(actval)
+                        odt_is_bipolar = odt == DataType["BIPOLAR"]
+                        bipolar_ok = odt_is_bipolar and (scale == 2.0) and (actval == -1)
+                        assert scale == 1.0 or bipolar_ok, (
+                            consumer.name + ": out_scale=1 or bipolar output needed for conversion."
+                        )
+                        assert (not odt.signed()) or (actval < 0), (
+                            consumer.name + ": Signed output requires actval < 0"
+                        )
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mt_output, mt_out_shape)
+                        if bipolar_ok:
+                            # remove bias for bipolar, since
+                            # binary->bipolar is achieved by reinterpretation
+                            actval = 0
+                        # create and insert new MatrixVectorActivation node
+                        new_node = helper.make_node(
+                            "MatrixVectorActivation",
+                            [mm_input, mm_weight, mt_thres],
+                            [mt_output],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            MW=mw,
+                            MH=mh,
+                            SIMD=simd,
+                            PE=pe,
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=actval,
+                            binaryXnorMode=0,
+                            noActivation=0,
+                            numInputVectors=list(mm_in_shape[:-1]),
+                            mem_mode=self.mem_mode,
+                            name="MatrixVectorActivation_" + n.name,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old nodes
+                        graph.node.remove(n)
+                        graph.node.remove(consumer)
+                        graph_modified = True
+                    else:
+                        # no activation, matmul only
+                        odt = model.get_tensor_datatype(mm_output)
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mm_output, mm_out_shape)
+                        # create and insert new MatrixVectorActivation node
+                        new_node = helper.make_node(
+                            "MatrixVectorActivation",
+                            [mm_input, mm_weight],
+                            [mm_output],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            MW=mw,
+                            MH=mh,
+                            SIMD=simd,
+                            PE=pe,
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=0,
+                            binaryXnorMode=0,
+                            noActivation=1,
+                            numInputVectors=list(mm_in_shape[:-1]),
+                            mem_mode=self.mem_mode,
+                            name="MatrixVectorActivation_" + n.name,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old node
+                        graph.node.remove(n)
+                        graph_modified = True
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
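Typical usage of the two MatMul conversions (a sketch: the input model path and the mem_mode choice are assumptions; the import path matches the file header above):

    from qonnx.core.modelwrapper import ModelWrapper
    import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw

    model = ModelWrapper("streamlined.onnx")  # hypothetical streamlined model
    model = model.transform(to_hw.InferBinaryMatrixVectorActivation(mem_mode="decoupled"))
    model = model.transform(to_hw.InferQuantizedMatrixVectorActivation(mem_mode="decoupled"))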
+
+class InferVectorVectorActivation(Transformation):
+    """Convert MatMul layers with quantized inputs and weights to
+    VectorVectorActivation layers, if the sparsity annotation
+    of the weight matrix indicates that the MatMul layer belongs to
+    a depthwise convolution. Any immediately following MultiThreshold
+    layers will also be absorbed into the VVAU."""
+
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is not None:
+                sparsity = model.get_tensor_sparsity(n.input[1])
+                try:
+                    k_h, k_w = sparsity["dw"]["kernel_shape"]
+                except KeyError:
+                    raise Exception(
+                        n.name
+                        + """: sparsity annotation doesn't indicate that MatMul
+                        belongs to a depthwise convolution."""
+                    )
+
+                mm_input = n.input[0]
+                mm_weight = n.input[1]
+                mm_output = n.output[0]
+                mm_in_shape = model.get_tensor_shape(mm_input)
+                mm_out_shape = model.get_tensor_shape(mm_output)
+                idt = model.get_tensor_datatype(mm_input)
+                wdt = model.get_tensor_datatype(mm_weight)
+                if idt.is_integer() and wdt.is_integer():
+                    mm_output = n.output[0]
+                    W = model.get_initializer(mm_weight)
+                    # infer dense weight tensor from sparse weight matrix, using
+                    # the kernel size (k_h, k_w) extracted above and the number
+                    # of channels.
+                    # the weight matrix has a shape of (k_h * k_w * Channels, Channels)
+                    # we need to reverse the creation of the sparse weight matrix
+                    # to achieve a weight tensor of shape (Channels, 1, k_h, k_w)
+                    channels = int(W.shape[1])
+                    # transpose to achieve a shape of (Channels, k_h * k_w * Channels)
+                    W = W.T
+                    # reshape to (Channels, k_h, k_w, Channels) to transpose afterwards
+                    # to (Channels, Channels, k_h, k_w)
+                    W = W.reshape(channels, k_h, k_w, channels)
+                    W = W.transpose(0, 3, 1, 2)
+                    # now we can extract the values using a for loop over the channels
+                    # and fill a zero numpy array in the correct shape
+                    w_tensor = np.zeros((channels, 1, k_h, k_w), dtype=np.float32)
+                    for ch in range(channels):
+                        w_tensor[ch][0] = W[ch][ch]
+                    model.set_initializer(mm_weight, w_tensor)
+                    model.set_tensor_shape(mm_weight, (channels, 1, k_h, k_w))
+                    # create node with pe=channels as default
+                    pe = channels
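The slicing above is easiest to validate by running the lowering forwards: in the sparse (k_h * k_w * Channels, Channels) matrix implied by the inversion code, channel ch only populates column ch, at rows idx * Channels + ch. A self-contained round-trip check (layout derived from the inversion code above, not quoted from FINN):

    import numpy as np

    channels, k_h, k_w = 3, 2, 2
    w_dw = np.random.rand(channels, 1, k_h, k_w).astype(np.float32)

    # forward lowering: depthwise kernel -> sparse matmul weight matrix
    W = np.zeros((k_h * k_w * channels, channels), dtype=np.float32)
    for ch in range(channels):
        W[ch::channels, ch] = w_dw[ch, 0].ravel()

    # inversion, exactly as in InferVectorVectorActivation above
    Wt = W.T.reshape(channels, k_h, k_w, channels).transpose(0, 3, 1, 2)
    w_back = np.stack([Wt[ch, ch] for ch in range(channels)])[:, None]
    assert np.allclose(w_back, w_dw)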
+                    # see if we have any following thresholds
+                    consumer = model.find_consumer(mm_output)
+                    if consumer is not None and consumer.op_type == "MultiThreshold":
+                        # create VVAU (i.e. including activation)
+                        mt_output = consumer.output[0]
+                        mt_out_shape = model.get_tensor_shape(mt_output)
+                        mt_thres = consumer.input[1]
+                        T = model.get_initializer(mt_thres)
+                        assert T.shape[0] == 1 or T.shape[0] == channels, (
+                            consumer.name
+                            + """: First dimension of
+                        thresholds neither 1 nor Channels."""
+                        )
+                        odt = model.get_tensor_datatype(mt_output)
+                        scale = getCustomOp(consumer).get_nodeattr("out_scale")
+                        assert scale == 1.0, (
+                            consumer.name + ": out_scale must be equal to 1.0 for HLS conversion."
+                        )
+                        actval = getCustomOp(consumer).get_nodeattr("out_bias")
+                        assert int(actval) == actval, (
+                            consumer.name + ": out_bias must be integer for HLS conversion."
+                        )
+                        actval = int(actval)
+                        assert (not odt.signed()) or (actval < 0), (
+                            consumer.name + ": Signed output requires actval < 0"
+                        )
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mt_output, mt_out_shape)
+                        # create and insert new VectorVectorActivation node
+                        new_node = helper.make_node(
+                            "VectorVectorActivation",
+                            [mm_input, mm_weight, mt_thres],
+                            [mt_output],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            resType="lut",
+                            PE=pe,
+                            Dim=[mm_in_shape[1], mm_in_shape[2]],
+                            Channels=channels,
+                            Kernel=[k_h, k_w],
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=actval,
+                            noActivation=0,
+                            name="VectorVectorActivation_" + n.name,
+                            mem_mode=self.mem_mode,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old nodes
+                        graph.node.remove(n)
+                        graph.node.remove(consumer)
+                        graph_modified = True
+                    else:
+                        # no activation, matmul only
+                        odt = model.get_tensor_datatype(mm_output)
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mm_output, mm_out_shape)
+                        # create and insert new VVAU node
+                        new_node = helper.make_node(
+                            "VectorVectorActivation",
+                            [mm_input, mm_weight],
+                            [mm_output],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            resType="lut",
+                            PE=pe,
+                            Dim=[mm_in_shape[1], mm_in_shape[2]],
+                            Channels=channels,
+                            Kernel=[k_h, k_w],
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=0,
+                            noActivation=1,
+                            name="VectorVectorActivation_" + n.name,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old node
+                        graph.node.remove(n)
+                        graph_modified = True
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
\ No newline at end of file
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 1a182c7f4f..81c5848d57 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -48,12 +48,13 @@ def is_external_input(model, node, i):
     # True only if input is unconnected and has no initializer
     # Only exception is second input of FC layers when mem_mode is external
     node_inst = getCustomOp(node)
+    op_type = node_inst.base_op_type()
     producer = model.find_producer(node.input[i])
     if producer is None:
         if model.get_initializer(node.input[i]) is None:
             return True
         else:
-            if node.op_type == "MatrixVectorActivation":
+            if op_type == "MatrixVectorActivation":
                 if node_inst.get_nodeattr("mem_mode") == "external":
                     return True
     return False
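From here on, nearly every hunk swaps a node.op_type string comparison for node_inst.base_op_type(), so the checks keep matching once layers are specialized into _hls or _rtl variants. The helper itself is defined outside this diff; a minimal sketch of the idea, assuming the <Base>_hls / <Base>_rtl naming used elsewhere in this PR:

    class HWCustomOpSketch:
        # hypothetical stand-in for the FINN custom-op base class
        def __init__(self, onnx_node):
            self.onnx_node = onnx_node

        def base_op_type(self):
            # strip a backend suffix: MatrixVectorActivation_hls,
            # MatrixVectorActivation_rtl and the plain hw-level op
            # all report "MatrixVectorActivation"
            op_type = self.onnx_node.op_type
            for suffix in ("_hls", "_rtl"):
                if op_type.endswith(suffix):
                    return op_type[: -len(suffix)]
            return op_type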
diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py
index ceb2bdb5c9..56e644f2b8 100644
--- a/src/finn/transformation/fpgadataflow/floorplan.py
+++ b/src/finn/transformation/fpgadataflow/floorplan.py
@@ -150,7 +150,7 @@ def apply(self, model):
                 continue
 
             elif not (
-                node.op_type == "MatrixVectorActivation"
+                node_inst.base_op_type() == "MatrixVectorActivation"
                 and node_inst.get_nodeattr("mem_mode") is not None
                 and node_inst.get_nodeattr("mem_mode") == "external"
             ):
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 81cee8dae4..d0029cb630 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -88,7 +88,7 @@ def apply(self, model):
                         # - if FC and external mem, it could be connected to input 1
                         # - if concat, could be connected to any input
                         if (
-                            consumer.op_type == "MatrixVectorActivation"
+                            n1.base_op_type() == "MatrixVectorActivation"
                             and n1.get_nodeattr("mem_mode") == "external"
                         ) or (consumer.op_type == "StreamingConcat"):
                             # get input idx
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index 93e3226b2a..fd546459fa 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -199,7 +199,7 @@ def apply(self, model):
         # attached IODMA
         fc_extw_nodes = list(
             filter(
-                lambda x: x.op_type in ["MatrixVectorActivation", "VectorVectorActivation"]
+                lambda x: getCustomOp(x).base_op_type() in ["MatrixVectorActivation", "VectorVectorActivation"]
                 and getCustomOp(x).get_nodeattr("mem_mode") == "external"
                 and model.find_producer(x.input[1]) is None,
                 all_nodes,
diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
index 157df46d71..ab5142e4d8 100644
--- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
+++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
@@ -103,7 +103,7 @@ def apply(self, model):
             # the input is in the list of graph inputs because it has an
             # initializer (TODO: fix this with a clean-up transform)
             if (
-                first_node.op_type == "MatrixVectorActivation"
+                getCustomOp(first_node).base_op_type() == "MatrixVectorActivation"
                 and get_by_name(first_node.attribute, "mem_mode").s.decode("UTF-8")
                 != "external"
             ):
@@ -117,7 +117,7 @@ def apply(self, model):
             num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1])
             inp_idx = list(first_node.input).index(graph_in_name)
             if inp_idx > 0:
-                if first_node.op_type == "MatrixVectorActivation" and inp_idx == 1:
+                if getCustomOp(first_node).base_op_type() == "MatrixVectorActivation" and inp_idx == 1:
                     stream_width = int(custom_op.get_weightstream_width())
                 elif first_node.op_type == "AddStreams_Batch" and inp_idx == 1:
                     stream_width = int(custom_op.get_instream_width())
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index d5c2d8f2b5..e66236bf39 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -282,7 +282,7 @@ def apply(self, model):
     dataflow_model = ModelWrapper(dataflow_model_filename)
     rt_layer_ind = 0
     for node in dataflow_model.graph.node:
-        if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]:
+        if getCustomOp(node).base_op_type() == "MatrixVectorActivation" or node.op_type == "Thresholding_Batch":
             node_inst = getCustomOp(node)
             is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights")
             if is_rt_weights == 1:
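The long conditions above and below mix two dispatch styles: base_op_type() for the MVAU and plain op_type for the not-yet-migrated Thresholding_Batch. A hypothetical helper that would keep those call sites short (a sketch, not part of the diff):

    from qonnx.custom_op.registry import getCustomOp

    def is_runtime_weight_candidate(node):
        # MVAU in any specialization, plus the legacy Thresholding_Batch op
        return (
            getCustomOp(node).base_op_type() == "MatrixVectorActivation"
            or node.op_type == "Thresholding_Batch"
        )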
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index 989eb62a88..193e6e8b42 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -62,7 +62,7 @@ def collect_ip_dirs(model, ipstitch_path):
         ), """The directory that should contain the generated ip blocks
         doesn't exist."""
         ip_dirs += [ip_dir_value]
-        if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]:
+        if getCustomOp(node).base_op_type() == "MatrixVectorActivation" or node.op_type == "Thresholding_Batch":
             if node_inst.get_nodeattr("mem_mode") == "decoupled":
                 need_memstreamer = True
     ip_dirs += [ipstitch_path + "/ip"]
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 11ffc965b6..84a8084832 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -174,7 +174,7 @@ def apply(self, model):
                 continue
             if fifo_cons is None:
                 continue
-            if fifo_cons.op_type != "MatrixVectorActivation":
+            if getCustomOp(fifo_cons).base_op_type() != "MatrixVectorActivation":
                 continue
             op_inst = getCustomOp(node)
             depth = op_inst.get_nodeattr("depth")
@@ -281,7 +281,7 @@ def apply(self, model):
                 node.set_nodeattr("inFIFODepths", ifd)
                 node.set_nodeattr("outFIFODepths", ofd)
 
-                if node.onnx_node.op_type in extw_optypes:
+                if node.base_op_type() in extw_optypes:
                     mmode = node.get_nodeattr("mem_mode")
                     if mmode == "external":
                         modified_fc_nodes.append(node.onnx_node.name)
@@ -422,7 +422,7 @@ def apply(self, model):
             # (removed setting of node FIFO size attributes to 0 here)
             # for every extw node we changed from external to decoupled,
             # change back and reset implementation
-            if node.op_type in extw_optypes:
+            if getCustomOp(node).base_op_type() in extw_optypes:
                if node.name in modified_fc_nodes:
                     node_inst = getCustomOp(node)
                     node_inst.set_nodeattr("mem_mode", "external")
diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index 4045a28e16..7b65023abc 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -125,7 +125,7 @@ def apply(self, model):
                 continue
             op_type = node.op_type
             node_inst = getCustomOp(node)
-            if op_type == "MatrixVectorActivation":
+            if node_inst.base_op_type() == "MatrixVectorActivation":
                 max_simd = node_inst.get_nodeattr("MW")
                 max_pe = node_inst.get_nodeattr("MH")
                 node_inst.set_nodeattr("PE", 1)
diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py
index b80ef76a19..bd283855e3 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py
@@ -52,6 +52,9 @@
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames, GiveReadableTensorNames
+from qonnx.transformation.infer_shapes import InferShapes
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 
 
 def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None):
@@ -135,6 +138,87 @@ def prepare_inputs(input_tensor, idt, wdt):
     return {"inp": input_tensor}
 
 
+# activation: None or DataType
+@pytest.mark.parametrize("act", [None, DataType["BIPOLAR"], DataType["INT4"]])
+# weight datatype
+@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["INT4"]])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT4"]])
+# neuron folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [-1, 2, 1])
+# synapse folding, -1 is maximum possible
+@pytest.mark.parametrize("sf", [-1, 2, 1])
+# HLS matrix width (input features)
+@pytest.mark.parametrize("mw", [16])
+# HLS matrix height (output features)
+@pytest.mark.parametrize("mh", [16])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_fclayer_hwop(idt, wdt, act, nf, sf, mw, mh):
+    if nf == -1:
+        nf = mh
+    if sf == -1:
+        sf = mw
+    pe = mh // nf
+    simd = mw // sf
+    assert mh % pe == 0
+    assert mw % sf == 0
+    # generate weights
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+    # generate input data
+    x = gen_finn_dt_tensor(idt, (1, mw))
+    if act is None:
+        # no activation, produce accumulators
+        T = None
+        tdt = None
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            odt = DataType["UINT32"]
+        else:
+            odt = DataType["INT32"]
+    else:
+        odt = act
+        (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw)
+        n_steps = act.get_num_possible_values() - 1
+        T = np.random.randint(min, max - 1, (mh, n_steps)).astype(np.float32)
+        # provide non-decreasing thresholds
+        T = np.sort(T, axis=1)
+        # generate thresholds for activation
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            tdt = DataType["UINT32"]
+            # bias thresholds to be positive
+            T = np.ceil((T + mw) / 2)
+            assert (T >= 0).all()
+        else:
+            tdt = DataType["INT32"]
+    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
+    # prepare input data
+    input_dict = prepare_inputs(x, idt, wdt)
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+        # convert inputs to binary and use xnorpopcountmatmul
+        y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2)
+    else:
+        y = np.matmul(x, W)
+    if T is not None:
+        # y = multithreshold(y, T)
+        if act == DataType["BIPOLAR"]:
+            # binary to bipolar
+            # y = 2 * y - 1
+            y = multithreshold(y, T, 2, -1)
+        else:
+            # signed offset
+            # y += act.min()
+            y = multithreshold(y, T, 1, act.min())
+    oshape = model.get_tensor_shape("outp")
+    y_expected = y.reshape(oshape)
+    # execute model
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+
+    y_produced = y_produced.reshape(y_expected.shape)
+
+    assert (y_produced == y_expected).all(), "hw-op execution failed"
+
+
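Why T = np.ceil((T + mw) / 2) in the bipolar/bipolar path of the test above: with p the popcount accumulator over mw inputs, the bipolar accumulator is a = 2p - mw, so

    a >= t  <=>  p >= (t + mw) / 2  <=>  p >= ceil((t + mw) / 2)

where the last step uses that p is an integer. Since thresholds are drawn from the signed dot-product range, t + mw >= 0, which is exactly what the assert (T >= 0).all() and the UINT32 threshold datatype rely on.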
["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, backend): + if backend == "rtl" and act is not None: + pytest.skip("RTL MVU doesn't support embedded thresholding functionality.") if nf == -1: nf = mh if sf == -1: @@ -283,6 +375,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("preferred_impl_style", backend) # prepare input data input_dict = prepare_inputs(x, idt, wdt) @@ -303,6 +396,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... + model = model.transform(SpecializeLayers()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -312,7 +406,10 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "MatrixVectorActivation_0" in hls_synt_res_est + if backend == "hls": + assert "MatrixVectorActivation_hls_0" in hls_synt_res_est + else: + assert "MatrixVectorActivation_rtl_0" in hls_synt_res_est node = model.get_nodes_by_op_type("MatrixVectorActivation")[0] inst = getCustomOp(node) @@ -339,10 +436,12 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.parametrize("mw", [128]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [128]) +# Backend +@pytest.mark.parametrize("backend", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( - mem_mode, idt, wdt, act, nf, sf, mw, mh + mem_mode, idt, wdt, act, nf, sf, mw, mh, backend ): if nf == -1: nf = mh @@ -404,6 +503,7 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... 
+    model = model.transform(SpecializeLayers())
     model = model.transform(SetExecMode("rtlsim"))
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(PrepareIP("xc7z020clg400-1", 5))
@@ -413,7 +513,10 @@
     assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed"
 
     hls_synt_res_est = model.analysis(hls_synth_res_estimation)
-    assert "MatrixVectorActivation_0" in hls_synt_res_est
+    if backend == "hls":
+        assert "MatrixVectorActivation_hls_0" in hls_synt_res_est
+    else:
+        assert "MatrixVectorActivation_rtl_0" in hls_synt_res_est
 
     node = model.get_nodes_by_op_type("MatrixVectorActivation")[0]
     inst = getCustomOp(node)
@@ -440,9 +543,11 @@
 @pytest.mark.parametrize("mw", [32])
 # HLS matrix height (output features)
 @pytest.mark.parametrize("mh", [32])
+# Backend
+@pytest.mark.parametrize("backend", ["rtl", "hls"])
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
-def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
+def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, backend):
     if nf == -1:
         nf = mh
     if sf == -1:
@@ -469,6 +574,7 @@ def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh
     inst.set_nodeattr("mem_mode", mem_mode)
     total_fold = nf * sf
     exp_total_cycles = total_fold + 10
+    model = model.transform(SpecializeLayers())
     model = model.transform(SetExecMode("rtlsim"))
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(PrepareIP("xc7z020clg400-1", 5))
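Taken together, the test updates encode the new two-step flow: graphs are first built from backend-neutral hw ops, then SpecializeLayers picks an _hls or _rtl implementation from each node's preferred_impl_style before simulation or IP generation. A condensed sketch of that preparation (part string and clock period copied from the tests above; the sketch assumes an all-fpgadataflow graph, since preferred_impl_style only exists on those nodes):

    from qonnx.custom_op.registry import getCustomOp
    from qonnx.transformation.general import GiveUniqueNodeNames
    from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
    from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
    from finn.transformation.fpgadataflow.prepare_ip import PrepareIP

    def specialize_and_prepare(model, backend="hls"):
        # select an implementation style for every hw-level node, then specialize
        for node in model.graph.node:
            getCustomOp(node).set_nodeattr("preferred_impl_style", backend)
        model = model.transform(SpecializeLayers())
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
        return model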