Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update accfg benchmarks #300

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/run-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ jobs:
path: benchmarks/${{ matrix.kernel }}/output
strategy:
matrix:
kernel: [dense_matmul]
kernel: [dense_matmul, tiled_matmul]
15 changes: 3 additions & 12 deletions benchmarks/tiled_matmul/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

.DEFAULT_GOAL := all

include ../../runtime/snax-streamer-gemm.rules
include ../../runtime/snax-gemmx.rules
include ../../runtime/Makefile.rules

TESTS += generated.x
Expand All @@ -26,13 +26,7 @@ ifdef ACCFG_BOTH
ACCFGOPT=accfg-dedup,accfg-config-overlap,
endif

SNAXOPTFLAGS = -p insert-accfg-op{accelerator=snax_gemm},convert-linalg-to-kernel,dispatch-kernels,set-memory-space,set-memory-layout,realize-memref-casts,${REMOVE_MEMREF_COPY}insert-sync-barrier,reuse-memref-allocs,test-add-mcycle-around-loop,snax-lower-mcycle,dispatch-regions,convert-linalg-to-stream,convert-stream-to-snax-stream,convert-linalg-to-accfg,snax-copy-to-dma,memref-to-snax,snax-to-func,clear-memory-space,function-constant-pinning,mlir-opt{executable=mlir-opt\ generic=true\ arguments="-cse,-canonicalize,-allow-unregistered-dialect,-mlir-print-op-generic"},${ACCFGOPT}convert-accfg-to-csr,


GEN_DATA_OPTS += --m=${SIZE_M}
GEN_DATA_OPTS += --n=${SIZE_N}
GEN_DATA_OPTS += --k=${SIZE_K}

SNAXOPTFLAGS = -p convert-linalg-to-kernel,insert-accfg-op{accelerator=snax_gemmx},dispatch-kernels,convert-linalg-to-stream,fuse-streaming-regions,snax-bufferize,alloc-to-global,set-memory-space,set-memory-layout,realize-memref-casts,insert-sync-barrier,dispatch-regions{nb_cores=2},convert-stream-to-snax-stream,convert-linalg-to-accfg,snax-copy-to-dma,memref-to-snax,snax-to-func,clear-memory-space,function-constant-pinning,mlir-opt{executable=mlir-opt\ generic=true\ arguments="-cse,-canonicalize,-allow-unregistered-dialect,-mlir-print-op-generic"},${ACCFGOPT}convert-accfg-to-csr,

CFLAGS += -std=gnu11
CFLAGS += -Wall -Wextra
Expand All @@ -42,10 +36,7 @@ ifdef NO_CHECK
CFLAGS += -DNO_CHECK
endif

data.c data.h:
$(PYTHON) gendata.py ${GEN_DATA_OPTS}

%.x: %.o main.o data.o
%.x: %.o main.o
$(LD) $(LDFLAGS) $^ -o $@

sim_%: %
Expand Down
124 changes: 71 additions & 53 deletions benchmarks/tiled_matmul/genbenchmark.py
Original file line number Diff line number Diff line change
@@ -1,79 +1,97 @@
import pathlib
from io import StringIO

from xdsl.builder import ImplicitBuilder
from xdsl.dialects import arith, builtin, func, linalg, transform
from xdsl.dialects.builtin import i8
from xdsl.ir import Block, Region
import numpy as np
from xdsl.builder import Builder
from xdsl.dialects import builtin, transform
from xdsl.dialects.arith import ConstantOp
from xdsl.dialects.builtin import (
DenseIntOrFPElementsAttr,
ModuleOp,
TensorType,
UnitAttr,
i8,
i32,
)
from xdsl.dialects.func import FuncOp, ReturnOp
from xdsl.dialects.linalg import QuantizedMatmulOp
from xdsl.dialects.tensor import EmptyOp
from xdsl.parser import DenseArrayBase, IntegerType
from xdsl.printer import Printer

from util.snax_benchmark import SNAXBenchmark


def create_tiled_matrix_multiply(k, m, n, tiling_factors):
"""
Generate IR in the form of:
```
builtin.module {
func.func @streamer_matmul(%arg0 : memref<16x16xi8>, %arg1 : memref<16x16xi8,
strided<[1, 16]>>, %arg2 : memref<16x16xi32>) {
%0 = arith.constant 0 : i32
linalg.quantized_matmul ins(%arg0, %arg1, %0, %0 : memref<16x16xi8>,
memref<16x16xi8, strided<[1, 16]>>, i32, i32)
outs(%arg2 : memref<16x16xi32>)
func.return
}
"transform.sequence"() <{"failure_propagation_mode" = 1 : i32,
"operandSegmentSizes" = array<i32: 0, 0>}> ({
^0(%arg0 : !transform.any_op, %arg1 : !transform.op<"linalg.quantized_matmul">):
"transform.yield"() : () -> ()
}) : () -> ()
}
```
"""

def get_2d_memref_type(typ, dim_one, dim_two, transpose=False):
layout = (
builtin.StridedLayoutAttr([1, dim_one]) if transpose else builtin.NoneAttr()
)
return builtin.MemRefType(typ, [dim_one, dim_two], layout=layout)
# Define Variables For Program:

input_types = [
get_2d_memref_type(i8, k, m),
get_2d_memref_type(i8, m, n, transpose=True),
get_2d_memref_type(builtin.i32, k, n),
]
a_type = TensorType(i8, (m, k))
a_vals = np.random.randint(-127, 128, (m, k))

b_type = TensorType(i8, (k, n))
b_vals = np.random.randint(-127, 128, (k, n))

output_type = TensorType(i32, (m, n))
golden_vals = a_vals @ b_vals

res_types = [output_type] * 2

# Define Program:
@Builder.implicit_region([])
def func_body(_) -> None:
# Declare constants
a = ConstantOp(
DenseIntOrFPElementsAttr.from_list(a_type, a_vals.flatten().tolist())
)
b = ConstantOp(
DenseIntOrFPElementsAttr.from_list(b_type, b_vals.flatten().tolist())
)
golden = ConstantOp(
DenseIntOrFPElementsAttr.from_list(
output_type, golden_vals.flatten().tolist()
)
)

b = Block(arg_types=(input_types))
c0 = ConstantOp.from_int_and_width(0, 32)

with ImplicitBuilder(b) as (arg0, arg1, arg2):
c0 = arith.ConstantOp.from_int_and_width(0, 32)
linalg.QuantizedMatmulOp([arg0, arg1, c0.result, c0.result], [arg2])
func.ReturnOp()
# Declare result tensor type
empty_tensor = EmptyOp([], output_type)

region = Region(b)
# Specify the operation
result = QuantizedMatmulOp(
(a.result, b.result, c0.result, c0.result), empty_tensor.results
)

function = func.FuncOp.from_region("streamer_matmul", input_types, [], region)
# Return both the computed result and the golden output
ReturnOp(result, golden)

failurePropagationMode = builtin.IntegerAttr(1, builtin.IntegerType(32))
function = FuncOp.from_region("snax_main", [], res_types, func_body)

input_types_t = [
    # Manually specify tiling sequence
transform_inputs = [
transform.AnyOpType(),
transform.OperationType("linalg.quantized_matmul"),
]
b_t = Block(arg_types=input_types_t)

with ImplicitBuilder(b_t) as (arg0, arg1):
(transform.TileOp(arg1, [], tiling_factors, scalable_sizes=tiling_factors))
transform.YieldOp()

region_t = Region(b_t)
@Builder.implicit_region(transform_inputs)
def tiling_sequence(args):
transform.TileOp(
target=args[1],
dynamic_sizes=[],
scalable_sizes=DenseArrayBase.create_dense_int(IntegerType(1), [0, 0]),
static_sizes=DenseArrayBase.create_dense_int(IntegerType(64), [8, 8]),
)

transform_sequence = transform.SequenceOp(failurePropagationMode, [], [], region_t)
transform.YieldOp()

module = builtin.ModuleOp([function, transform_sequence])
function_type = builtin.FunctionType.from_lists(transform_inputs, [])
transform_sequence = transform.NamedSequenceOp(
"__transform_main", function_type, tiling_sequence
)

return module
return ModuleOp(
[function, transform_sequence], {"transform.with_named_sequence": UnitAttr()}
)


def write_module_to_file(module, file):
Expand Down
186 changes: 46 additions & 140 deletions benchmarks/tiled_matmul/main.c
Original file line number Diff line number Diff line change
@@ -1,151 +1,57 @@
#include "stdint.h"

#include "data.h"
#include "memref.h"
#include "snax_rt.h"

/*
* These libraries are included from github.com/KULeuven-MICAS/snitch_cluster
* Interested users, might want to look at:
*
* /sw/snRuntime/api
* /target/snitch_cluster/sw/runtime/rtl/src
* /target/snitch_cluster/sw/runtime/common
* */
#include "stdint.h"
#include <snrt.h>

/* These libraries are included from github.com/KULeuven-MICAS/snitch_cluster
* Interested users, might want to look at:
*
* /target/snitch_cluster/sw/snax/streamer-gemm/include"
* /target/snitch_cluster/sw/snax/mac/include"
*
* */
#include "snax-streamer-gemm-lib.h"

#define tileSize 8
#define meshRow 8
#define meshCol 8

uint8_t Batch = 1;

/* M_param and N_param can be set to 1 for tiled versions, but not for the
simple version; 2 always works. However, it will impact performance
significantly, as the computation cost doubles. For benchmarks, set to 1. */
uint8_t M_param = 2;
uint8_t K_param = K_size / tileSize;
uint8_t N_param = 2;

// Extracted from datagen.py in snitch_cluster repo
uint32_t strideInnermostA = 256;
uint32_t strideInnermostB = 256;
uint32_t strideInnermostC = 256;
uint32_t ldA = 512;
uint32_t ldB = 512;
uint32_t ldC = 512;
uint32_t strideA = 0;
uint32_t strideB = 0;
uint32_t strideC = 0;

// Kernel provided via external definition
void _mlir_ciface_streamer_matmul(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b,
TwoDMemrefI32_t *c);

void _mlir_ciface_snax_gemm(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, int32_t zpa,
int32_t zpb, TwoDMemrefI32_t *c) {
{
printf("Executing snax_gemm with a=%p, b=%p, c=%p \n", a->aligned_data,
b->aligned_data, c->aligned_data);
int local_delta_a = (int)a->aligned_data - (int)snrt_l1_next();
int local_delta_b = (int)b->aligned_data - (int)snrt_l1_next();
int local_delta_c = (int)c->aligned_data - (int)snrt_l1_next();

set_streamer_csr(K_param, N_param, M_param, strideInnermostA, ldA, 8,
strideInnermostB, ldB, 8, strideInnermostC, ldC, 32,
local_delta_a, local_delta_b, local_delta_c);
set_streamer_start();
set_block_gemm_csr(K_param, N_param, M_param, 0);

snrt_mcycle();

set_block_gemm_start();

printf("Waiting for snax_gemm\n");

wait_streamer_gemm();

snrt_mcycle();

printf("Finished executing snax_gemm\n");
}
}
void _mlir_ciface_snax_main(TwoDMemrefI32_t *results);

int main() {
{

// Create memref objects for data stored in L3
TwoDMemrefI8_t memrefA;
memrefA.data = &A;
memrefA.aligned_data = memrefA.data;
// Shape and Stride need to be defined for dynamic case
memrefA.shape[0] = N_size;
memrefA.shape[1] = K_size;
memrefA.stride[0] = K_size;
memrefA.stride[1] = 1;
memrefA.offset = 0;

TwoDMemrefI8_t memrefB;
memrefB.data = &B;
memrefB.aligned_data = memrefB.data;
// Shape and Stride need to be defined for dynamic case
memrefB.shape[0] = K_size;
memrefB.shape[1] = M_size;
memrefB.stride[0] = 1;
memrefB.stride[1] = K_size;
memrefB.offset = 0;
printf("M_size: %d, K_size: %d, N_size: %d\n", M_size, K_size, N_size);

TwoDMemrefI32_t memrefC;
memrefC.data = &C;
memrefC.aligned_data = memrefC.data;
// Shape and Stride need to be defined for dynamic case
memrefC.shape[0] = N_size;
memrefC.shape[1] = M_size;
memrefC.stride[0] = M_size;
memrefC.stride[1] = 1;
memrefC.offset = 0;

_mlir_ciface_streamer_matmul(&memrefA, &memrefB, &memrefC);

snrt_cluster_hw_barrier();

// Correctness check -
// from this point on only core 0 is required to be alive.
int thiscore = snrt_cluster_core_idx();
if (thiscore != 0)
return 0;

#ifdef NO_CHECK
  // No correctness check -
  // always finish as if nothing happened

TwoDMemrefI32_t results[2];

TwoDMemrefI32_t *golden, *computed;

golden = &results[0];
computed = &results[1];

// allocate zero row in tcdm
snrt_l1alloc(256);

(void)snrt_mcycle();
snrt_cluster_hw_barrier();

_mlir_ciface_snax_main(results);

snrt_cluster_hw_barrier();
(void)snrt_mcycle();

// Correctness check
// from this point on only core 0 is required to be alive.
int thiscore = snrt_cluster_core_idx();
if (thiscore != 0)
return 0;
#endif
int nerr = 0;

for (int i = 0; i < M_size * N_size; i++) {
{
int32_t error = memrefC.aligned_data[i] - C_golden[i];
// printf("%d) %d -> %d\n", i, (int32_t)memrefC.aligned_data[i],
// (int32_t)C_golden[i]);
if (error != 0)
nerr += 1;
}
}

// insert mcycle to show fault in trace
if (nerr != 0)
snrt_mcycle();
int total_results = 1;
for (int i = 0; i < 2; i++)
total_results *= computed->shape[i];

printf("Checking %d results...\n", total_results);

int nerr = 0;

return nerr;
for (int i = 0; i < total_results; i++) {

if (golden->aligned_data[i] != computed->aligned_data[i]) {
// printf("(%d) %d -> %d\n", i, golden->aligned_data[i],
// computed->aligned_data[i]);
nerr++;
}
}

printf("Finished, nb errors: %d\n", nerr);

if (nerr > 0)
return 1;
else
return 0;
}
Loading