Skip to content

Commit

Permalink
add conv kernel
Browse files Browse the repository at this point in the history
squash add conv kernel
  • Loading branch information
jorendumoulin committed Oct 23, 2024
1 parent 34f4224 commit 074c582
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 2 deletions.
2 changes: 0 additions & 2 deletions benchmarks/roofline/genbenchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import json
import pathlib
from io import StringIO
from pprint import pprint

import pandas as pd
from xdsl.builder import Builder, ImplicitBuilder
Expand Down Expand Up @@ -185,4 +184,3 @@ def generate_dense_benchmark(m, n, k, add_c) -> SNAXBenchmark:

with open("output/index.md", "w") as file:
file.write(markdown_table)

43 changes: 43 additions & 0 deletions kernels/conv/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Courtesy of Federico Ficarelli

.DEFAULT_GOAL := all

include ../../runtime/snax-gemmx.rules
include ../../runtime/Makefile.rules

# Test binaries built by `all`; append further kernels with `TESTS += name.x`.
TESTS =
TESTS += conv.x

# Options for the preprocessing step below: rewrite named linalg ops as
# linalg.generic and emit the module in generic form with locally scoped
# aliases.
MLIRPREPROCFLAGS = --linalg-generalize-named-ops \
                   --mlir-print-op-generic \
                   --mlir-print-local-scope

# Preprocess a kernel's .mlir source with the tool named by $(MLIROPT)
# (presumably defined by the included rules files -- confirm).
%.preprocfinal.mlir: %.mlir
	$(MLIROPT) $(MLIRPREPROCFLAGS) -o $@ $<


# Comma-separated pass pipeline passed via `-p` to whichever tool consumes
# SNAXOPTFLAGS (presumably snax-opt, invoked from the included rules files --
# confirm). The pipeline names the snax_gemmx accelerator and dispatches
# regions with nb_cores=3. Pass order is significant; edit with care.
SNAXOPTFLAGS = -p convert-linalg-to-kernel,insert-accfg-op{accelerator=snax_gemmx},dispatch-kernels,convert-linalg-to-stream,fuse-streaming-regions,stream-bufferize,snax-bufferize,alloc-to-global,set-memory-space,set-memory-layout,realize-memref-casts,insert-sync-barrier,dispatch-regions{nb_cores=3},convert-stream-to-snax-stream,convert-linalg-to-accfg,convert-accfg-to-csr,snax-copy-to-dma,memref-to-snax,snax-to-func,clear-memory-space

# C dialect and warning level for the C sources compiled in this directory.
CFLAGS += -std=gnu11
CFLAGS += -Wall -Wextra

# Generate the test vectors. A single run of gendata.py writes both data.h and
# data.c (header first, then the data file), so only data.h carries the real
# recipe. Listing gendata.py as a prerequisite regenerates the data whenever
# the script changes.
data.h: gendata.py
	$(PYTHON) gendata.py

# data.c is chained onto data.h instead of sharing a multi-target rule, which
# make treats as two independent rules and may therefore run twice under
# `make -j`. The recipe only re-runs the generator if data.c went missing.
data.c: data.h
	@test -f $@ || $(PYTHON) gendata.py

# main.c includes data.h, so the header must exist before main.o is compiled.
# Order-only, so the compile rule's automatic variables are left untouched.
main.o: | data.h

# Link a kernel object with the shared driver and the generated data.
%.x: %.o main.o data.o
	$(LD) $(LDFLAGS) $^ -o $@

# Run one test binary with $(VLTSIM). The shared ./logs/ directory is wiped
# first so it only ever holds the output of the current run. Because that
# directory is shared, simulations of different tests must not run in parallel.
sim_%: %
	rm -fr ./logs/
	$(VLTSIM) $<

# run_<test>: simulate <test>, then keep its logs as <test>.logs.
# These are commands, not files, hence .PHONY.
RUN = $(addprefix run_, $(TESTS))
.PHONY: $(RUN)
$(RUN): run_%: sim_%
	# Drop logs kept from an earlier run: with the directory still present,
	# `mv` would move ./logs *inside* it (and fail outright on the run after).
	rm -fr $*.logs
	mv logs $*.logs

# None of these targets name a file; declare them phony so a stray file called
# `all`, `allrun` or `clean` can never make them appear up to date.
.PHONY: all allrun clean

# Build every test binary.
all: $(TESTS)

# Build and simulate every test binary.
allrun: $(RUN)

# Remove build products, simulation logs and the generated data files.
clean:
	rm -fr *.ll12 *.x *.o *.logs/ logs/ data.h data.c
6 changes: 6 additions & 0 deletions kernels/conv/conv.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Quantized 2-D convolution: i8 input (NHWC, 1x18x18x16) with an i8 filter
// (FHWC, 16x3x3x16), producing an i32 result (1x16x16x16).
func.func @conv(%arg0 : tensor<1x18x18x16xi8>, %arg1 : tensor<16x3x3x16xi8>) -> tensor<1x16x16x16xi32> {
  // The same constant 0 serves as both the input and the filter zero point.
  %zero_point = arith.constant 0 : i32
  // Destination tensor handed to the convolution's `outs` operand.
  %dest = tensor.empty() : tensor<1x16x16x16xi32>
  %result = linalg.conv_2d_nhwc_fhwc_q
      ins(%arg0, %arg1, %zero_point, %zero_point : tensor<1x18x18x16xi8>, tensor<16x3x3x16xi8>, i32, i32)
      outs(%dest : tensor<1x16x16x16xi32>) -> tensor<1x16x16x16xi32>
  func.return %result : tensor<1x16x16x16xi32>
}
34 changes: 34 additions & 0 deletions kernels/conv/gendata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Simple script to generate the input and output data for the conv kernel

import numpy as np

from util.gendata import create_data, create_header

def conv2d_nhwc_fhwc(inputs, weights):
    """Reference 2-D convolution with unit stride, no padding and no dilation.

    Args:
        inputs: integer array laid out as (N, H, W, C).
        weights: integer array laid out as (F, KH, KW, C).

    Returns:
        An int32 array of shape (N, H - KH + 1, W - KW + 1, F) where
        out[n, y, x, f] = sum over (kh, kw, c) of
        inputs[n, y + kh, x + kw, c] * weights[f, kh, kw, c].

    Raises:
        ValueError: if the operands are not 4-D, their channel counts differ,
            or the kernel is larger than the input.
    """
    # Widen first so the products of int8 operands accumulate without wrapping.
    inputs = np.asarray(inputs, dtype=np.int32)
    weights = np.asarray(weights, dtype=np.int32)
    if inputs.ndim != 4 or weights.ndim != 4:
        raise ValueError("inputs and weights must both be 4-D arrays")

    batch, in_h, in_w, channels = inputs.shape
    filters, k_h, k_w, w_channels = weights.shape
    if channels != w_channels:
        raise ValueError("channel mismatch between inputs and weights")

    out_h = in_h - k_h + 1
    out_w = in_w - k_w + 1
    if out_h <= 0 or out_w <= 0:
        raise ValueError("kernel is larger than the input")

    out = np.zeros((batch, out_h, out_w, filters), dtype=np.int32)
    # Accumulate one kernel tap at a time: every tap is a contraction over the
    # channel axis between a shifted window of the input and an (F, C) slice of
    # the weights.
    for dh in range(k_h):
        for dw in range(k_w):
            window = inputs[:, dh : dh + out_h, dw : dw + out_w, :]
            out += np.tensordot(window, weights[:, dh, dw, :], axes=([3], [1]))
    return out


if __name__ == "__main__":
    # Reset random seed for reproducible behavior
    np.random.seed(0)

    I_size = [1, 18, 18, 16]
    W_size = [16, 3, 3, 16]
    O_size = [1, 16, 16, 16]

    # O = conv2d(I, W): int8 operands, int32 accumulation.
    # NOTE: randint's upper bound is exclusive, so values lie in [-128, 126].
    low_bound = -128
    high_bound = 127
    I = np.random.randint(low_bound, high_bound, size=I_size, dtype=np.dtype("int8"))
    W = np.random.randint(low_bound, high_bound, size=W_size, dtype=np.dtype("int8"))

    # O is the zero-initialised output buffer; O_golden is the expected result.
    O = np.zeros(shape=O_size, dtype=np.int32)
    O_golden = conv2d_nhwc_fhwc(I, W)
    if list(O_golden.shape) != O_size:
        raise ValueError("golden output shape does not match O_size")

    variables = {
        "I": I,
        "W": W,
        "O": O,
        "O_golden": O_golden,
    }

    create_header("data.h", {}, variables)
    create_data("data.c", variables)
56 changes: 56 additions & 0 deletions kernels/conv/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include "stdint.h"

#include "data.h"
#include "memref.h"
#include "snax_rt.h"

/*
* These libraries are included from github.com/KULeuven-MICAS/snitch_cluster
* Interested users, might want to look at:
*
* /sw/snRuntime/api
* /target/snitch_cluster/sw/runtime/rtl/src
* /target/snitch_cluster/sw/runtime/common
* */
#include <snrt.h>

// Kernel provided via external definition
void _mlir_ciface_conv(FourDMemrefI32_t *o, FourDMemrefI8_t *i,
FourDMemrefI8_t *w);

/*
 * Test driver: wraps the generated I and W arrays from data.h in memref
 * descriptors, calls the conv kernel between two cycle-counter reads, and
 * returns 0. The result is not verified yet.
 */
int main() {
  // Create memref objects for data stored in L3
  FourDMemrefI8_t input_desc;
  input_desc.data = &I;
  input_desc.aligned_data = input_desc.data;

  FourDMemrefI8_t weight_desc;
  weight_desc.data = &W;
  weight_desc.aligned_data = weight_desc.data;

  // Result descriptor: handed to the kernel without being initialised here
  // (presumably the kernel fills it in -- confirm against the generated code).
  FourDMemrefI32_t output_desc;

  // allocate zero row in tcdm
  snrt_l1alloc(256);

  // Bracket the kernel call with cycle-counter reads; the values are discarded.
  (void)snrt_mcycle();
  _mlir_ciface_conv(&output_desc, &input_desc, &weight_desc);
  snrt_cluster_hw_barrier();
  (void)snrt_mcycle();

  // Correctness check -
  // from this point on only core 0 is required to be alive.
  if (snrt_cluster_core_idx() != 0) {
    return 0;
  }

  // do not check errors for now, golden model not available
  return 0;
}

0 comments on commit 074c582

Please sign in to comment.