add conv kernel

squash add conv kernel
KULeuven-MICAS · Oct 23, 2024 · 074c582 · 074c582
1 parent 34f4224
commit 074c582
Show file tree

Hide file tree

Showing 5 changed files with 139 additions and 2 deletions.
diff --git a/benchmarks/roofline/genbenchmark.py b/benchmarks/roofline/genbenchmark.py
@@ -2,7 +2,6 @@
 import json
 import pathlib
 from io import StringIO
-from pprint import pprint
 
 import pandas as pd
 from xdsl.builder import Builder, ImplicitBuilder
@@ -185,4 +184,3 @@ def generate_dense_benchmark(m, n, k, add_c) -> SNAXBenchmark:
 
     with open("output/index.md", "w") as file:
         file.write(markdown_table)
-
diff --git a/kernels/conv/Makefile b/kernels/conv/Makefile
@@ -0,0 +1,43 @@
+# Courtesy of Federico Ficarelli
+
+.DEFAULT_GOAL := all
+
+include ../../runtime/snax-gemmx.rules
+include ../../runtime/Makefile.rules
+
+TESTS =
+TESTS += conv.x
+
+MLIRPREPROCFLAGS = --linalg-generalize-named-ops
+MLIRPREPROCFLAGS += --mlir-print-op-generic
+MLIRPREPROCFLAGS += --mlir-print-local-scope
+
+%.preprocfinal.mlir: %.mlir # generalize named linalg ops and print in generic form (see MLIRPREPROCFLAGS)
+	$(MLIROPT) $(MLIRPREPROCFLAGS) -o $@ $<
+
+
+SNAXOPTFLAGS = -p convert-linalg-to-kernel,insert-accfg-op{accelerator=snax_gemmx},dispatch-kernels,convert-linalg-to-stream,fuse-streaming-regions,stream-bufferize,snax-bufferize,alloc-to-global,set-memory-space,set-memory-layout,realize-memref-casts,insert-sync-barrier,dispatch-regions{nb_cores=3},convert-stream-to-snax-stream,convert-linalg-to-accfg,convert-accfg-to-csr,snax-copy-to-dma,memref-to-snax,snax-to-func,clear-memory-space
+
+CFLAGS += -std=gnu11
+CFLAGS += -Wall -Wextra
+
+data.c data.h: # both files are produced by a single run of gendata.py
+	$(PYTHON) gendata.py
+
+%.x: %.o main.o data.o # link the kernel object with the C harness and the generated data
+	$(LD) $(LDFLAGS) $^ -o $@
+
+sim_%: % # run the binary with $(VLTSIM) after removing any stale ./logs/
+	rm -fr ./logs/
+	$(VLTSIM) $<
+
+RUN = $(addprefix run_, $(TESTS))
+$(RUN): run_%: sim_% # simulate, then keep the logs as <test>.logs
+	mv logs $(subst sim_,,$<).logs
+
+all: $(TESTS) # default goal: build every test binary
+
+allrun: $(RUN) # build and simulate every test
+
+clean: # remove build products, logs and the generated data files
+	rm -fr *.ll12 *.x *.o *.logs/ logs/ data.h data.c
diff --git a/kernels/conv/conv.mlir b/kernels/conv/conv.mlir
@@ -0,0 +1,6 @@
+func.func @conv(%arg0 : tensor<1x18x18x16xi8>, %arg1 : tensor<16x3x3x16xi8>) -> tensor<1x16x16x16xi32> { // 3x3 conv: NHWC i8 input (18x18x16) with 16 FHWC i8 filters -> 16x16x16 i32 output
+  %c0_i32 = arith.constant 0 : i32 // passed as both scalar i32 operands of the quantized conv (presumably the input and filter zero points)
+  %0 = tensor.empty() : tensor<1x16x16x16xi32> // NOTE(review): contents are unspecified while linalg convs accumulate into outs - confirm the pipeline does not need a zero fill
+  %conv = linalg.conv_2d_nhwc_fhwc_q ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<1x18x18x16xi8>, tensor<16x3x3x16xi8>, i32, i32) outs(%0 : tensor<1x16x16x16xi32>) -> tensor<1x16x16x16xi32>
+  func.return %conv : tensor<1x16x16x16xi32>
+}
diff --git a/kernels/conv/gendata.py b/kernels/conv/gendata.py
@@ -0,0 +1,34 @@
+# Simple script to generate random inputs and zero-filled placeholder outputs for the conv kernel
+
+import numpy as np
+
+from util.gendata import create_data, create_header
+
+if __name__ == "__main__":
+    # Reset random seed for reproducible behavior
+
+    np.random.seed(0)
+
+    I_size = [1, 18, 18, 16]
+    W_size = [16, 3, 3, 16]
+    O_size = [1, 16, 16, 16]
+
+    # D = A.B + C
+    low_bound = -128
+    high_bound = 127
+    I = np.random.randint(low_bound, high_bound, size=I_size, dtype=np.dtype("int8"))
+    W = np.random.randint(low_bound, high_bound, size=W_size, dtype=np.dtype("int8"))
+
+    # TODO:: calculate output ass well
+    O = np.zeros(shape=O_size, dtype=np.int32)
+    O_golden = np.zeros(shape=O_size, dtype=np.int32)
+
+    variables = {
+        "I": I,
+        "W": W,
+        "O": O,
+        "O_golden": O_golden,
+    }
+
+    create_header("data.h", {}, variables)
+    create_data("data.c", variables)
diff --git a/kernels/conv/main.c b/kernels/conv/main.c
@@ -0,0 +1,56 @@
+#include "stdint.h"
+
+#include "data.h"
+#include "memref.h"
+#include "snax_rt.h"
+
+/*
+ * These libraries are included from github.com/KULeuven-MICAS/snitch_cluster
+ * Interested users might want to look at:
+ *
+ * /sw/snRuntime/api
+ * /target/snitch_cluster/sw/runtime/rtl/src
+ * /target/snitch_cluster/sw/runtime/common
+ * */
+#include <snrt.h>
+
+// Kernel provided via external definition; presumably the C-interface wrapper generated for @conv in conv.mlir (o: output, i: input, w: weights)
+void _mlir_ciface_conv(FourDMemrefI32_t *o, FourDMemrefI8_t *i,
+                       FourDMemrefI8_t *w);
+
+int main() {
+  {
+
+    // Create memref objects for the I and W data stored in L3; only data and aligned_data are set
+    FourDMemrefI8_t memrefI;
+    memrefI.data = &I;
+    memrefI.aligned_data = memrefI.data;
+
+    FourDMemrefI8_t memrefW;
+    memrefW.data = &W;
+    memrefW.aligned_data = memrefW.data;
+
+    FourDMemrefI32_t memrefO; // not initialized here; handed to the kernel as its output descriptor
+
+    // allocate zero row in tcdm (256 bytes; the returned pointer is not kept)
+    snrt_l1alloc(256);
+
+    (void)snrt_mcycle(); // result discarded; presumably marks the start of the kernel region - TODO confirm
+
+    _mlir_ciface_conv(&memrefO, &memrefI, &memrefW);
+
+    snrt_cluster_hw_barrier(); // presumably synchronizes the cores of the cluster before finishing - TODO confirm
+
+    (void)snrt_mcycle(); // result discarded; presumably marks the end of the kernel region - TODO confirm
+
+    // Correctness check -
+    // from this point on only core 0 is required to be alive; all other cores return 0 here.
+    int thiscore = snrt_cluster_core_idx();
+    if (thiscore != 0)
+      return 0;
+
+    // Results are not checked for now (no golden model available), so core 0 also returns 0 unconditionally
+
+    return 0;
+  }
+}