lf-lang · petervdonovan · Dec 30, 2022 · Dec 31, 2022 · Dec 31, 2022 · Dec 31, 2022
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,8 @@
+.bloop/
 .bsp/
+.idea/
 .metals/
+.scala-build/
 .vscode/
 project/
 target/

diff --git a/flexpret b/flexpret
diff --git a/programs/HelloWorld/hello.c b/programs/HelloWorld/hello.c
@@ -34,4 +34,4 @@ int main2() {
 
 int main3() {
     _fp_print(43);
-}
+}
diff --git a/programs/benchmarks/noc/latency_aligned/Makefile b/programs/benchmarks/noc/latency_aligned/Makefile
@@ -0,0 +1,10 @@
+build:
+	riscv_compile.sh ispm noc_latency_aligned.c
+
+clean:
+	riscv_clean.sh
+
+rebuild: clean build
+
+# build, clean and rebuild name actions rather than files, so mark them phony.
+.PHONY: build clean rebuild
diff --git a/programs/benchmarks/noc/latency_aligned/align.h b/programs/benchmarks/noc/latency_aligned/align.h
@@ -0,0 +1,34 @@
+#define WAIT_FOR_NEXT_ZERO_MOD_1024(id) /* Asm text that busy-waits on rdcycle until (cycle & 1023) >= 1014, then falls through six nops; presumably the next instruction then starts at a fixed cycle offset mod 1024 (TODO confirm). id is stringified into the LOOP label. Overwrites t0, t1, t6, a0-a5. */ \
+        "li t0, 1014\n\t"  /* threshold compared against (cycle & 1023) by the blt in LOOP */      \
+        "li a0, 1\n\t"  /* a0-a5 and t6 hold the constants 1..7 used by the beq ladder */          \
+        "li a1, 2\n\t"                                                                             \
+        "li a2, 3\n\t"                                                                             \
+        "li a3, 4\n\t"                                                                             \
+        "li a4, 5\n\t"                                                                             \
+        "li a5, 6\n\t"                                                                             \
+        "li t6, 7\n\t"                                                                             \
+        "rdcycle t1\n\t"                                                                           \
+        "andi t1, t1, 7\n\t"  /* t1 = cycle counter mod 8 */                                       \
+        "beq t1, t6, LOOP" #id "\n\t"  /* eight beqs, one target: t1 selects which is taken */     \
+        "beq t1, a5, LOOP" #id "\n\t"                                                              \
+        "beq t1, a4, LOOP" #id "\n\t"                                                              \
+        "beq t1, a3, LOOP" #id "\n\t"                                                              \
+        "beq t1, a2, LOOP" #id "\n\t"                                                              \
+        "beq t1, a1, LOOP" #id "\n\t"                                                              \
+        "beq t1, a0, LOOP" #id "\n\t"                                                              \
+        "beq t1, x0, LOOP" #id "\n\t"                                                              \
+        /* This entire loop is 8 cycles long, so the value of t1 upon exiting is t0 plus a      */ \
+        /* number in the range [0, 7]                                                           */ \
+        "LOOP" #id ":\n\t"                                                                         \
+        "nop\n\t"  /* Delay so that loop length is a power of 2 */                                 \
+        "nop\n\t"                                                                                  \
+        "nop\n\t"                                                                                  \
+        "rdcycle t1\n\t"                                                                           \
+        "andi t1, t1, 1023\n\t"                                                                    \
+        "blt t1, t0, LOOP" #id "\n\t" /* Cost of 3 cycles when taken, 1 otherwise; see page 37 https://www2.eecs.berkeley.edu/Pubs/TechRpts/2015/EECS-2015-181.pdf */ \
+        "nop\n\t"                                                                                  \
+        "nop\n\t"                                                                                  \
+        "nop\n\t"                                                                                  \
+        "nop\n\t"                                                                                  \
+        "nop\n\t"                                                                                  \
+        "nop\n\t"
diff --git a/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c b/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c
@@ -0,0 +1,80 @@
+/**
+ * This program explores the absolute minimum amount of time that it can take to send one word and
+ * write it into a register on another core, when under the most favorable circumstances,
+ * and when controlling relative timing and optimizing the assembly.
+ */
+#include <stdint.h>
+#include <flexpret_io.h>
+#include <flexpret_noc.h>
+#include <stdlib.h>
+
+#include "align.h"
+
+#define N 100
+
+// Only the two roles below exist in this benchmark; main() picks one by core ID.
+
+static int send_main(uint32_t receiver);
+static int receive_main(uint32_t sender);
+
+int main() {  // Core 0 runs the sender ten times and core 1 the receiver ten times; any other core does nothing.
+    const unsigned long core = read_csr(CSR_COREID);
+    srand(core);
+    for (int round = 0; core == 0 && round < 10; round++) send_main(1);
+    for (int round = 0; core == 1 && round < 10; round++) receive_main(0);
+}
+
+static int send_main(uint32_t receiver) {  // Sends the word 42 over the NoC right after the cycle-alignment wait. NOTE: receiver is unused; the destination (1) is hard-coded in the asm.
+    asm volatile (
+        "li t4, 0x40000000\n\t"
+        WAIT_FOR_NEXT_ZERO_MOD_1024(send)  // overwrites the "a" registers a0-a5, as well as t0, t1, t6
+        "li t5, 0x1\n\t"  // like noc_send, but without blocking; noc destination
+        "sw t5, 8(t4)\n\t"
+        "li t5, 0x08\n\t"
+        "sw t5, 4(t4)\n\t"
+        "nop\n\t"
+        "nop\n\t"
+        "li t5, 42\n\t"  // Set noc data to 42
+        "sw t5, 8(t4)\n\t" // FIXME: Data must be written first? Why? Is it Hardware Bug?
+        "li t5, 0x04\n\t"
+        "sw t5, 4(t4)\n\t"
+        : : : "t0", "t1", "t4", "t5", "t6", "a0", "a1", "a2", "a3", "a4", "a5", "memory");  // declare every register the asm overwrites so the compiler keeps no live value in them
+    return 0;
+}
+
+/**
+ * Spins on the NoC CSR until its bit 1 (mask 2) is set, reads the NoC data word, and then writes
+ * (cycle & 1023) and the received word to CSR 0x51e, each preceded by the marker 0xbaaabaaa.
+ * NOTE: sender is unused. The clobber list names every register the asm overwrites.
+ */
+static int receive_main(uint32_t sender) {
+    asm volatile (
+        WAIT_FOR_NEXT_ZERO_MOD_1024(receive)
+        // The 9-cycle read loop is aligned optimally when the number of nops here is zero mod 9.
+        // To shift the alignment, insert "nop\n\t" lines at this point (nine of them were previously kept here, commented out).
+        "li t4, 0x40000000\n\t"  // wishbone base address
+        // FIXME: Why does this loop have to go through one iteration extra the first time around, compared to the number of iterations that it makes thereafter?
+        "CHECK_IF_RECEIVED_YET:\n\t"
+        // Sadly, this whole sequence -- store, wait, read, mask, beq -- must be in the loop. In particular, if the store is factored out, the read doesn't work, even though we are storing the same thing each time.
+        "sw x0, 0(t4)\n\t"  // Write the address of NoC CSR to Wishbone read address
+        "nop\n\t"
+        "nop\n\t"
+        "lw t5, 12(t4)\n\t"  // Read NoC CSR
+        "andi t5, t5, 2\n\t"
+        "beq x0, t5, CHECK_IF_RECEIVED_YET\n\t"
+        "li t5, 4\n\t"  // Write the address of NoC data to Wishbone read address
+        "sw t5, 0(t4)\n\t"
+        "nop\n\t"
+        "nop\n\t"
+        "lw t5, 12(t4)\n\t"  // Read NoC data
+        "rdcycle t3\n\t"
+        "andi t3, t3, 1023\n\t"
+        "li t0, 0xbaaabaaa\n\t"
+        "csrw 0x51e, t0\n\t"
+        "csrw 0x51e, t3\n\t"
+        "csrw 0x51e, t0\n\t"
+        "csrw 0x51e, t5\n\t"
+        : /* no outputs */ : /* no inputs */
+        : "t0", "t1", "t3", "t4", "t5", "t6", "a0", "a1", "a2", "a3", "a4", "a5", "memory");
+    return 0;
+}
diff --git a/programs/benchmarks/noc/latency_random_sparse_send/Makefile b/programs/benchmarks/noc/latency_random_sparse_send/Makefile
@@ -0,0 +1,10 @@
+build:
+	riscv_compile.sh ispm noc_latency_random_sparse_send.c
+
+clean:
+	riscv_clean.sh
+
+rebuild: clean build
+
+# build, clean and rebuild name actions rather than files, so mark them phony.
+.PHONY: build clean rebuild
diff --git a/programs/benchmarks/noc/latency_random_sparse_send/noc_latency_random_sparse_send.c b/programs/benchmarks/noc/latency_random_sparse_send/noc_latency_random_sparse_send.c
@@ -0,0 +1,50 @@
+#include <stdint.h>
+#include <flexpret_io.h>
+#include <flexpret_noc.h>
+#include <stdlib.h>
+
+#define N 100  // number of messages each send_main/receive_main call transfers
+// 1 << LOG2_OF_A_LONG_TIME should be much greater than the number of cycles required to run
+// one iteration of the benchmark. I think it takes less than 512 cycles to run one iteration
+// of the benchmark.
+#define LOG2_OF_A_LONG_TIME 11  // 1 << 11 = 2048 cycles: both the minimum pre-send delay and the bound on its random extra part
+
+static int main_of(uint32_t core);  // defined at the bottom of the file; called from main()
+
+int main() {  // Seeds rand() with this core's ID, then runs the core's exchange schedule.
+    const unsigned long this_core = read_csr(CSR_COREID);
+    srand(this_core);
+    main_of(this_core);
+}
+
+static int send_main(uint32_t receiver) {  // Sends N cycle-count timestamps to receiver, each after a randomized delay of at least 1 << LOG2_OF_A_LONG_TIME cycles.
+    for (uint32_t i = 0; i < N; i++) {
+        uint32_t delay = (1u << LOG2_OF_A_LONG_TIME) + (rand() & ((1 << LOG2_OF_A_LONG_TIME) - 1));
+        uint32_t start = rdcycle();
+        while ((uint32_t)(rdcycle() - start) < delay) {}  // compare elapsed cycles, not absolute times, so counter wraparound cannot cut the wait short
+        unsigned long t0 = rdcycle(); // benchmark start
+        noc_send(receiver, t0);
+    }
+    return 0;
+}
+
+static int receive_main(uint32_t sender) {  // Receives N timestamps and prints each latency, offset by (sender + 1) * 1000000 as a tag; sender is used only for that tag.
+    for (uint32_t msg = 0; msg < N; msg++) {
+        uint32_t sent_at = noc_receive();
+        uint32_t received_at = rdcycle(); // benchmark end
+        _fp_print((sender + 1) * 1000000 + received_at - sent_at);
+    }
+}
+
+static int send_receive(uint32_t partner, int first) {  // Runs one send phase and one receive phase with partner; first != 0 means send before receive.
+    if (first) { send_main(partner); receive_main(partner); } else { receive_main(partner); send_main(partner); }
+    return 0;
+}
+
+static int main_of(uint32_t core) {  // Three exchanges, with partners core+1, core+2 and core+3 (all masked with 3).
+    int odd = core & 1;  // bit 0 decides who sends first in rounds 1 and 3; bit 1 (core & 2) decides in round 2
+    send_receive((core + 1) & 3, !odd);  // NOTE(review): core+1 and core+3 are not symmetric pairings (0 -> 1 but 1 -> 2), unlike core^1 / core^3 -- confirm this is intended
+    send_receive((core + 2) & 3, !(core & 2));
+    send_receive((core + 3) & 3, !odd);
+    return 0;
+}
diff --git a/programs/noc/LowLevelInterface/Makefile b/programs/noc/LowLevelInterface/Makefile
@@ -0,0 +1,10 @@
+build:
+	riscv_compile.sh ispm low_level_interface_noc.c
+
+clean:
+	riscv_clean.sh
+
+rebuild: clean build
+
+# build, clean and rebuild name actions rather than files, so mark them phony.
+.PHONY: build clean rebuild
diff --git a/programs/noc/LowLevelInterface/asm_utils.h b/programs/noc/LowLevelInterface/asm_utils.h
@@ -0,0 +1,58 @@
+#ifndef ASM_UTILS_H
+#define ASM_UTILS_H
+
+#define TOKENPASTE2__(x, y) x ## y  // pastes the two tokens exactly as written
+#define TOKENPASTE1__(x, y) TOKENPASTE2__(x, y)  // extra level so macro arguments are expanded before pasting
+#define TOKENPASTE(x, y) TOKENPASTE1__(x, y)  // public entry point: expands x and y, then pastes them into one token
+
+/**
+ * @brief Pure assembly version of _fp_print. Executes in 4 cycles. Expands to asm text that loads 0xbaaabaaa into clobber0, writes clobber0 to CSR 0x51e, then writes reg to the same CSR; clobber0 is overwritten.
+ */
+#define FP_PRINT_ASM(reg, clobber0) \
+    "li " #clobber0 ", 0xbaaabaaa\n\t"  /* marker value written ahead of the payload */           \
+    "csrw 0x51e, " #clobber0 "\n\t"                                                                \
+    "csrw 0x51e, " #reg "\n\t"
+
+#define TRUE_MACRO(case_true, case_false) case_true  // expands to its first argument
+#define FALSE_MACRO(case_true, case_false) case_false  // expands to its second argument
+
+#define REPEAT1(x) x  // REPEATn(x) expands to n copies of x in a row
+#define REPEAT2(x) x x
+#define REPEAT3(x) x x x
+#define REPEAT4(x) REPEAT2(x) REPEAT2(x)  // 2 + 2
+#define REPEAT5(x) REPEAT4(x) REPEAT1(x)  // 4 + 1
+#define REPEAT7(x) REPEAT4(x) REPEAT3(x)  // 4 + 3
+#define REPEAT11(x) REPEAT4(x) REPEAT7(x)  // 4 + 7
+#define REPEAT64(x) REPEAT4(REPEAT4(REPEAT2(REPEAT2(x))))  // 4 * 4 * 2 * 2
+
+#define COUNT_TO_20(MACRO, param0, param1, param2) /* Invokes MACRO(param0, param1, param2, i) once for each i from 0 to 19 (twenty invocations, in ascending order). */ \
+    MACRO(param0, param1, param2, 0) \
+    MACRO(param0, param1, param2, 1) \
+    MACRO(param0, param1, param2, 2) \
+    MACRO(param0, param1, param2, 3) \
+    MACRO(param0, param1, param2, 4) \
+    MACRO(param0, param1, param2, 5) \
+    MACRO(param0, param1, param2, 6) \
+    MACRO(param0, param1, param2, 7) \
+    MACRO(param0, param1, param2, 8) \
+    MACRO(param0, param1, param2, 9) \
+    MACRO(param0, param1, param2, 10) \
+    MACRO(param0, param1, param2, 11) \
+    MACRO(param0, param1, param2, 12) \
+    MACRO(param0, param1, param2, 13) \
+    MACRO(param0, param1, param2, 14) \
+    MACRO(param0, param1, param2, 15) \
+    MACRO(param0, param1, param2, 16) \
+    MACRO(param0, param1, param2, 17) \
+    MACRO(param0, param1, param2, 18) \
+    MACRO(param0, param1, param2, 19)
+
+#define MUL4(in_reg, out_reg) /* asm text: out_reg = in_reg << 2, i.e. in_reg * 4 */              \
+    "slli " #out_reg ", " #in_reg ", 2\n\t"
+
+#define MUL5(in_reg, out_reg) /* asm text: out_reg = (in_reg << 2) + in_reg, i.e. in_reg * 5. NOTE(review): out_reg must differ from in_reg; otherwise the add reads the already-shifted value and yields in_reg * 8. */ \
+    MUL4(in_reg, out_reg)                                                                          \
+    "add " #out_reg ", " #out_reg ", " #in_reg "\n\t"
+
+
+#endif // ASM_UTILS_H