Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NoC experiments #9

Draft
wants to merge 34 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
fdf8d34
Tentative start on NoC benchmarks.
petervdonovan Dec 30, 2022
1cff46f
Measure 11 cycles of latency by cheating.
petervdonovan Dec 31, 2022
841e500
store->nop->load -> wrong WB read. Bug?
petervdonovan Dec 31, 2022
f9afb86
Actually 35 cycles of latency it seems.
petervdonovan Dec 31, 2022
ec82902
Adjust and comment on noc_latency_aligned.
petervdonovan Dec 31, 2022
6abf115
Experiment with the NoC interface.
petervdonovan Jan 2, 2023
828d100
More tinkering.
petervdonovan Jan 2, 2023
93e1631
Get a basic test working in simulation.
petervdonovan Jan 3, 2023
7e59e25
Failed attempt at synchronization.
petervdonovan Jan 3, 2023
d77fd4d
Successful attempt at synchronization.
petervdonovan Jan 3, 2023
d65c22f
Factor more assembly out into macros.
petervdonovan Jan 3, 2023
8eefda3
First draft of the sender side of the protocol.
petervdonovan Jan 4, 2023
89b8d65
Initial attempt at a batch communication.
petervdonovan Jan 4, 2023
085bcd8
Refactor the assembly a bit.
petervdonovan Jan 4, 2023
5ecc536
More assembly refactoring.
petervdonovan Jan 5, 2023
614a56f
First draft of receive words macro.
petervdonovan Jan 5, 2023
8a49d64
Receive a sequence of words correctly.
petervdonovan Jan 5, 2023
b9ee3ad
Send packets of length up to 64.
petervdonovan Jan 5, 2023
132d4c7
Add C API for read_n_words_and_print.
petervdonovan Jan 5, 2023
a8d065d
Add C API for broadcast_count.
petervdonovan Jan 5, 2023
fa062b2
Get broadcast to work from each core in turn.
petervdonovan Jan 5, 2023
5fea13b
Start extending the protocol.
petervdonovan Jan 6, 2023
7ba4017
Make small modifications.
petervdonovan Jan 6, 2023
8b863f9
Get the extended protocol to work properly.
petervdonovan Jan 6, 2023
309dad6
Update programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c
petervdonovan Jan 20, 2023
b8fdadb
This sends 1023 words in 5867 cycles.
petervdonovan Jan 6, 2023
8e70867
Optimize out a SYNC5.
petervdonovan Jan 7, 2023
d5983a3
Bugfix; move header-only lib to flexpret.
petervdonovan Jan 7, 2023
4c64694
Start creating a BroadcastMemory program.
petervdonovan Feb 3, 2023
b18fd08
Assembly generation "hello world".
petervdonovan Feb 3, 2023
23d3659
Start porting assembly to rvg.
petervdonovan Feb 4, 2023
6827fcc
Top-level definitions parse for BroadcastCount.
petervdonovan Feb 6, 2023
9bf0027
BroadcastCount assembly is generated.
petervdonovan Feb 6, 2023
f066f42
Struggle to get assembly to work.
petervdonovan Feb 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
.bloop/
.bsp/
.idea/
.metals/
.scala-build/
.vscode/
project/
target/
Expand Down
2 changes: 1 addition & 1 deletion flexpret
2 changes: 1 addition & 1 deletion programs/HelloWorld/hello.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ int main2() {

// Prints the constant 43 through _fp_print. The function is declared to return int but has no
// return statement, so callers must not use its result.
int main3() {
_fp_print(43);
}
}
10 changes: 10 additions & 0 deletions programs/benchmarks/noc/latency_aligned/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Compile the aligned NoC latency benchmark with the project's compile script.
build:
	riscv_compile.sh ispm noc_latency_aligned.c

# Remove build products with the project's clean script.
clean:
	riscv_clean.sh


# Clean, then build from scratch.
rebuild: clean build

# These targets do not name files. The special target is spelled ".PHONY"; a plain "PHONY"
# target has no effect.
.PHONY: build clean rebuild
34 changes: 34 additions & 0 deletions programs/benchmarks/noc/latency_aligned/align.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#define WAIT_FOR_NEXT_ZERO_MOD_1024(id) \
"li t0, 1014\n\t" \
"li a0, 1\n\t" \
"li a1, 2\n\t" \
"li a2, 3\n\t" \
"li a3, 4\n\t" \
"li a4, 5\n\t" \
"li a5, 6\n\t" \
"li t6, 7\n\t" \
"rdcycle t1\n\t" \
"andi t1, t1, 7\n\t" \
"beq t1, t6, LOOP" #id "\n\t" \
"beq t1, a5, LOOP" #id "\n\t" \
"beq t1, a4, LOOP" #id "\n\t" \
"beq t1, a3, LOOP" #id "\n\t" \
"beq t1, a2, LOOP" #id "\n\t" \
"beq t1, a1, LOOP" #id "\n\t" \
"beq t1, a0, LOOP" #id "\n\t" \
"beq t1, x0, LOOP" #id "\n\t" \
/* This entire loop is 8 cycles long, so the value of t1 upon exiting is t0 plus a */ \
/* number in the range [0, 7] */ \
"LOOP" #id ":\n\t" \
"nop\n\t" /* Delay so that loop length is a power of 2 */ \
"nop\n\t" \
"nop\n\t" \
"rdcycle t1\n\t" \
"andi t1, t1, 1023\n\t" \
"blt t1, t0, LOOP" #id "\n\t" /* Cost of 3 cycles when taken, 1 otherwise; see page 37 https://www2.eecs.berkeley.edu/Pubs/TechRpts/2015/EECS-2015-181.pdf */ \
"nop\n\t" \
"nop\n\t" \
"nop\n\t" \
"nop\n\t" \
"nop\n\t" \
"nop\n\t"
80 changes: 80 additions & 0 deletions programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/**
* This program explores the absolute minimum amount of time that it can take to send one word and
* write it into a register on another core, when under the most favorable circumstances,
* and when controlling relative timing and optimizing the assembly.
*/
#include <stdint.h>
#include <flexpret_io.h>
#include <flexpret_noc.h>
#include <stdlib.h>

#include "align.h"

#define N 100

static int main_of(uint32_t core);

static int send_main(uint32_t receiver);
static int receive_main(uint32_t sender);

/**
 * Entry point executed by every core. Core 0 performs ten send rounds addressed to core 1,
 * core 1 performs ten receive rounds, and all other cores fall through without doing anything.
 */
int main() {
    const unsigned long core_id = read_csr(CSR_COREID);
    srand(core_id);

    if (core_id == 0) {
        for (int round = 0; round < 10; round++) {
            send_main(1);
        }
    } else if (core_id == 1) {
        for (int round = 0; round < 10; round++) {
            receive_main(0);
        }
    }
}

/**
 * Waits for the alignment point established by WAIT_FOR_NEXT_ZERO_MOD_1024 and then pushes the
 * word 42 through the memory-mapped interface at 0x40000000 without blocking.
 *
 * @param receiver Unused: the destination value (0x1) is hard-coded in the assembly.
 * @return Always 0.
 */
static int send_main(uint32_t receiver) {
    (void) receiver; // The destination is hard-coded in the assembly below.
    asm volatile (
        "li t4, 0x40000000\n\t"
        WAIT_FOR_NEXT_ZERO_MOD_1024(send) // overwrites a0-a5 as well as t0, t1, t6
        // like noc_send, but without blocking
        "li t5, 0x1\n\t" // noc destination
        "sw t5, 8(t4)\n\t"
        "li t5, 0x08\n\t"
        "sw t5, 4(t4)\n\t"
        "nop\n\t"
        "nop\n\t"
        "li t5, 42\n\t" // Set noc data to 42
        "sw t5, 8(t4)\n\t" // FIXME: Data must be written first? Why? Is it Hardware Bug?
        "li t5, 0x04\n\t"
        "sw t5, 4(t4)\n\t"
        : /* no outputs */
        : /* no inputs */
        // Every register the assembly overwrites must be declared, otherwise the compiler may
        // keep live values (for example the caller's loop counter once this function is
        // inlined) in one of them. "memory" covers the memory-mapped stores.
        : "t0", "t1", "t4", "t5", "t6", "a0", "a1", "a2", "a3", "a4", "a5", "memory"
    );
    return 0;
}

/**
 * Waits for the alignment point established by WAIT_FOR_NEXT_ZERO_MOD_1024, polls the NoC CSR
 * through the memory-mapped interface at 0x40000000 until bit 1 (mask 2) is set, reads the
 * NoC data word, and then writes to CSR 0x51e: the marker 0xbaaabaaa, the cycle count mod 1024
 * sampled right after the data read, the marker again, and the received word.
 *
 * @param sender Unused by the assembly.
 * @return Always 0.
 */
static int receive_main(uint32_t sender) {
    (void) sender; // Not referenced by the assembly below.
    asm volatile (
        WAIT_FOR_NEXT_ZERO_MOD_1024(receive)
        // "nop\n\t" // The 9-cycle read loop is aligned optimally when the number of nops here is zero mod 9
        // "nop\n\t"
        // "nop\n\t"
        // "nop\n\t"
        // "nop\n\t"
        // "nop\n\t"
        // "nop\n\t"
        // "nop\n\t"
        // "nop\n\t"
        "li t4, 0x40000000\n\t" // wishbone base address
        // FIXME: Why does this loop have to go through one iteration extra the first time around, compared to the number of iterations that it makes thereafter?
        "CHECK_IF_RECEIVED_YET:\n\t"
        // Sadly, this whole sequence -- store, wait, read, mask, beq -- must be in the loop. In particular, if the store is factored out, the read doesn't work, even though we are storing the same thing each time.
        "sw x0, 0(t4)\n\t" // Write the address of NoC CSR to Wishbone read address
        "nop\n\t"
        "nop\n\t"
        "lw t5, 12(t4)\n\t" // Read NoC CSR
        "andi t5, t5, 2\n\t"
        "beq x0, t5, CHECK_IF_RECEIVED_YET\n\t"
        "li t5, 4\n\t" // Write the address of NoC data to Wishbone read address
        "sw t5, 0(t4)\n\t"
        "nop\n\t"
        "nop\n\t"
        "lw t5, 12(t4)\n\t" // Read NoC data
        "rdcycle t3\n\t"
        "andi t3, t3, 1023\n\t"
        "li t0, 0xbaaabaaa\n\t"
        "csrw 0x51e, t0\n\t"
        "csrw 0x51e, t3\n\t"
        "csrw 0x51e, t0\n\t"
        "csrw 0x51e, t5\n\t"
        : /* no outputs */
        : /* no inputs */
        // Every register the assembly overwrites must be declared, otherwise the compiler may
        // keep live values (for example the caller's loop counter once this function is
        // inlined) in one of them. "memory" covers the memory-mapped loads and stores.
        : "t0", "t1", "t3", "t4", "t5", "t6", "a0", "a1", "a2", "a3", "a4", "a5", "memory"
    );
    return 0;
}
10 changes: 10 additions & 0 deletions programs/benchmarks/noc/latency_random_sparse_send/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Compile the random sparse-send NoC latency benchmark with the project's compile script.
build:
	riscv_compile.sh ispm noc_latency_random_sparse_send.c

# Remove build products with the project's clean script.
clean:
	riscv_clean.sh


# Clean, then build from scratch.
rebuild: clean build

# These targets do not name files. The special target is spelled ".PHONY"; a plain "PHONY"
# target has no effect.
.PHONY: build clean rebuild
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#include <stdint.h>
#include <flexpret_io.h>
#include <flexpret_noc.h>
#include <stdlib.h>

#define N 100
// 1 << LOG2_OF_A_LONG_TIME should be much greater than the number of cycles required to run
// one iteration of the benchmark. I think it takes less than 512 cycles to run one iteration
// of the benchmark.
#define LOG2_OF_A_LONG_TIME 11

static int main_of(uint32_t core);

/**
 * Entry point executed by every core: reads this core's id, seeds the pseudo-random generator
 * with it, and runs the per-core schedule in main_of.
 */
int main() {
    const unsigned long this_core = read_csr(CSR_COREID);

    srand(this_core);
    main_of(this_core);
}

/**
 * Sends N timestamps to `receiver`. Before each send the core idles for a pseudo-random number
 * of cycles in [2^LOG2_OF_A_LONG_TIME, 2^(LOG2_OF_A_LONG_TIME + 1)), then sends the cycle count
 * sampled immediately before the call to noc_send.
 *
 * @param receiver Destination passed to noc_send.
 * @return Always 0.
 */
static int send_main(uint32_t receiver) {
    const uint32_t min_delay = UINT32_C(1) << LOG2_OF_A_LONG_TIME;
    for (uint32_t i = 0; i < N; i++) {
        uint32_t additional_delay = (uint32_t) rand() & (min_delay - 1);
        uint32_t delay = min_delay + additional_delay;
        uint32_t start = (uint32_t) rdcycle();
        // Compare elapsed cycles rather than an absolute deadline: the unsigned difference
        // stays correct when the cycle counter wraps, whereas `rdcycle() < end_time` ends the
        // wait immediately if `end_time` overflowed.
        while ((uint32_t) ((uint32_t) rdcycle() - start) < delay) {}
        unsigned long t0 = rdcycle(); // benchmark start
        noc_send(receiver, t0);
    }
    return 0;
}

/**
 * Receives N words with noc_receive, treats each as the sender's cycle count at send time, and
 * prints (sender + 1) * 1000000 + (receive cycle - send cycle) for each of them.
 *
 * @param sender Only used to label the printed value; it is not passed to noc_receive.
 * @return Always 0.
 */
static int receive_main(uint32_t sender) {
    for (uint32_t i = 0; i < N; i++) {
        uint32_t t0 = noc_receive();
        uint32_t t1 = rdcycle(); // benchmark end
        _fp_print((sender + 1) * 1000000 + t1 - t0);
    }
    return 0;
}

/**
 * Runs one send phase and one receive phase with `partner` as the argument of both.
 *
 * @param partner Passed to send_main and to receive_main.
 * @param first   Non-zero: send, then receive. Zero: receive, then send.
 * @return Always 0.
 */
static int send_receive(uint32_t partner, int first) {
    if (first) {
        send_main(partner);
        receive_main(partner);
    } else {
        receive_main(partner);
        send_main(partner);
    }
    return 0;
}

/**
 * Per-core schedule: three send/receive rounds with partners (core + 1), (core + 2) and
 * (core + 3), each taken mod 4. Even cores send first in rounds 1 and 3; cores with bit 1
 * clear send first in round 2.
 *
 * NOTE(review): in rounds 1 and 3 the core that sends to this core is (core + 3) & 3 and
 * (core + 1) & 3 respectively, not the `partner` handed to send_receive, so the sender label
 * printed by receive_main may not match the actual source -- confirm.
 *
 * @param core Id of the calling core; only its two low bits are used.
 * @return Always 0.
 */
static int main_of(uint32_t core) {
    int big = core & 2;
    int odd = core & 1;
    send_receive((core + 1) & 3, !odd);
    send_receive((core + 2) & 3, !big);
    send_receive((core + 3) & 3, !odd);
    return 0;
}
10 changes: 10 additions & 0 deletions programs/noc/LowLevelInterface/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Compile the low-level NoC interface program with the project's compile script.
build:
	riscv_compile.sh ispm low_level_interface_noc.c

# Remove build products with the project's clean script.
clean:
	riscv_clean.sh


# Clean, then build from scratch.
rebuild: clean build

# These targets do not name files. The special target is spelled ".PHONY"; a plain "PHONY"
# target has no effect.
.PHONY: build clean rebuild
58 changes: 58 additions & 0 deletions programs/noc/LowLevelInterface/asm_utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#ifndef ASM_UTILS_H
#define ASM_UTILS_H

/**
 * @brief Paste two tokens together after macro-expanding both arguments.
 *
 * TOKENPASTE2__ performs the actual `##` paste. TOKENPASTE1__ exists only to add a level of
 * indirection so that arguments which are themselves macros get expanded before pasting.
 * TOKENPASTE is the public entry point; it must forward to TOKENPASTE1__ (previously it named
 * the non-existent TOKENPASTE1 and therefore never pasted anything).
 */
#define TOKENPASTE2__(x, y) x ## y
#define TOKENPASTE1__(x, y) TOKENPASTE2__(x, y)
#define TOKENPASTE(x, y) TOKENPASTE1__(x, y)

/**
 * @brief Assembly-string counterpart of _fp_print.
 *
 * Expands to a string literal that loads the marker 0xbaaabaaa into the scratch register,
 * writes the marker to CSR 0x51e and then writes the value register to the same CSR.
 * Documented by the original author as executing in 4 cycles.
 *
 * @param value_reg   Name of the register whose contents are written after the marker.
 * @param scratch_reg Name of a register that is overwritten with the marker.
 */
#define FP_PRINT_ASM(value_reg, scratch_reg)  \
    "li " #scratch_reg ", 0xbaaabaaa\n\t"     \
    "csrw 0x51e, " #scratch_reg "\n\t"        \
    "csrw 0x51e, " #value_reg "\n\t"

/**
 * Compile-time selectors: TRUE_MACRO expands to its first argument and FALSE_MACRO to its
 * second, so passing one of them as a macro parameter picks between two alternatives.
 */
#define TRUE_MACRO(if_true, if_false)  if_true
#define FALSE_MACRO(if_true, if_false) if_false

/**
 * REPEATn(x) expands to n consecutive copies of x. With a string-literal argument the copies
 * concatenate into one literal, which is how assembly snippets are repeated.
 * Only the counts listed here are provided.
 */
#define REPEAT1(x)  x
#define REPEAT2(x)  x x
#define REPEAT3(x)  x x x
#define REPEAT4(x)  REPEAT2(x) REPEAT2(x)
#define REPEAT5(x)  REPEAT3(x) REPEAT2(x)
#define REPEAT7(x)  REPEAT4(x) REPEAT3(x)
#define REPEAT11(x) REPEAT4(x) REPEAT7(x)
#define REPEAT64(x) REPEAT4(REPEAT4(REPEAT4(x)))

/**
 * Invokes APPLY(arg0, arg1, arg2, i) once for every i from 0 through 19, in ascending order,
 * with nothing between consecutive invocations. Despite the name, 20 itself is not included.
 *
 * @param APPLY Function-like macro taking four arguments.
 * @param arg0  Forwarded unchanged as the first argument of every invocation.
 * @param arg1  Forwarded unchanged as the second argument of every invocation.
 * @param arg2  Forwarded unchanged as the third argument of every invocation.
 */
#define COUNT_TO_20(APPLY, arg0, arg1, arg2)                      \
    APPLY(arg0, arg1, arg2, 0)  APPLY(arg0, arg1, arg2, 1)        \
    APPLY(arg0, arg1, arg2, 2)  APPLY(arg0, arg1, arg2, 3)        \
    APPLY(arg0, arg1, arg2, 4)  APPLY(arg0, arg1, arg2, 5)        \
    APPLY(arg0, arg1, arg2, 6)  APPLY(arg0, arg1, arg2, 7)        \
    APPLY(arg0, arg1, arg2, 8)  APPLY(arg0, arg1, arg2, 9)        \
    APPLY(arg0, arg1, arg2, 10) APPLY(arg0, arg1, arg2, 11)       \
    APPLY(arg0, arg1, arg2, 12) APPLY(arg0, arg1, arg2, 13)       \
    APPLY(arg0, arg1, arg2, 14) APPLY(arg0, arg1, arg2, 15)       \
    APPLY(arg0, arg1, arg2, 16) APPLY(arg0, arg1, arg2, 17)       \
    APPLY(arg0, arg1, arg2, 18) APPLY(arg0, arg1, arg2, 19)

/**
 * MUL4(src, dst): assembly string for dst = src * 4, a single left shift by 2.
 */
#define MUL4(src, dst) \
    "slli " #dst ", " #src ", 2\n\t"

/**
 * MUL5(src, dst): assembly string for dst = src * 5, computed as (src << 2) + src.
 * src and dst must be different registers: the shift overwrites dst before src is added, so
 * with src == dst the result would be src * 8.
 */
#define MUL5(src, dst) \
    MUL4(src, dst)     \
    "add " #dst ", " #dst ", " #src "\n\t"


#endif // ASM_UTILS_H
Loading