Skip to content

Commit

Permalink
save progress
Browse files Browse the repository at this point in the history
  • Loading branch information
clamchowder committed Nov 30, 2024
1 parent 76afd75 commit d1786ed
Show file tree
Hide file tree
Showing 3 changed files with 166 additions and 10 deletions.
118 changes: 118 additions & 0 deletions Common/perfmon.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// Stuff that only works on Linux. Should be #ifdef-ed out for mingw cross compilation
uint64_t readmsr(uint32_t coreindex, uint32_t msrindex) {
char buf[256];
memset(buf, 0, 256);
Expand All @@ -15,3 +16,120 @@ uint64_t readmsr(uint32_t coreindex, uint32_t msrindex) {
close(fd);
return msrvalue;
}

// Number of counters tracked by this file; sizes both the event table and the read buffer below.
#define PERF_NUM_EVENTS 4
// Destination buffer for the single read() that stop_perf_monitoring() issues on the group leader fd.
// The events are opened with read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID, so one read is
// expected to return a count followed by one (value, id) pair per group member.
struct perf_read_data {
uint64_t nr; // number of entries in values[]; used as the loop bound when results are matched up
struct {
uint64_t value; // counter reading
uint64_t id; // compared against perf_select_data.id to find which event the reading belongs to
} values[PERF_NUM_EVENTS];
};

// Per-event bookkeeping: how the event is configured, the fd it was opened on, and its latest reading.
struct perf_select_data {
uint64_t id; // id used to identify the event when it comes back in a group (fetched with PERF_EVENT_IOC_ID)
int fd; // file descriptor returned by the perf_event_open syscall
struct perf_event_attr attr; // configuration passed to perf_event_open
uint64_t value; // latest reading, copied out of the group read by stop_perf_monitoring()
const char *description; // column name printed by append_perf_header()
};

// Table of monitored events. Entry 0 is opened first and its fd is used as the group leader.
struct perf_select_data perf_selected_events[PERF_NUM_EVENTS];
// Scratch buffer that stop_perf_monitoring() reads the whole group into.
struct perf_read_data perfReadData;
// Wall-clock timestamps taken by start_perf_monitoring() and stop_perf_monitoring() respectively.
struct timeval perf_startTv, perf_endTv;
// Time between those two timestamps in milliseconds; printed by append_perf_values().
uint64_t perf_time_ms;

// Clears *attr and fills in the settings shared by every counter opened from this file.
//   attr - structure to populate; any previous contents are discarded
//   cfg  - event selector, kept in the low 32 bits of attr->config
//   hwid - PMU id (atom/core) placed in the high 32 bits of attr->config.
//          Get it from /sys/devices/<the thing>/type; on Arrow Lake, atom = 10, core = 4
// The type is always set to PERF_TYPE_HARDWARE; callers that want something else
// overwrite attr->type afterwards.
void initialize_hw_event(struct perf_event_attr *attr, uint64_t cfg, uint32_t hwid) {
    const uint64_t pmu_bits = (uint64_t)hwid << 32;

    memset(attr, 0, sizeof(*attr));
    attr->size = sizeof(*attr);
    attr->type = PERF_TYPE_HARDWARE;
    attr->config = pmu_bits | cfg;

    // created in the stopped state, with kernel and hypervisor activity excluded
    attr->disabled = 1;
    attr->exclude_kernel = 1;
    attr->exclude_hv = 1;

    // include child threads
    attr->inherit = 1;

    // reads return the whole group at once, each value tagged with its event id
    attr->read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
}

// Opens the counter described by evt->attr and stores its fd and kernel-assigned id in *evt.
//   evt     - event whose attr has already been filled in; fd and id are written here
//   groupfd - fd of the group leader to join, or -1 to open the event as its own leader
// The syscall is issued with pid = 0 and cpu = -1.
// On failure the reason is reported on stderr, evt->fd is left negative and evt->id is 0,
// so later code can tell that the event never opened.
void set_hw_event(struct perf_select_data *evt, int groupfd) {
    evt->id = 0;

    evt->fd = (int)syscall(__NR_perf_event_open, &(evt->attr), 0, -1, groupfd, 0);
    if (evt->fd < 0) {
        perror("perf_event_open");
        return; // no descriptor to query, so skip the id lookup
    }

    if (ioctl(evt->fd, PERF_EVENT_IOC_ID, &(evt->id)) < 0) {
        perror("ioctl(PERF_EVENT_IOC_ID)");
        evt->id = 0;
    }
}

void open_perf_monitoring() {
int groupLeaderFd = -1;
memset(perf_selected_events, 0, sizeof(struct perf_select_data) * PERF_NUM_EVENTS);

perf_selected_events[0].description = "instructions";
initialize_hw_event(&(perf_selected_events[0].attr), PERF_COUNT_HW_INSTRUCTIONS, 0);
set_hw_event(perf_selected_events, -1);
groupLeaderFd = perf_selected_events[0].fd;

perf_selected_events[1].description = "cycles";
initialize_hw_event(&(perf_selected_events[1].attr), PERF_COUNT_HW_CPU_CYCLES, 0);
set_hw_event(perf_selected_events + 1, groupLeaderFd);

perf_selected_events[2].description = "llc_ref";
initialize_hw_event(&(perf_selected_events[2].attr), 0x4F2E, 0);
perf_selected_events[2].attr.type = PERF_TYPE_RAW;
set_hw_event(perf_selected_events + 2, groupLeaderFd);

perf_selected_events[3].description = "llc_miss";
initialize_hw_event(&(perf_selected_events[3].attr), 0x412E, 0);
perf_selected_events[3].attr.type = PERF_TYPE_RAW;
set_hw_event(perf_selected_events + 3, groupLeaderFd);
}

// Records the start timestamp, then resets and enables the counter group.
void start_perf_monitoring() {
    const int leader = perf_selected_events[0].fd;

    gettimeofday(&perf_startTv, NULL);

    // both requests go to the leader fd with PERF_IOC_FLAG_GROUP set
    ioctl(leader, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
    ioctl(leader, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}

// NOTE(review): none of the functions visible in this header read or write these four;
// confirm whether any includer still relies on them.
uint64_t instrs, cycles, llcRef, llcMiss;

// Disables the counter group, reads every member in one go and stores each reading in the
// matching perf_selected_events[].value, then updates perf_endTv and perf_time_ms.
// If the group read fails or comes back short, the problem is reported on stderr and the
// affected values are left at 0 rather than showing numbers from an earlier measurement.
void stop_perf_monitoring() {
    int groupLeaderFd = perf_selected_events[0].fd;
    ioctl(groupLeaderFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);

    ssize_t readbytes = read(groupLeaderFd, &perfReadData, sizeof(struct perf_read_data));

    // Work out how many (value, id) pairs can be trusted: never more than the read actually
    // delivered, and never more than the buffer can hold.
    uint64_t entries = 0;
    if (readbytes < 0) {
        perror("read perf counters");
    } else if ((size_t)readbytes < sizeof(perfReadData.nr)) {
        fprintf(stderr, "Short read of perf counters (%ld bytes)\n", (long)readbytes);
    } else {
        uint64_t delivered = ((uint64_t)readbytes - sizeof(perfReadData.nr)) / sizeof(perfReadData.values[0]);
        entries = perfReadData.nr;
        if (entries > delivered) entries = delivered;
        if (entries > PERF_NUM_EVENTS) entries = PERF_NUM_EVENTS;
    }

    // Start from zero so an event missing from this read does not keep its previous value.
    for (int evt_idx = 0; evt_idx < PERF_NUM_EVENTS; evt_idx++) {
        perf_selected_events[evt_idx].value = 0;
    }

    // The kernel tags each reading with an event id; match it back to the table entry.
    for (uint64_t i = 0; i < entries; i++) {
        for (int evt_idx = 0; evt_idx < PERF_NUM_EVENTS; evt_idx++) {
            if (perf_selected_events[evt_idx].id == perfReadData.values[i].id) {
                perf_selected_events[evt_idx].value = perfReadData.values[i].value;
            }
        }
    }

    gettimeofday(&perf_endTv, NULL);
    perf_time_ms = ((perf_endTv.tv_sec - perf_startTv.tv_sec) * 1000 + (perf_endTv.tv_usec - perf_startTv.tv_usec) / 1000);
}

// Closes the counter fds that were actually opened and marks every slot as closed.
// A slot whose fd is 0 was never opened (the table is zero-filled until an open succeeds),
// and a negative fd means the open failed; neither may be passed to close(), since
// closing fd 0 would shut the process's stdin. Safe to call repeatedly or without a prior open.
// NOTE: a counter that genuinely landed on fd 0 (stdin closed beforehand) is not closed here.
void close_perf_monitoring() {
    for (int evt_idx = 0; evt_idx < PERF_NUM_EVENTS; evt_idx++) {
        int *fd = &(perf_selected_events[evt_idx].fd);
        if (*fd > 0) {
            close(*fd);
        }
        *fd = -1; // prevents a second call from closing a recycled descriptor
    }
}

void append_perf_header() {
for (int evt_idx = 0; evt_idx < PERF_NUM_EVENTS; evt_idx++) {
printf(",%s", perf_selected_events[evt_idx].description);
}

printf(",Time (ms)");
}

// Appends one ",<value>" CSV field per event, in the same order as append_perf_header(),
// followed by the elapsed time in milliseconds. No newline is written.
void append_perf_values() {
    for (int evt_idx = 0; evt_idx < PERF_NUM_EVENTS; evt_idx++) {
        // uint64_t is not unsigned long on every target, so convert explicitly to match %llu
        printf(",%llu", (unsigned long long)perf_selected_events[evt_idx].value);
    }

    printf(",%llu", (unsigned long long)perf_time_ms);
}
4 changes: 2 additions & 2 deletions LoadedMemoryLatency/Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
amd64:
gcc -O3 LoadedMemoryLatency.c LoadedMemoryLatency_amd64.s -o LoadedMemoryLatency -lm
gcc -O3 LoadedMemoryLatency.c LoadedMemoryLatency_amd64.s -o loadedlat_amd64 -lm
aarch64:
gcc -O3 LoadedMemoryLatency.c LoadedMemoryLatency_arm.s -o LoadedMemoryLatency -lm
gcc -O3 LoadedMemoryLatency.c LoadedMemoryLatency_arm.s -o loadedlat_aarch64 -lm
54 changes: 46 additions & 8 deletions MemoryBandwidth/MemoryBandwidth.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,26 @@
#include <stdint.h>
#include <string.h>

#ifndef __MINGW32__
#include <sys/syscall.h>
#endif

#include <sys/time.h>
#include <unistd.h>
#include <sched.h>
#include <pthread.h>
#include <sched.h>
#include <math.h>
#include <sys/mman.h>
#include <errno.h>

#ifndef __MINGW32__
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include "../Common/perfmon.h"
#endif

#ifdef NUMA
#include <sys/sysinfo.h>
#include <numa.h>
Expand Down Expand Up @@ -118,6 +125,8 @@ int hardaffinity = 0;
int numa = 0;
#endif

int pmon = 0;

int main(int argc, char *argv[]) {
int threads = 1;
int cpuid_data[4];
Expand Down Expand Up @@ -188,6 +197,12 @@ int main(int argc, char *argv[]) {
autothreads = atoi(argv[argIdx]);
fprintf(stderr, "Testing bw scaling up to %d threads\n", autothreads);
}
#ifndef __MINGW32__
else if (strncmp(arg, "pmon", 4) == 0) {
pmon = 1;
fprintf(stderr, "Using hardware performance monitoring\n");
}
#endif
#ifdef NUMA
else if (strncmp(arg, "numa", 4) == 0) {
argIdx++;
Expand Down Expand Up @@ -423,18 +438,35 @@ int main(int argc, char *argv[]) {
#endif
else {
printf("Using %d threads\n", threads);
printf("Size (KB),Bandwidth (GB/s)");
#ifndef __MINGW32__
if (pmon) {
open_perf_monitoring();
append_perf_header();
}
#endif
printf("\n");
if (singleSize == 0)
{
for (int i = 0; i < testSizeCount; i++)
{
printf("%d,%f\n", default_test_sizes[i], MeasureBw(default_test_sizes[i], GetIterationCount(default_test_sizes[i], threads), threads, shared, nopBytes, 0, 0));
printf("%d,%f", default_test_sizes[i], MeasureBw(default_test_sizes[i], GetIterationCount(default_test_sizes[i], threads), threads, shared, nopBytes, 0, 0));

#ifndef __MINGW32__
if (pmon) append_perf_values();
#endif
printf("\n");
if (sleepTime > 0) sleep(sleepTime);
}
}
else
{
printf("%d,%f\n", singleSize, MeasureBw(singleSize, GetIterationCount(singleSize, threads), threads, shared, nopBytes, 0, 0));
printf("%d,%f", singleSize, MeasureBw(singleSize, GetIterationCount(singleSize, threads), threads, shared, nopBytes, 0, 0));
append_perf_values();
printf("\n");
}

close_perf_monitoring();
}

return 0;
Expand Down Expand Up @@ -753,11 +785,17 @@ float MeasureBw(uint64_t sizeKb, uint64_t iterations, uint64_t threads, int shar
//int pthreadRc = pthread_create(testThreads + i, NULL, ReadBandwidthTestThread, (void *)(threadData + i));
}


uint64_t instructions, cycles;
#ifndef __MINGW32__
if (pmon) start_perf_monitoring();
#endif
gettimeofday(&startTv, &startTz);
for (uint64_t i = 0; i < threads; i++) pthread_create(testThreads + i, NULL, ReadBandwidthTestThread, (void *)(threadData + i));
for (uint64_t i = 0; i < threads; i++) pthread_join(testThreads[i], NULL);
gettimeofday(&endTv, &endTz);
#ifndef __MINGW32__
if (pmon) stop_perf_monitoring(&instructions, &cycles);
#endif

uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
double gbTransferred = iterations * sizeof(float) * elements * threads / (double)1e9;
Expand Down

0 comments on commit d1786ed

Please sign in to comment.