Skip to content

Commit

Permalink
mt irate
Browse files Browse the repository at this point in the history
  • Loading branch information
clamchowder committed Jan 16, 2024
1 parent a8d7830 commit 5b3cea8
Show file tree
Hide file tree
Showing 7 changed files with 286 additions and 0 deletions.
6 changes: 6 additions & 0 deletions Common/timing.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,9 @@ unsigned int end_timing() {
return (unsigned int)((endTv.tv_sec - startTv.tv_sec) * 1000 + (endTv.tv_usec - startTv.tv_usec) / 1000);
}
#endif

// Scales an iteration count so that the next run should take about target_time.
//
// last_iteration_count: iteration count used for the previous run
// last_time:            how long the previous run took
// target_time:          how long the next run should take (same unit as last_time)
//
// Returns the new iteration count. The result saturates at the largest
// unsigned long long value instead of wrapping or overflowing.
unsigned long long scale_iterations_to_target(unsigned long long last_iteration_count, float last_time, float target_time) {
    const unsigned long long maxCount = ~0ULL;

    // safety measure to deal with nasty timer precision issues if the system is fast
    if (last_time < 50) {
        if (last_iteration_count > maxCount / 2) return maxCount;
        return last_iteration_count * 2;
    }

    // Scale in double precision: a float only has a 24-bit mantissa, which would
    // round any iteration count above ~16.7 million.
    double scaled = (double)last_iteration_count * ((double)target_time / (double)last_time);

    // (double)maxCount rounds up to 2^64; converting anything that large back to
    // an integer is undefined, so clamp instead.
    if (scaled >= (double)maxCount) return maxCount;
    return (unsigned long long)scaled;
}
1 change: 1 addition & 0 deletions Common/timing.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@
// Timestamps shared with the timing implementation; defined outside this header.
extern struct timeb start, end;
// Marks the beginning of a timed region (definition not visible in this header).
inline void start_timing();
// Ends the timed region and returns the elapsed time as an unsigned int.
// NOTE(review): the gettimeofday-based implementation returns milliseconds - confirm the other variant matches.
inline unsigned int end_timing();
// Returns a new iteration count derived from the previous count, the time the
// previous run took (last_time) and the desired run time (target_time).
unsigned long long scale_iterations_to_target(unsigned long long last_iteration_count, float last_time, float target_time);
#endif
2 changes: 2 additions & 0 deletions mt_instructionrate/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Builds the aarch64 multithreaded instruction rate test: the C driver, the
# assembly test kernels and the shared timing code, compiled with -pthread.
aarch64:
gcc -pthread mt_instructionrate.c arm_mt_instructionrate.s ../Common/timing.c -o arm_mt_instructionrate
Binary file added mt_instructionrate/arm_mt_instructionrate
Binary file not shown.
34 changes: 34 additions & 0 deletions mt_instructionrate/arm_mt_instructionrate.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Test kernels implemented in arm_mt_instructionrate.s.
// Each takes an iteration count and a pointer to test data; the kernels load
// their initial vector register contents from that pointer.
extern uint64_t vec_int32_add_test(uint64_t iterations, void *data);
extern uint64_t vec_int32_mul_test(uint64_t iterations, void *data);
extern uint64_t vec_fp32_add_test(uint64_t iterations, void *data);
extern uint64_t vec_fp32_fma_test(uint64_t iterations, void *data);

/*
 * Runs every aarch64 instruction rate test and reports the results.
 *
 * Progress messages go to stderr; the result table (billions of operations
 * per second, as returned by measureFunction) goes to stdout.
 */
void RunTests() {
    uint64_t iterations = 3500000000;
    int testDataLength = 256;
    uint32_t *intTestArr = malloc(sizeof(*intTestArr) * testDataLength);
    // The FP kernels load this buffer straight into vector registers, so it has
    // to hold real float values. Storing i * 1.2f into an integer array would
    // truncate to small integers whose bit patterns are denormal floats, and
    // denormal arithmetic can be much slower than normal FP on some cores.
    float *fpTestArr = malloc(sizeof(*fpTestArr) * testDataLength);
    if (intTestArr == NULL || fpTestArr == NULL) {
        fprintf(stderr, "Failed to allocate test data\n");
        free(intTestArr);
        free(fpTestArr);
        return;
    }

    for (int i = 0; i < testDataLength; i++) {
        intTestArr[i] = i;
        fpTestArr[i] = i * 1.2f;
    }

    fprintf(stderr, "Measuring INT32 adds\n");
    float int32adds = measureFunction(iterations, vec_int32_add_test, intTestArr);
    fprintf(stderr, "Measuring INT32 multiplies\n");
    float int32muls = measureFunction(iterations, vec_int32_mul_test, intTestArr);
    fprintf(stderr, "Measuring FP32 adds\n");
    float fp32adds = measureFunction(iterations, vec_fp32_add_test, fpTestArr);
    fprintf(stderr, "Measuring FP32 FMAs\n");
    float fp32fmas = measureFunction(iterations, vec_fp32_fma_test, fpTestArr);

    printf("-----GOPS/s-----\n");
    printf("INT32 Add: %f\n", int32adds);
    printf("INT32 Multiply: %f\n", int32muls);
    printf("FP32 Add: %f\n", fp32adds);
    printf("FP32 FMA: %f\n", fp32fmas);

    free(intTestArr);
    free(fpTestArr);
    return;
}
87 changes: 87 additions & 0 deletions mt_instructionrate/arm_mt_instructionrate.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
.text

.global vec_int32_add_test
.global vec_int32_mul_test
.global vec_fp32_add_test
.global vec_fp32_fma_test

/*
 * All test functions below share the same calling shape:
 * x0 = iteration count, x1 = data
 *
 * vec_int32_add_test: loop of 128-bit vector integer adds on 32-bit lanes.
 * v16-v21 are all loaded from the same 16 bytes at [x1]. Each add reads and
 * writes only its own register, so the six chains are independent of each other.
 * x14 = 24 is subtracted from x0 every pass, which matches 6 instructions x 4
 * lanes per pass. The loop repeats while x0 is greater than zero as a signed
 * value. x14 and v16-v21 are overwritten without being saved.
 */
vec_int32_add_test:
mov x14, 24
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
vec_int32_add_test_loop:
add v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
add v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
add v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
/* count down by the number of lane operations issued in this pass */
sub x0, x0, x14
cmp x0, 0
b.gt vec_int32_add_test_loop
ret

/*
 * vec_int32_mul_test: loop of 128-bit vector integer multiplies on 32-bit lanes.
 * x0 = iteration count, x1 = data (16 bytes are loaded into each of v16-v21).
 * Each multiply squares its own register in place, so the six chains are
 * independent of each other. x0 drops by x14 = 24 per pass (6 instructions x 4
 * lanes) and the loop repeats while x0 is greater than zero as a signed value.
 */
vec_int32_mul_test:
mov x14, 24
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
vec_int32_mul_test_loop:
mul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
mul v18.4s, v18.4s, v18.4s
mul v19.4s, v19.4s, v19.4s
mul v20.4s, v20.4s, v20.4s
mul v21.4s, v21.4s, v21.4s
/* count down by the number of lane operations issued in this pass */
sub x0, x0, x14
cmp x0, 0
b.gt vec_int32_mul_test_loop
ret

/*
 * vec_fp32_add_test: loop of 128-bit vector FP32 adds.
 * x0 = iteration count, x1 = data (16 bytes are loaded into each of v16-v21
 * and then treated as four single-precision lanes).
 * Each fadd doubles its own register in place, so the six chains are
 * independent of each other. x0 drops by x14 = 24 per pass (6 instructions x 4
 * lanes) and the loop repeats while x0 is greater than zero as a signed value.
 */
vec_fp32_add_test:
mov x14, 24
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
vec_fp32_add_test_loop:
fadd v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fadd v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
fadd v20.4s, v20.4s, v20.4s
fadd v21.4s, v21.4s, v21.4s
/* count down by the number of lane operations issued in this pass */
sub x0, x0, x14
cmp x0, 0
b.gt vec_fp32_add_test_loop
ret

/*
 * vec_fp32_fma_test: loop of 128-bit vector FP32 fused multiply-adds.
 * x0 = iteration count, x1 = data (16 bytes are loaded into each of v16-v21
 * and then treated as four single-precision lanes).
 * Each fmla accumulates the square of its own register into itself, so the six
 * chains are independent of each other. x0 drops by x14 = 24 per pass, the same
 * step as the other tests, so each FMA lane counts as one operation. The loop
 * repeats while x0 is greater than zero as a signed value.
 */
vec_fp32_fma_test:
mov x14, 24
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
vec_fp32_fma_test_loop:
fmla v16.4s, v16.4s, v16.4s
fmla v17.4s, v17.4s, v17.4s
fmla v18.4s, v18.4s, v18.4s
fmla v19.4s, v19.4s, v19.4s
fmla v20.4s, v20.4s, v20.4s
fmla v21.4s, v21.4s, v21.4s
/* count down by the number of lane operations issued in this pass */
sub x0, x0, x14
cmp x0, 0
b.gt vec_fp32_fma_test_loop
ret
156 changes: 156 additions & 0 deletions mt_instructionrate/mt_instructionrate.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <string.h>
#include <pthread.h>

#include "../Common/timing.h"

// Per-thread work description handed to TestThread, plus the timing result it writes back.
struct TestThreadData {
float timeMs; // written by thread to indicate elapsed runtime for that thread
uint64_t iterations; // iteration count passed as the first argument to testfunc
void *testData; // opaque data pointer passed as the second argument to testfunc
int core; // -1 = don't set affinity. otherwise set affinity to specified core
uint64_t (*testfunc)(uint64_t, void *); // test routine the thread runs and times
};

// Forward declarations: the architecture-specific file included below calls
// measureFunction before its definition appears in this file.
float measureFunction(uint64_t baseIterations, uint64_t (*testFunc)(uint64_t, void *), void *data);
void *TestThread(void *param);

// Number of test threads to run. Defaults to 1; main sets it from -threads or -cores.
int threadCount = 1;
// Core to pin each thread to, indexed by thread. NULL unless -cores was given,
// in which case measureFunction uses the thread index as the core number.
int *coreList = NULL;

// Pulls in the aarch64 test list (RunTests) directly into this translation unit.
#ifdef __aarch64__
#include "arm_mt_instructionrate.c"
#endif

/*
 * Entry point: parses command line options, then runs the tests.
 *
 * Options (matched by prefix, as before):
 *   -threads N     run N threads, pinned to cores 0..N-1
 *   -cores a,b,c   run one thread per listed core (at most 64 entries)
 *
 * If both options are given, the last one wins. Returns 0 on success and 1 if
 * an option is missing its value, is invalid, or memory cannot be allocated.
 */
int main(int argc, char *argv[]) {
    char parseBuffer[512];
    int parseIndices[64];

    for (int argIdx = 1; argIdx < argc; argIdx++) {
        if (*(argv[argIdx]) != '-') continue;

        char *arg = argv[argIdx] + 1;
        if (strncmp(arg, "threads", 7) == 0) {
            // Without this check a trailing "-threads" would read argv[argc], which is NULL
            if (argIdx + 1 >= argc) {
                fprintf(stderr, "-threads requires a thread count\n");
                free(coreList);
                return 1;
            }

            argIdx++;
            threadCount = atoi(argv[argIdx]);
            if (threadCount < 1) {
                fprintf(stderr, "Invalid thread count: %s\n", argv[argIdx]);
                free(coreList);
                return 1;
            }

            // Drop any earlier -cores list: it may hold fewer entries than the
            // new thread count, and "first N cores" means cores 0..N-1.
            free(coreList);
            coreList = NULL;
            fprintf(stderr, "Using first %d cores\n", threadCount);
        } else if (strncmp(arg, "cores", 5) == 0) {
            if (argIdx + 1 >= argc) {
                fprintf(stderr, "-cores requires a comma separated core list\n");
                free(coreList);
                return 1;
            }

            argIdx++;

            // Bounded copy that always NUL-terminates (strncpy does not)
            snprintf(parseBuffer, sizeof(parseBuffer), "%s", argv[argIdx]);

            // Split in place: each comma becomes a terminator and the offset
            // just past it is remembered as the start of the next entry.
            parseIndices[0] = 0;
            int indexIdx = 1;
            threadCount = 1;
            for (int i = 0; parseBuffer[i] != '\0' && indexIdx < 64; i++) {
                if (parseBuffer[i] == ',') {
                    parseBuffer[i] = '\0';
                    parseIndices[indexIdx] = i + 1;
                    indexIdx++;
                    threadCount++;
                }
            }

            // Release a list left over from an earlier -cores option
            free(coreList);
            coreList = malloc(sizeof(*coreList) * threadCount);
            if (coreList == NULL) {
                fprintf(stderr, "Failed to allocate core list\n");
                return 1;
            }

            fprintf(stderr, "Using %d cores:", threadCount);
            for (int i = 0; i < threadCount; i++) {
                coreList[i] = atoi(parseBuffer + parseIndices[i]);
                fprintf(stderr, " %d", coreList[i]);
            }

            fprintf(stderr, "\n");
        }
    }

    RunTests();

    free(coreList);
    return 0;
}

// return billion operations per second
// test function must perform iterations ops
//
// baseIterations: starting iteration count given to every thread
// testFunc:       test routine each thread runs
// data:           opaque pointer passed through to testFunc
//
// Runs threadCount threads at a time and repeats until a run takes more than
// 2 seconds and the per-thread times are within 10% of each other. Iteration
// counts are rescaled between runs.
// Returns 0 if threadCount is invalid, memory cannot be allocated, or a thread
// cannot be created.
float measureFunction(uint64_t baseIterations, uint64_t (*testFunc)(uint64_t, void *), void *data){
    int toleranceMet = 0, minTimeMet = 0;
    unsigned int timeMs = 0;

    if (threadCount < 1) {
        fprintf(stderr, "Invalid thread count %d\n", threadCount);
        return 0.0f;
    }

    pthread_t *testThreads = malloc(threadCount * sizeof(*testThreads));
    struct TestThreadData *testData = malloc(threadCount * sizeof(*testData));
    if (testThreads == NULL || testData == NULL) {
        fprintf(stderr, "Failed to allocate state for %d threads\n", threadCount);
        free(testData);
        free(testThreads);
        return 0.0f;
    }

    for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
        testData[threadIdx].timeMs = 0;
        testData[threadIdx].iterations = baseIterations;
        testData[threadIdx].testData = data;
        testData[threadIdx].testfunc = testFunc;
        // without an explicit core list, thread N is pinned to core N
        if (coreList == NULL) testData[threadIdx].core = threadIdx;
        else testData[threadIdx].core = coreList[threadIdx];
    }

    do {
        start_timing();

        int startedThreads = 0;
        int createError = 0;
        for (; startedThreads < threadCount; startedThreads++) {
            createError = pthread_create(testThreads + startedThreads, NULL, TestThread, testData + startedThreads);
            if (createError != 0) break;
        }

        if (createError != 0) {
            // Only join threads that really exist; joining an uninitialized
            // pthread_t is undefined behavior.
            fprintf(stderr, "Failed to create thread %d: %s\n", startedThreads, strerror(createError));
            for (int threadIdx = 0; threadIdx < startedThreads; threadIdx++) {
                pthread_join(testThreads[threadIdx], NULL);
            }

            free(testData);
            free(testThreads);
            return 0.0f;
        }

        float maxThreadTime = -1, minThreadTime = -1;
        for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
            pthread_join(testThreads[threadIdx], NULL);
            fprintf(stderr, "Thread %d took %f ms\n", threadIdx, testData[threadIdx].timeMs);
            if (maxThreadTime < 0 || testData[threadIdx].timeMs > maxThreadTime) maxThreadTime = testData[threadIdx].timeMs;
            if (minThreadTime < 0 || testData[threadIdx].timeMs < minThreadTime) minThreadTime = testData[threadIdx].timeMs;
        }

        timeMs = end_timing();
        minTimeMet = timeMs > 2000; // see if 2 seconds will work
        toleranceMet = ((maxThreadTime - minThreadTime) / minThreadTime) < 0.1f; // allow 10% variation?

        if (!minTimeMet) {
            // Increase iteration count with 3s target
            baseIterations = scale_iterations_to_target(baseIterations, (float)timeMs, 3000.0f);
            for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
                testData[threadIdx].iterations = baseIterations;
            }

            fprintf(stderr, "Setting %llu iterations\n", (unsigned long long)baseIterations);
        } else if (!toleranceMet) {
            // Give faster threads more work so every thread takes about as
            // long as the slowest one did
            for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
                testData[threadIdx].iterations = scale_iterations_to_target(
                    testData[threadIdx].iterations,
                    testData[threadIdx].timeMs,
                    maxThreadTime);
                fprintf(stderr, "Thread %d -> %llu iterations\n", threadIdx, (unsigned long long)testData[threadIdx].iterations);
            }
        }
    } while ((!toleranceMet) || (!minTimeMet));

    fprintf(stderr, "time elapsed: %u ms\n", timeMs);

    uint64_t totalIterations = 0;
    for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
        totalIterations += testData[threadIdx].iterations;
    }

    free(testData);
    free(testThreads);

    // timeMs is above 2000 here because the loop only exits once minTimeMet is set.
    // Floating point avoids overflowing 1000 * totalIterations in 64-bit integers.
    return (float)(((double)totalIterations * 1000.0 / (double)timeMs) / 1e9);
}

// Thread entry point: optionally pins the calling thread to a core, runs the
// test function once, and records how long it took.
//
// param: pointer to this thread's struct TestThreadData. The elapsed time in
//        milliseconds is written to its timeMs field.
// Always returns NULL.
void *TestThread(void *param) {
    struct TestThreadData *testData = (struct TestThreadData *)param;
    struct timeval startTv, endTv;
    if (testData->core >= 0) {
        cpu_set_t cpuset;
        CPU_ZERO(&cpuset);
        CPU_SET(testData->core, &cpuset);
        // pid 0 means "the calling thread", so gettid() (missing from older
        // glibc versions) is not needed
        if (sched_setaffinity(0, sizeof(cpu_set_t), &cpuset) != 0) {
            // Best effort: the test still runs, just without pinning
            fprintf(stderr, "Could not set affinity to core %d\n", testData->core);
        }
    }

    gettimeofday(&startTv, NULL);
    testData->testfunc(testData->iterations, testData->testData);
    gettimeofday(&endTv, NULL);

    // Convert before dividing so the sub-millisecond part is kept instead of
    // being truncated by integer division
    double elapsedMs = (double)(endTv.tv_sec - startTv.tv_sec) * 1000.0 +
                       (double)(endTv.tv_usec - startTv.tv_usec) / 1000.0;
    testData->timeMs = (float)elapsedMs;
    return NULL;
}

0 comments on commit 5b3cea8

Please sign in to comment.