-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a8d7830
commit 5b3cea8
Showing
7 changed files
with
286 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Builds the arm_mt_instructionrate binary with gcc (pthreads enabled) from the
# C driver, the assembly source and the shared timing code in ../Common.
aarch64: | ||
gcc -pthread mt_instructionrate.c arm_mt_instructionrate.s ../Common/timing.c -o arm_mt_instructionrate |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
// Test kernels defined outside this file (the accompanying assembly source
// exports these four symbols). Each takes an iteration count and a pointer to
// an input buffer.
extern uint64_t vec_int32_add_test(uint64_t iterations, void *data); | ||
extern uint64_t vec_int32_mul_test(uint64_t iterations, void *data); | ||
extern uint64_t vec_fp32_add_test(uint64_t iterations, void *data); | ||
extern uint64_t vec_fp32_fma_test(uint64_t iterations, void *data); | ||
|
||
void RunTests() { | ||
uint64_t iterations = 3500000000; | ||
int testDataLength = 256; | ||
uint32_t *intTestArr = (uint32_t *)malloc(sizeof(uint32_t) * testDataLength); | ||
uint32_t *fpTestArr = (uint32_t *)malloc(sizeof(uint32_t) * testDataLength); | ||
for (int i = 0; i < testDataLength; i++) { | ||
intTestArr[i] = i; | ||
fpTestArr[i] = i * 1.2f; | ||
} | ||
|
||
fprintf(stderr, "Measuring INT32 adds\n"); | ||
float int32adds = measureFunction(iterations, vec_int32_add_test, intTestArr); | ||
fprintf(stderr, "Measuring INT32 multiplies\n"); | ||
float int32muls = measureFunction(iterations, vec_int32_mul_test, intTestArr); | ||
fprintf(stderr, "Measuring FP32 adds\n"); | ||
float fp32adds = measureFunction(iterations, vec_fp32_add_test, fpTestArr); | ||
fprintf(stderr, "Measuring FP32 FMAs\n"); | ||
float fp32fmas = measureFunction(iterations, vec_fp32_fma_test, fpTestArr); | ||
|
||
printf("-----GOPS/s-----\n"); | ||
printf("INT32 Add: %f\n", int32adds); | ||
printf("INT32 Multiply: %f\n", int32muls); | ||
printf("FP32 Add: %f\n", fp32adds); | ||
printf("FP32 FMA: %f\n", fp32fmas); | ||
|
||
free(intTestArr); | ||
free(fpTestArr); | ||
return; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
.text | ||
|
||
.global vec_int32_add_test | ||
.global vec_int32_mul_test | ||
.global vec_fp32_add_test | ||
.global vec_fp32_fma_test | ||
|
||
/* x0 = iteration count, x1 = data */ | ||
/*
 * vec_int32_add_test: x0 = iteration count, x1 = pointer to input data.
 * Loads the same 16 bytes at [x1] into q16-q21, then loops over six 4-lane
 * 32-bit integer vector adds, each register added to itself.
 * Every pass subtracts 24 from x0 (6 instructions x 4 lanes) and the loop
 * continues while x0 is greater than zero (signed compare).
 * x0 is not rewritten before ret, so it holds the final counter value.
 */
vec_int32_add_test: | ||
mov x14, 24 | ||
ldr q16, [x1] | ||
ldr q17, [x1] | ||
ldr q18, [x1] | ||
ldr q19, [x1] | ||
ldr q20, [x1] | ||
ldr q21, [x1] | ||
vec_int32_add_test_loop: | ||
add v16.4s, v16.4s, v16.4s | ||
add v17.4s, v17.4s, v17.4s | ||
add v18.4s, v18.4s, v18.4s | ||
add v19.4s, v19.4s, v19.4s | ||
add v20.4s, v20.4s, v20.4s | ||
add v21.4s, v21.4s, v21.4s | ||
/* account for the 24 lane operations issued in this pass */
sub x0, x0, x14 | ||
cmp x0, 0 | ||
b.gt vec_int32_add_test_loop | ||
ret | ||
|
||
/*
 * vec_int32_mul_test: x0 = iteration count, x1 = pointer to input data.
 * Loads the same 16 bytes at [x1] into q16-q21, then loops over six 4-lane
 * 32-bit integer vector multiplies, each register multiplied by itself.
 * Every pass subtracts 24 from x0 (6 instructions x 4 lanes) and the loop
 * continues while x0 is greater than zero (signed compare).
 * x0 is not rewritten before ret, so it holds the final counter value.
 */
vec_int32_mul_test: | ||
mov x14, 24 | ||
ldr q16, [x1] | ||
ldr q17, [x1] | ||
ldr q18, [x1] | ||
ldr q19, [x1] | ||
ldr q20, [x1] | ||
ldr q21, [x1] | ||
vec_int32_mul_test_loop: | ||
mul v16.4s, v16.4s, v16.4s | ||
mul v17.4s, v17.4s, v17.4s | ||
mul v18.4s, v18.4s, v18.4s | ||
mul v19.4s, v19.4s, v19.4s | ||
mul v20.4s, v20.4s, v20.4s | ||
mul v21.4s, v21.4s, v21.4s | ||
/* account for the 24 lane operations issued in this pass */
sub x0, x0, x14 | ||
cmp x0, 0 | ||
b.gt vec_int32_mul_test_loop | ||
ret | ||
|
||
/*
 * vec_fp32_add_test: x0 = iteration count, x1 = pointer to input data.
 * Loads the same 16 bytes at [x1] into q16-q21, then loops over six 4-lane
 * single-precision vector adds, each register added to itself.
 * Every pass subtracts 24 from x0 (6 instructions x 4 lanes) and the loop
 * continues while x0 is greater than zero (signed compare).
 * x0 is not rewritten before ret, so it holds the final counter value.
 */
vec_fp32_add_test: | ||
mov x14, 24 | ||
ldr q16, [x1] | ||
ldr q17, [x1] | ||
ldr q18, [x1] | ||
ldr q19, [x1] | ||
ldr q20, [x1] | ||
ldr q21, [x1] | ||
vec_fp32_add_test_loop: | ||
fadd v16.4s, v16.4s, v16.4s | ||
fadd v17.4s, v17.4s, v17.4s | ||
fadd v18.4s, v18.4s, v18.4s | ||
fadd v19.4s, v19.4s, v19.4s | ||
fadd v20.4s, v20.4s, v20.4s | ||
fadd v21.4s, v21.4s, v21.4s | ||
/* account for the 24 lane operations issued in this pass */
sub x0, x0, x14 | ||
cmp x0, 0 | ||
b.gt vec_fp32_add_test_loop | ||
ret | ||
|
||
/*
 * vec_fp32_fma_test: x0 = iteration count, x1 = pointer to input data.
 * Loads the same 16 bytes at [x1] into q16-q21, then loops over six 4-lane
 * single-precision fused multiply-adds. Each register is both the accumulator
 * and both multiplicands (v = v + v * v).
 * Every pass subtracts 24 from x0 (6 instructions x 4 lanes), so one FMA lane
 * is counted as a single operation. The loop continues while x0 is greater
 * than zero (signed compare).
 * x0 is not rewritten before ret, so it holds the final counter value.
 */
vec_fp32_fma_test: | ||
mov x14, 24 | ||
ldr q16, [x1] | ||
ldr q17, [x1] | ||
ldr q18, [x1] | ||
ldr q19, [x1] | ||
ldr q20, [x1] | ||
ldr q21, [x1] | ||
vec_fp32_fma_test_loop: | ||
fmla v16.4s, v16.4s, v16.4s | ||
fmla v17.4s, v17.4s, v17.4s | ||
fmla v18.4s, v18.4s, v18.4s | ||
fmla v19.4s, v19.4s, v19.4s | ||
fmla v20.4s, v20.4s, v20.4s | ||
fmla v21.4s, v21.4s, v21.4s | ||
/* account for the 24 lane operations issued in this pass */
sub x0, x0, x14 | ||
cmp x0, 0 | ||
b.gt vec_fp32_fma_test_loop | ||
ret |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
#define _GNU_SOURCE | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include <stdint.h> | ||
#include <math.h> | ||
#include <string.h> | ||
#include <pthread.h> | ||
|
||
#include "../Common/timing.h" | ||
|
||
// Work description for one test thread; a pointer to it is the argument
// passed to TestThread.
struct TestThreadData {
    // Elapsed run time in milliseconds, written by the thread itself.
    float timeMs;

    // Iteration count handed to testfunc.
    uint64_t iterations;

    // Input buffer handed to testfunc.
    void *testData;

    // Core to pin the thread to. A negative value (-1) leaves affinity unset.
    int core;

    // Kernel to execute, invoked as testfunc(iterations, testData).
    uint64_t (*testfunc)(uint64_t, void *);
};
|
||
// Runs testFunc on threadCount threads and returns the measured rate in
// billions of operations per second.
float measureFunction(uint64_t baseIterations, uint64_t (*testFunc)(uint64_t, void *), void *data); | ||
// pthread entry point; param points to a struct TestThreadData.
void *TestThread(void *param); | ||
|
||
// Number of test threads to launch; set by the -threads or -cores option.
int threadCount = 1; | ||
// Cores to pin threads to, filled in by -cores. While NULL, thread i is
// pinned to core i.
int *coreList = NULL; | ||
|
||
// Architecture-specific test driver, pulled in as source. main() calls
// RunTests(), which is presumably defined by this include; no alternative is
// provided here for non-AArch64 builds.
#ifdef __aarch64__ | ||
#include "arm_mt_instructionrate.c" | ||
#endif | ||
|
||
int main(int argc, char *argv[]) { | ||
char parseBuffer[512]; | ||
int parseIndices[64]; | ||
|
||
for (int argIdx = 1; argIdx < argc; argIdx++) { | ||
if (*(argv[argIdx]) == '-') { | ||
char *arg = argv[argIdx] + 1; | ||
if (strncmp(arg, "threads", 7) == 0) { | ||
argIdx++; | ||
threadCount = atoi(argv[argIdx]); | ||
fprintf(stderr, "Using first %d cores\n", threadCount); | ||
} else if (strncmp(arg, "cores", 5) == 0) { | ||
argIdx++; | ||
|
||
// whatever just parse it here | ||
strncpy(parseBuffer, argv[argIdx], 511); | ||
parseIndices[0] = 0; | ||
int indexIdx = 1; | ||
threadCount = 1; | ||
for (int i = 0; i < 512 && indexIdx < 64; i++) { | ||
if (parseBuffer[i] == ',') { | ||
parseBuffer[i] = '\0'; | ||
parseIndices[indexIdx] = i + 1; | ||
indexIdx++; | ||
threadCount++; | ||
} | ||
} | ||
|
||
coreList = malloc(sizeof(int) * threadCount); | ||
|
||
fprintf(stderr, "Using %d cores:", threadCount); | ||
for (int i = 0;i < threadCount; i++) { | ||
coreList[i] = atoi(parseBuffer + parseIndices[i]); | ||
fprintf(stderr, " %d", coreList[i]); | ||
} | ||
|
||
fprintf(stderr, "\n"); | ||
} | ||
} | ||
} | ||
|
||
RunTests(); | ||
|
||
free(coreList); | ||
return 0; | ||
} | ||
|
||
// return billion operations per second | ||
// test function must perform iterations ops | ||
float measureFunction(uint64_t baseIterations, uint64_t (*testFunc)(uint64_t, void *), void *data){ | ||
int toleranceMet = 0, minTimeMet = 0; | ||
unsigned int timeMs; | ||
pthread_t *testThreads = (pthread_t *)malloc(threadCount * sizeof(pthread_t)); | ||
struct TestThreadData *testData = (struct TestThreadData *)malloc(threadCount * sizeof(struct TestThreadData)); | ||
for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) { | ||
testData[threadIdx].iterations = baseIterations; | ||
testData[threadIdx].testData = data; | ||
testData[threadIdx].testfunc = testFunc; | ||
if (coreList == NULL) testData[threadIdx].core = threadIdx; | ||
else testData[threadIdx].core = coreList[threadIdx]; | ||
} | ||
|
||
do { | ||
start_timing(); | ||
for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) { | ||
pthread_create(testThreads + threadIdx, NULL, TestThread, testData + threadIdx); | ||
} | ||
|
||
float maxThreadTime = -1, minThreadTime = -1; | ||
for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) { | ||
pthread_join(testThreads[threadIdx], NULL); | ||
fprintf(stderr, "Thread %d took %f ms\n", threadIdx, testData[threadIdx].timeMs); | ||
if (maxThreadTime < 0 || testData[threadIdx].timeMs > maxThreadTime) maxThreadTime = testData[threadIdx].timeMs; | ||
if (minThreadTime < 0 || testData[threadIdx].timeMs < minThreadTime) minThreadTime = testData[threadIdx].timeMs; | ||
} | ||
|
||
timeMs = end_timing(); | ||
minTimeMet = timeMs > 2000; // see if 2 seconds will work | ||
toleranceMet = ((maxThreadTime - minThreadTime) / minThreadTime) < 0.1f; // allow 10% variation? | ||
|
||
if (!minTimeMet) { | ||
// Increase iteration count with 3s target | ||
baseIterations = scale_iterations_to_target(baseIterations, (float)timeMs, 3000.0f); | ||
for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) { | ||
testData[threadIdx].iterations = baseIterations; | ||
} | ||
|
||
fprintf(stderr, "Setting %lu iterations\n", baseIterations); | ||
} else if (!toleranceMet) { | ||
for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) { | ||
testData[threadIdx].iterations = scale_iterations_to_target( | ||
testData[threadIdx].iterations, | ||
testData[threadIdx].timeMs, | ||
maxThreadTime); | ||
fprintf(stderr, "Thread %d -> %lu iterations\n", threadIdx, testData[threadIdx].iterations); | ||
} | ||
} | ||
} while ((!toleranceMet) || (!minTimeMet)); | ||
|
||
fprintf(stderr, "time elapsed: %d ms\n", timeMs); | ||
|
||
uint64_t totalIterations = 0; | ||
for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) { | ||
totalIterations += testData[threadIdx].iterations; | ||
} | ||
|
||
free(testData); | ||
free(testThreads); | ||
|
||
return (1000 * totalIterations / timeMs) / 1e9; | ||
} | ||
|
||
void *TestThread(void *param) { | ||
struct TestThreadData *testData = (struct TestThreadData *)param; | ||
struct timeval startTv, endTv; | ||
if (testData->core >= 0) { | ||
cpu_set_t cpuset; | ||
CPU_ZERO(&cpuset); | ||
CPU_SET(testData->core, &cpuset); | ||
sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset); | ||
} | ||
|
||
gettimeofday(&startTv, NULL); | ||
testData->testfunc(testData->iterations, testData->testData); | ||
gettimeofday(&endTv, NULL); | ||
testData->timeMs = (float)((endTv.tv_sec - startTv.tv_sec) * 1000 + (endTv.tv_usec - startTv.tv_usec) / 1000); | ||
return NULL; | ||
} |