-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding ATMI as the runtime layer to launch Chapel's generated GPU kernels
- Loading branch information
Showing
7 changed files
with
302 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
#ifndef _chpl_atmi_h_ | ||
#define _chpl_atmi_h_ | ||
|
||
#include <atmi_runtime.h> | ||
#include <stddef.h> /* size_t */ | ||
#include <stdint.h> /* uintXX_t */ | ||
#ifndef __cplusplus | ||
#include <stdbool.h> | ||
#endif /* __cplusplus */ | ||
|
||
#include "chpltypes.h" | ||
#include "chpl-hsa-kernelparams.h" | ||
|
||
/*
 * ATMI kernel handles shared by the runtime sources that include this header.
 *
 * reduction_kernel: handle of the int64 sum-reduction kernel.
 * gpu_kernels:      heap-allocated array with one handle per
 *                   compiler-generated kernel.
 *
 * NOTE(review): these are definitions, not `extern` declarations, so every
 * translation unit that includes this header emits its own tentative
 * definition. That links only when the toolchain merges common symbols
 * (e.g. -fcommon). Consider `extern` here plus a single definition in one
 * .c file.
 */
atmi_kernel_t reduction_kernel;
atmi_kernel_t *gpu_kernels;

/*
 * Implementation IDs: the value registered for a kernel's GPU implementation
 * is the same value that must be placed in the launch parameters' kernel_id
 * when that kernel is launched.
 */
enum {
  GPU_KERNEL_IMPL = 10565,    /* compiler-generated kernels */
  REDUCTION_GPU_IMPL = 42     /* built-in reduction kernel */
};

/*
 * Legacy kernel-argument layouts, kept (disabled) for reference only.
 */
/*
typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) {
    uint64_t in;
    uint64_t out;
    uint32_t count;
} hsail_reduce_kernarg_t;
typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) {
    uint64_t bundle;
} hsail_kernarg_t;
*/

/* Bring up the ATMI runtime and register the kernels; returns 0
 * (ATMI_STATUS_SUCCESS) on success and a non-zero value on failure. */
int chpl_hsa_initialize(void);

/* Free the kernel-handle array and finalize the ATMI runtime. */
int hsa_shutdown(void);

/*
 * Reduce `count` elements of `src` and return the result.
 * `op` names the reduction operator.
 * NOTE(review): no active definition of hsa_reduce_int32 is visible alongside
 * this header -- confirm one exists before calling it.
 */
int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count);
int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count);

/*
 * Launch the generated kernel with index `kernel_idx` over `wkitem_count_x`
 * work items in work-groups of `wkgrp_size_x`; `bundled_args` is handed to
 * the kernel as its single pointer-sized argument.
 */
void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x,
                        uint32_t wkitem_count_x, void *bundled_args);
#endif //_chpl_atmi_h_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
|
||
#include "chpl-atmi.h" | ||
#include "chplrt.h" | ||
#include "chplexit.h" | ||
#include "chpl-mem.h" | ||
|
||
/*enum ReduceOp { | ||
MAX, | ||
MIN, | ||
SUM, | ||
PROD, | ||
BITAND, | ||
BITOR, | ||
BITXOR, | ||
LOGAND, | ||
LOGOR | ||
}; | ||
*/ | ||
|
||
/* | ||
* Estimate and schedule the required number of GPU kernels | ||
*/ | ||
static inline | ||
void atmi_sched_reducekernels(size_t count, | ||
void *darray[2], size_t *iter_ct, | ||
size_t *items_left) | ||
{ | ||
size_t incount, outcount, i, iter, in, out; | ||
uint32_t max_num_wkgrps, num_wkgroups, grid_size_x; | ||
|
||
const int num_args = 3; | ||
atmi_task_group_t task_group = {1, ATMI_TRUE}; | ||
ATMI_LPARM(lparm); | ||
lparm->group = &task_group; | ||
lparm->kernel_id = REDUCTION_GPU_IMPL; | ||
lparm->synchronous = ATMI_FALSE; | ||
lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0); | ||
|
||
incount = count; | ||
max_num_wkgrps = incount / WKGRP_SIZE; | ||
num_wkgroups = (max_num_wkgrps + SEQ_CHUNK_SIZE - 1) / SEQ_CHUNK_SIZE; | ||
grid_size_x = num_wkgroups * WKGRP_SIZE; | ||
outcount = num_wkgroups; | ||
iter = 0; | ||
while (grid_size_x > WKGRP_SIZE) { | ||
in = (iter & 1); | ||
out = (iter & 1) ^ 1; | ||
|
||
void *args[] = {&darray[in], &darray[out], &incount}; | ||
lparm->gridDim[0] = grid_size_x; | ||
lparm->groupDim[0] = WKGRP_SIZE; | ||
atmi_task_launch(lparm, reduction_kernel, args); | ||
|
||
iter += 1; | ||
incount = outcount; | ||
max_num_wkgrps = incount / WKGRP_SIZE; | ||
num_wkgroups = (max_num_wkgrps + SEQ_CHUNK_SIZE - 1) / SEQ_CHUNK_SIZE; | ||
grid_size_x = num_wkgroups * WKGRP_SIZE; | ||
outcount = num_wkgroups; | ||
} | ||
|
||
if (iter > 0) { | ||
atmi_task_group_sync(&task_group); | ||
} | ||
|
||
(*items_left) = incount; | ||
(*iter_ct) = iter; | ||
} | ||
|
||
/*int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count) | ||
{ | ||
int32_t res; | ||
size_t iter, items_left, out, i; | ||
int32_t * darray[2]; | ||
hsa_symbol_info_t * symbol_info; | ||
symbol_info = &kernel.symbol_info[0]; //TODO: Remove hardcoded 0 index | ||
darray[0] = src; | ||
if (0 != chpl_posix_memalign((void **) &darray[1], 64, | ||
count * sizeof(int32_t))) { | ||
chpl_exit_any(1); | ||
} | ||
hsa_sched_reducekernels(count, symbol_info, (void**)darray, | ||
&iter, &items_left); | ||
res = 0; | ||
out = (iter & 1); | ||
chpl_msg(2, "HSA: Using CPU to reduce %lu items\n", items_left); | ||
for (i = 0; i < items_left; ++i) res += darray[out][i]; | ||
chpl_free (darray[1]); | ||
return res; | ||
}*/ | ||
|
||
/*
 * Sum `count` int64 values starting at `src`.
 *
 * The bulk of the work is scheduled on the GPU through
 * atmi_sched_reducekernels(); whatever that leaves over is summed on the CPU.
 *
 *   op     reduction operator name; currently ignored (only a sum is done)
 *   src    input values; used as one of the two ping-pong buffers, so its
 *          contents may be overwritten when more than one GPU pass runs
 *   count  number of elements in src
 *
 * Returns the sum. Exits the program via chpl_exit_any(1) if the scratch
 * buffer cannot be allocated.
 */
int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count)
{
  int64_t res;
  size_t iter, items_left, out, i;
  int64_t *darray[2];

  (void)op;  /* not yet used; see the FIXME on operator selection */

  darray[0] = src;
  if (0 != chpl_posix_memalign((void **) &darray[1], 64,
                               count * sizeof(int64_t))) {
    chpl_exit_any(1);
  }

  atmi_sched_reducekernels(count, (void**)darray,
                           &iter, &items_left);

  /* The parity of the pass count selects the buffer holding the leftovers. */
  res = 0;
  out = (iter & 1);
  chpl_msg(2, "HSA: Using CPU to reduce %zu items\n", items_left);
  for (i = 0; i < items_left; ++i) {
    res += darray[out][i];
  }

  chpl_free(darray[1]);
  return res;
}
|
||
//FIXME: use the op argument like this to extend this to different ops | ||
/*if (!strcasecmp(op, "Max")) | ||
opType = MAX; | ||
else if (!strcasecmp(op, "Min")) | ||
opType = MIN; | ||
else if (!strcasecmp(op, "Sum")) | ||
opType = SUM; | ||
else if (!strcasecmp(op, "Product")) | ||
opType = PROD; | ||
else if (!strcasecmp(op, "LogicalAnd")) | ||
opType = LOGAND; | ||
else if (!strcasecmp(op, "LogicalOr")) | ||
opType = LOGOR; | ||
else if (!strcasecmp(op, "BitwiseAnd")) | ||
opType = BITAND; | ||
else if (!strcasecmp(op, "BitwiseOr")) | ||
opType = BITOR; | ||
else if (!strcasecmp(op, "BitwiseXor")) | ||
opType = BITXOR; */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
|
||
#define _GNU_SOURCE | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include <string.h> | ||
#include <assert.h> | ||
#include <unistd.h> | ||
#include <sys/time.h> | ||
#include "chpl-atmi.h" | ||
#include "chplrt.h" | ||
#include "chpl-mem.h" | ||
#include "chplcgfns.h" | ||
|
||
/*
 * Check the result of an ATMI call. If `status` is not ATMI_STATUS_SUCCESS,
 * print a diagnostic to stderr, finalize the ATMI runtime and RETURN the
 * status from the enclosing function (which must therefore return an
 * integer-compatible type).
 *
 *   status  expression yielding the ATMI status; evaluated exactly once
 *   msg     unquoted description of the operation; it is stringized
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement,
 * including inside an unbraced if/else.
 */
#define OUTPUT_ATMI_STATUS(status, msg)                                     \
  do {                                                                      \
    int chpl_atmi_status_ = (int)(status);                                  \
    if (ATMI_STATUS_SUCCESS != chpl_atmi_status_) {                         \
      fprintf(stderr, "ATMI support: %s failed, error code: 0x%x\n",        \
              #msg, (unsigned int)chpl_atmi_status_);                       \
      atmi_finalize();                                                      \
      return chpl_atmi_status_;                                             \
    }                                                                       \
  } while (0)
|
||
/** | ||
* Initialize the ATMI/HSA runtime | ||
*/ | ||
int chpl_hsa_initialize(void) | ||
{ | ||
atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL); | ||
if(st != ATMI_STATUS_SUCCESS) return -1; | ||
|
||
char reduce_kernel_filename[1024]; | ||
char gen_kernel_filename[1024]; | ||
int arglen = strlen(chpl_executionCommand)+1; | ||
char* argCopy = chpl_mem_allocMany(arglen, sizeof(char), | ||
CHPL_RT_MD_CFG_ARG_COPY_DATA, 0, 0); | ||
char *binName; | ||
int cx; | ||
|
||
cx = snprintf(reduce_kernel_filename, 1024, | ||
#ifdef ROCM | ||
"%s/runtime/src/%s/chpl-hsa-reducekernels.hsaco", CHPL_HOME, | ||
#else | ||
"%s/runtime/src/%s/chpl-hsa-reducekernels.o", CHPL_HOME, | ||
#endif | ||
CHPL_RUNTIME_OBJDIR); | ||
if (cx < 0 || cx >= 256) { | ||
OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating reduce kernel filename); | ||
} | ||
strcpy(argCopy, chpl_executionCommand); | ||
binName = strtok(argCopy, " "); | ||
#ifdef ROCM | ||
cx = snprintf(gen_kernel_filename, 1024, "%s_gpu.hsaco", binName); | ||
#else | ||
cx = snprintf(gen_kernel_filename, 1024, "%s_gpu.o", binName); | ||
#endif | ||
if (cx < 0 || cx >= 256) { | ||
OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating generated kernel filename); | ||
} | ||
chpl_mem_free(argCopy, 0, 0); | ||
|
||
#ifdef ROCM | ||
atmi_platform_type_t module_type = AMDGCN; | ||
#else | ||
atmi_platform_type_t module_type = BRIG; | ||
#endif | ||
|
||
/* FIXME: Create all reduction kernels, not just the int64-sum kernel */ | ||
const char *modules[2] = {reduce_kernel_filename, gen_kernel_filename}; | ||
atmi_platform_type_t module_types[2] = {module_type, module_type}; | ||
st = atmi_module_register(modules, module_types, 2); | ||
OUTPUT_ATMI_STATUS(st, Registering all modules); | ||
|
||
size_t reduction_arg_sizes[] = {sizeof(uint64_t), sizeof(uint64_t), sizeof(uint32_t)}; | ||
const unsigned int num_reduction_args = sizeof(reduction_arg_sizes)/sizeof(reduction_arg_sizes[0]); | ||
atmi_kernel_create_empty(&reduction_kernel, num_reduction_args, reduction_arg_sizes); | ||
atmi_kernel_add_gpu_impl(reduction_kernel, "reduce_int64_sum", REDUCTION_GPU_IMPL); | ||
|
||
size_t kernel_arg_sizes[] = {sizeof(uint64_t)}; | ||
const unsigned int num_kernel_args = sizeof(kernel_arg_sizes)/sizeof(kernel_arg_sizes[0]); | ||
gpu_kernels = (atmi_kernel_t *)chpl_malloc(sizeof(atmi_kernel_t) * chpl_num_gpu_kernels); | ||
for (int64_t i = 0; i < chpl_num_gpu_kernels; ++i) { | ||
//FIXME: get the actual kernel name | ||
const char *kernel_name = chpl_gpu_kernels[i]; | ||
atmi_kernel_create_empty(&gpu_kernels[i], num_kernel_args, kernel_arg_sizes); | ||
atmi_kernel_add_gpu_impl(gpu_kernels[i], kernel_name, GPU_KERNEL_IMPL); | ||
} | ||
|
||
return ATMI_STATUS_SUCCESS; | ||
} | ||
|
||
/** | ||
* Release resources used by the base kernels and tear down the HSA structures | ||
*/ | ||
int hsa_shutdown(void) | ||
{ | ||
chpl_free(gpu_kernels); | ||
atmi_finalize(); | ||
} | ||
|
||
/* | ||
* Enqueue/execute a kernel | ||
*/ | ||
void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x, | ||
uint32_t wkitem_count_x, void *bundled_args) | ||
{ | ||
void *args[] = {&bundled_args}; | ||
ATMI_LPARM_1D(lparm, wkitem_count_x); | ||
lparm->groupDim[0] = wkgrp_size_x; | ||
lparm->synchronous = ATMI_TRUE; | ||
|
||
lparm->kernel_id = GPU_KERNEL_IMPL; | ||
lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0); | ||
atmi_task_launch(lparm, gpu_kernels[kernel_idx], args); | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters