Skip to content

Commit

Permalink
Adding ATMI as the runtime layer to launch Chapel's generated GPU ker…
Browse files Browse the repository at this point in the history
…nels
  • Loading branch information
ashwinma committed Feb 9, 2017
1 parent 42d8a3a commit 0388d95
Show file tree
Hide file tree
Showing 7 changed files with 302 additions and 6 deletions.
6 changes: 3 additions & 3 deletions make/compiler/Makefile.hsa
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@ include $(CHPL_MAKE_HOME)/make/compiler/Makefile.gnu
ifdef CHPL_ROCM
# ROCm locations
CLOC=/opt/rocm/cloc/bin/cloc.sh
LIBS+=-lhsa-runtime64 -lhsakmt -lm
LIBS+=-latmi_runtime -lm

# TODO: move these in third-party directory?
GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib
HSA_INCLUDES=-I/opt/rocm/hsa/include
GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib -L/opt/rocm/libatmi/lib
HSA_INCLUDES=-I/opt/rocm/libatmi/include
else
# HSA locations
CLOC=$(THIRD_PARTY_DIR)/hsa/cloc/bin/cloc.sh
Expand Down
40 changes: 40 additions & 0 deletions runtime/include/chpl-atmi.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
 * Public interface of the ATMI-based GPU runtime layer: runtime
 * initialization, host-side reductions, and launching of generated kernels.
 */
#ifndef _chpl_atmi_h_
#define _chpl_atmi_h_

#include <atmi_runtime.h>
#include <stddef.h> /* size_t */
#include <stdint.h> /* uintXX_t */
#ifndef __cplusplus
#include <stdbool.h>
#endif /* __cplusplus */

#include "chpltypes.h"
#include "chpl-hsa-kernelparams.h"

/*
 * Kernel handles shared by the runtime sources.  chpl_hsa_initialize()
 * creates reduction_kernel (with the "reduce_int64_sum" implementation) and
 * allocates gpu_kernels with one entry per generated kernel.
 *
 * NOTE(review): these are object definitions, not `extern` declarations, so
 * every translation unit that includes this header gets its own tentative
 * definition.  That only links when the toolchain merges common symbols;
 * consider `extern` here plus a single definition in a .c file.
 */
atmi_kernel_t reduction_kernel;
atmi_kernel_t *gpu_kernels;

/*
 * Implementation IDs: passed to atmi_kernel_add_gpu_impl() when a kernel is
 * created and stored in lparm->kernel_id when it is launched.
 */
enum {
GPU_KERNEL_IMPL = 10565,
REDUCTION_GPU_IMPL = 42
};
/*
typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) {
uint64_t in;
uint64_t out;
uint32_t count;
} hsail_reduce_kernarg_t;
typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) {
uint64_t bundle;
} hsail_kernarg_t;
*/

/*
 * Initialize ATMI, register the kernel modules and create the kernel
 * handles above.  Returns ATMI_STATUS_SUCCESS on success.
 */
int chpl_hsa_initialize(void);

/*
 * Reduce `count` elements starting at `src`.
 * NOTE(review): the definition of hsa_reduce_int32 in
 * chpl-atmi-reducehost.c is currently commented out, and hsa_reduce_int64
 * does not yet consult `op` (it always sums).
 */
int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count);
int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count);

/*
 * Launch generated kernel number `kernel_idx` (an index into gpu_kernels)
 * with the given work-group size and work-item count, passing
 * `bundled_args` as its single argument.
 */
void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x,
uint32_t wkitem_count_x, void *bundled_args);
#endif //_chpl_atmi_h_
2 changes: 1 addition & 1 deletion runtime/include/chpl-gen-includes.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
#include "chpl-tasks.h"
#include "chpltypes.h"
#ifdef TARGET_HSA
#include "chpl-hsa.h"
#include "chpl-atmi.h"
#endif

//
Expand Down
4 changes: 2 additions & 2 deletions runtime/src/Makefile.share
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@

ifeq ($(strip $(CHPL_MAKE_TARGET_COMPILER)),hsa)
HSA_SRCS = \
chpl-hsa.c \
chpl-atmi.c \
chpl-hsa-reducekernels.cl \
chpl-hsa-reducehost.c
chpl-atmi-reducehost.c
endif

COMMON_LAUNCHER_SRCS = \
Expand Down
136 changes: 136 additions & 0 deletions runtime/src/chpl-atmi-reducehost.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@

#include "chpl-atmi.h"
#include "chplrt.h"
#include "chplexit.h"
#include "chpl-mem.h"

/*enum ReduceOp {
MAX,
MIN,
SUM,
PROD,
BITAND,
BITOR,
BITXOR,
LOGAND,
LOGOR
};
*/

/*
 * Estimate and schedule the required number of GPU reduction kernels.
 *
 * The two buffers in `darray` are used alternately: pass `k` reads from
 * darray[k & 1] and writes its partial results to darray[(k & 1) ^ 1].
 * Passes are launched asynchronously in one task group, which is waited on
 * before returning if at least one pass was launched.
 *
 * count      - number of input elements in darray[0]
 * darray     - input buffer [0] and scratch buffer [1]
 * iter_ct    - out: number of kernel passes that were launched
 * items_left - out: number of elements still to be combined by the caller;
 *              they live in darray[*iter_ct & 1]
 */
static inline
void atmi_sched_reducekernels(size_t count,
                              void *darray[2], size_t *iter_ct,
                              size_t *items_left)
{
  size_t incount = count;
  size_t iter = 0;

  /* NOTE(review): presumably {group id, ordered} -- confirm against atmi.h */
  atmi_task_group_t task_group = {1, ATMI_TRUE};
  ATMI_LPARM(lparm);
  lparm->group = &task_group;
  lparm->kernel_id = REDUCTION_GPU_IMPL;
  lparm->synchronous = ATMI_FALSE;
  lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0);

  for (;;) {
    uint32_t max_num_wkgrps = incount / WKGRP_SIZE;
    uint32_t num_wkgroups =
        (max_num_wkgrps + SEQ_CHUNK_SIZE - 1) / SEQ_CHUNK_SIZE;
    uint32_t grid_size_x = num_wkgroups * WKGRP_SIZE;

    /* A grid of at most one work-group is not worth a launch. */
    if (grid_size_x <= WKGRP_SIZE) {
      break;
    }

    size_t in = (iter & 1);
    size_t out = in ^ 1;

    /*
     * The kernel's third argument is registered as sizeof(uint32_t), so
     * hand ATMI a real uint32_t instead of the address of a size_t (which
     * only yields the right bytes on little-endian hosts).
     */
    uint32_t kernel_count = (uint32_t) incount;
    void *args[] = {&darray[in], &darray[out], &kernel_count};

    lparm->gridDim[0] = grid_size_x;
    lparm->groupDim[0] = WKGRP_SIZE;
    atmi_task_launch(lparm, reduction_kernel, args);

    iter += 1;
    /* Each work-group leaves one partial result for the next pass. */
    incount = num_wkgroups;
  }

  if (iter > 0) {
    atmi_task_group_sync(&task_group);
  }

  (*items_left) = incount;
  (*iter_ct) = iter;
}

/*int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count)
{
int32_t res;
size_t iter, items_left, out, i;
int32_t * darray[2];
hsa_symbol_info_t * symbol_info;
symbol_info = &kernel.symbol_info[0]; //TODO: Remove hardcoded 0 index
darray[0] = src;
if (0 != chpl_posix_memalign((void **) &darray[1], 64,
count * sizeof(int32_t))) {
chpl_exit_any(1);
}
hsa_sched_reducekernels(count, symbol_info, (void**)darray,
&iter, &items_left);
res = 0;
out = (iter & 1);
chpl_msg(2, "HSA: Using CPU to reduce %lu items\n", items_left);
for (i = 0; i < items_left; ++i) res += darray[out][i];
chpl_free (darray[1]);
return res;
}*/

/*
 * Sum `count` int64 values starting at `src`.
 *
 * GPU reduction passes are scheduled through atmi_sched_reducekernels();
 * whatever elements remain afterwards are added up on the CPU.
 *
 * op    - requested reduction operator; currently ignored, the result is
 *         always the sum
 * src   - input elements
 * count - number of input elements
 *
 * Returns the sum.  Exits the program via chpl_exit_any(1) if the scratch
 * buffer cannot be allocated.
 */
int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count)
{
  int64_t *darray[2];
  size_t iter, items_left, out, i;
  int64_t res = 0;

  (void) op;  /* only summation is implemented so far */

  /* Refuse element counts whose byte size would wrap around. */
  if (count > SIZE_MAX / sizeof(int64_t)) {
    chpl_exit_any(1);
  }

  darray[0] = src;
  if (0 != chpl_posix_memalign((void **) &darray[1], 64,
                               count * sizeof(int64_t))) {
    chpl_exit_any(1);
  }

  atmi_sched_reducekernels(count, (void**)darray,
                           &iter, &items_left);

  /* The last pass wrote into the buffer selected by the pass count. */
  out = (iter & 1);
  /* Cast so the argument matches %lu even where size_t is not unsigned long. */
  chpl_msg(2, "HSA: Using CPU to reduce %lu items\n",
           (unsigned long) items_left);
  for (i = 0; i < items_left; ++i) {
    res += darray[out][i];
  }

  chpl_free (darray[1]);
  return res;
}

//FIXME: use the op argument like this to extend this to different ops
/*if (!strcasecmp(op, "Max"))
opType = MAX;
else if (!strcasecmp(op, "Min"))
opType = MIN;
else if (!strcasecmp(op, "Sum"))
opType = SUM;
else if (!strcasecmp(op, "Product"))
opType = PROD;
else if (!strcasecmp(op, "LogicalAnd"))
opType = LOGAND;
else if (!strcasecmp(op, "LogicalOr"))
opType = LOGOR;
else if (!strcasecmp(op, "BitwiseAnd"))
opType = BITAND;
else if (!strcasecmp(op, "BitwiseOr"))
opType = BITOR;
else if (!strcasecmp(op, "BitwiseXor"))
opType = BITXOR; */
116 changes: 116 additions & 0 deletions runtime/src/chpl-atmi.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <sys/time.h>
#include "chpl-atmi.h"
#include "chplrt.h"
#include "chpl-mem.h"
#include "chplcgfns.h"

/*
 * Report a failed ATMI call and leave the enclosing function.
 *
 * If `status` is not ATMI_STATUS_SUCCESS, print the stringified `msg` and
 * the error code to stderr, call atmi_finalize(), and return the status
 * from the *calling* function.  Only usable in functions whose return type
 * can hold an atmi_status_t value.
 *
 * `status` is evaluated exactly once, and the do/while(0) wrapper makes the
 * expansion a single statement, so `OUTPUT_ATMI_STATUS(...);` is safe in an
 * unbraced if/else body.
 */
#define OUTPUT_ATMI_STATUS(status, msg)                                     \
do {                                                                        \
  atmi_status_t atmi_status_ = (status);                                    \
  if (ATMI_STATUS_SUCCESS != atmi_status_) {                                \
    fprintf(stderr, "ATMI support: %s failed, error code: 0x%x\n",          \
            #msg, (unsigned int) atmi_status_);                             \
    atmi_finalize();                                                        \
    return atmi_status_;                                                    \
  }                                                                         \
} while (0)

/**
 * Initialize the ATMI/HSA runtime.
 *
 * Steps:
 *  1. initialize ATMI for all device types;
 *  2. build the file names of the reduction-kernel module (under
 *     CHPL_HOME/runtime/src/CHPL_RUNTIME_OBJDIR) and of the generated-kernel
 *     module ("<binary>_gpu.<ext>", where <binary> is the first
 *     space-delimited word of chpl_executionCommand);
 *  3. register both modules with ATMI;
 *  4. create the reduction kernel and one kernel handle per entry of
 *     chpl_gpu_kernels, stored in the global gpu_kernels array.
 *
 * Returns ATMI_STATUS_SUCCESS on success, -1 if atmi_init() fails, and the
 * failing ATMI status (after atmi_finalize()) for any later error.
 */
int chpl_hsa_initialize(void)
{
  /*
   * Only the module file extension and platform type differ between the two
   * build flavours.  Selecting them here keeps preprocessor directives out
   * of the snprintf() argument lists, which is undefined behavior whenever
   * snprintf is implemented as a macro.
   */
#ifdef ROCM
  static const char kernel_ext[] = "hsaco";
  const atmi_platform_type_t module_type = AMDGCN;
#else
  static const char kernel_ext[] = "o";
  const atmi_platform_type_t module_type = BRIG;
#endif
  char reduce_kernel_filename[1024];
  char gen_kernel_filename[1024];
  const char *bin_name;
  size_t bin_name_len;
  int cx;

  atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL);
  if (st != ATMI_STATUS_SUCCESS) return -1;

  cx = snprintf(reduce_kernel_filename, sizeof reduce_kernel_filename,
                "%s/runtime/src/%s/chpl-hsa-reducekernels.%s",
                CHPL_HOME, CHPL_RUNTIME_OBJDIR, kernel_ext);
  /* Compare against the real buffer size so only true truncation fails. */
  if (cx < 0 || (size_t) cx >= sizeof reduce_kernel_filename) {
    OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating reduce kernel filename);
  }

  /*
   * The binary name is the first space-delimited word of the execution
   * command.  Locating it in place avoids a temporary copy (and therefore
   * any leak on the error paths) and detects a missing name up front.
   */
  bin_name = chpl_executionCommand + strspn(chpl_executionCommand, " ");
  bin_name_len = strcspn(bin_name, " ");
  if (bin_name_len == 0 || bin_name_len >= sizeof gen_kernel_filename) {
    OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating generated kernel filename);
  }
  cx = snprintf(gen_kernel_filename, sizeof gen_kernel_filename,
                "%.*s_gpu.%s", (int) bin_name_len, bin_name, kernel_ext);
  if (cx < 0 || (size_t) cx >= sizeof gen_kernel_filename) {
    OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating generated kernel filename);
  }

  /* FIXME: Create all reduction kernels, not just the int64-sum kernel */
  const char *modules[2] = {reduce_kernel_filename, gen_kernel_filename};
  atmi_platform_type_t module_types[2] = {module_type, module_type};
  st = atmi_module_register(modules, module_types, 2);
  OUTPUT_ATMI_STATUS(st, Registering all modules);

  /* Reduction kernel arguments: input buffer, output buffer, element count. */
  size_t reduction_arg_sizes[] = {sizeof(uint64_t), sizeof(uint64_t),
                                  sizeof(uint32_t)};
  const unsigned int num_reduction_args =
      sizeof(reduction_arg_sizes) / sizeof(reduction_arg_sizes[0]);
  atmi_kernel_create_empty(&reduction_kernel, num_reduction_args,
                           reduction_arg_sizes);
  atmi_kernel_add_gpu_impl(reduction_kernel, "reduce_int64_sum",
                           REDUCTION_GPU_IMPL);

  /* Every generated kernel takes a single 64-bit argument. */
  size_t kernel_arg_sizes[] = {sizeof(uint64_t)};
  const unsigned int num_kernel_args =
      sizeof(kernel_arg_sizes) / sizeof(kernel_arg_sizes[0]);
  gpu_kernels = chpl_malloc(sizeof *gpu_kernels * chpl_num_gpu_kernels);
  for (int64_t i = 0; i < chpl_num_gpu_kernels; ++i) {
    //FIXME: get the actual kernel name
    const char *kernel_name = chpl_gpu_kernels[i];
    atmi_kernel_create_empty(&gpu_kernels[i], num_kernel_args,
                             kernel_arg_sizes);
    atmi_kernel_add_gpu_impl(gpu_kernels[i], kernel_name, GPU_KERNEL_IMPL);
  }

  return ATMI_STATUS_SUCCESS;
}

/**
 * Release the generated-kernel table and shut down the ATMI runtime.
 *
 * Returns the status reported by atmi_finalize(); on success this is
 * ATMI_STATUS_SUCCESS, the same value chpl_hsa_initialize() returns on
 * success.  (The function previously fell off its end without returning a
 * value.)
 */
int hsa_shutdown(void)
{
  chpl_free(gpu_kernels);
  gpu_kernels = NULL;  /* guard against use after shutdown */
  return atmi_finalize();
}

/*
 * Enqueue/execute one generated kernel.
 *
 * kernel_idx     - index into the global gpu_kernels table
 * wkgrp_size_x   - work-group size in the x dimension
 * wkitem_count_x - work-item count in the x dimension
 * bundled_args   - pointer handed to the kernel as its single argument
 *
 * The launch is marked synchronous and is placed on GPU (0, 0).
 */
void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x,
                        uint32_t wkitem_count_x, void *bundled_args)
{
  ATMI_LPARM_1D(lparm, wkitem_count_x);
  lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0);
  lparm->kernel_id = GPU_KERNEL_IMPL;
  lparm->synchronous = ATMI_TRUE;
  lparm->groupDim[0] = wkgrp_size_x;

  /* The argument list holds the address of the bundled-argument pointer. */
  void *kernel_args[1];
  kernel_args[0] = &bundled_args;

  atmi_task_launch(lparm, gpu_kernels[kernel_idx], kernel_args);
}

4 changes: 4 additions & 0 deletions util/setchplenv_hsa.bash
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ if [ "$1" == "debug" ]; then
export CHPL_DEBUG=1
fi

# Build against ROCm: make/compiler/Makefile.hsa switches to its ROCm
# locations when CHPL_ROCM is defined.
echo -n "Setting CHPL_ROCM"
export CHPL_ROCM=1
echo " to 1"

echo -n "Setting CHPL_LOCALE_MODEL"
export CHPL_LOCALE_MODEL=hsa
echo " to hsa"
Expand Down

0 comments on commit 0388d95

Please sign in to comment.