From e8d506217768d5b1e931973f8d21ab42df843443 Mon Sep 17 00:00:00 2001 From: Ashwin Aji Date: Sat, 4 Mar 2017 02:56:54 -0600 Subject: [PATCH] Extended Chapel tasking interface to use more of ATMI --- make/compiler/Makefile.hsa | 4 +- runtime/include/chpl-atmi.h | 5 +- runtime/include/gpu_base_header.h | 5 +- runtime/include/tasks/atmi/tasks-atmi.h | 67 ++++----- runtime/src/tasks/atmi/tasks-atmi.c | 181 +++++++++++++++++------- 5 files changed, 166 insertions(+), 96 deletions(-) diff --git a/make/compiler/Makefile.hsa b/make/compiler/Makefile.hsa index d6dc08853c..84c749d717 100644 --- a/make/compiler/Makefile.hsa +++ b/make/compiler/Makefile.hsa @@ -6,8 +6,8 @@ CLOC=/opt/rocm/cloc/bin/cloc.sh LIBS+=-latmi_runtime -lm # TODO: move these in third-party directory? -GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib -L/opt/rocm/libatmi/lib -HSA_INCLUDES=-I/opt/rocm/libatmi/include +GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib -L/opt/rocm/atmi/lib +HSA_INCLUDES=-I/opt/rocm/atmi/include else # HSA locations CLOC=$(THIRD_PARTY_DIR)/hsa/cloc/bin/cloc.sh diff --git a/runtime/include/chpl-atmi.h b/runtime/include/chpl-atmi.h index 6285760810..82c8a54780 100644 --- a/runtime/include/chpl-atmi.h +++ b/runtime/include/chpl-atmi.h @@ -13,12 +13,15 @@ atmi_kernel_t reduction_kernel; atmi_kernel_t *gpu_kernels; +atmi_kernel_t main_kernel; +int g_num_cpu_kernels; atmi_machine_t *g_machine; enum { GPU_KERNEL_IMPL = 10565, - REDUCTION_GPU_IMPL = 42 + REDUCTION_GPU_IMPL = 42, + CPU_FUNCTION_IMPL = 43 }; int chpl_hsa_initialize(void); diff --git a/runtime/include/gpu_base_header.h b/runtime/include/gpu_base_header.h index 9ebd8bc364..1dee3090a5 100644 --- a/runtime/include/gpu_base_header.h +++ b/runtime/include/gpu_base_header.h @@ -62,7 +62,10 @@ typedef uint_least8_t atomic_uint_least8_t; typedef uint_least16_t atomic_uint_least16_t; typedef uint_least32_t atomic_uint_least32_t; typedef uint_least64_t atomic_uint_least64_t; -typedef chpl_bool atomic_flag_n; +//typedef uintptr_t atomic_uintptr_t; +typedef chpl_bool atomic_bool; +typedef uint64_t atomic__real64; +typedef uint32_t atomic__real32; # define INT8_C(c) c diff --git a/runtime/include/tasks/atmi/tasks-atmi.h b/runtime/include/tasks/atmi/tasks-atmi.h index 81ad131901..44adc76308 100644 --- a/runtime/include/tasks/atmi/tasks-atmi.h +++ b/runtime/include/tasks/atmi/tasks-atmi.h @@ -30,7 +30,7 @@ #include "chpl-atmi.h" #include "chpl-tasks-prvdata.h" #include "chpltypes.h" - +#include "chpl-mem.h" #include "qthread.h" #include "qthread-chapel.h" @@ -50,8 +50,8 @@ void chpl_task_yield(void); // Type (and default value) used to communicate task identifiers // between C code and Chapel code in the runtime. // -typedef unsigned int chpl_taskID_t; -#define chpl_nullTaskID QTHREAD_NULL_TASK_ID +typedef uint64_t chpl_taskID_t; +#define chpl_nullTaskID ATMI_NULL_TASK_HANDLE // // Sync variables @@ -93,7 +93,7 @@ typedef struct { } chpl_task_bundle_t; // Structure of task-local storage -typedef struct chpl_qthread_tls_s { +typedef struct chpl_atmi_tls_s { chpl_task_bundle_t *bundle; // The below fields could move to chpl_task_bundleData_t // That would reduce the size of the task local storage, @@ -102,39 +102,23 @@ typedef struct chpl_qthread_tls_s { /* Reports */ int lock_filename; int lock_lineno; -} chpl_qthread_tls_t; +} chpl_atmi_tls_t; + +extern pthread_key_t tls_cache; extern pthread_t chpl_qthread_process_pthread; extern pthread_t chpl_qthread_comm_pthread; -extern chpl_qthread_tls_t chpl_qthread_process_tls; -extern chpl_qthread_tls_t chpl_qthread_comm_task_tls; +extern chpl_atmi_tls_t chpl_qthread_process_tls; +extern chpl_atmi_tls_t chpl_qthread_comm_task_tls; #define CHPL_TASK_STD_MODULES_INITIALIZED chpl_task_stdModulesInitialized void chpl_task_stdModulesInitialized(void); -// Wrap qthread_get_tasklocal() and assert that it is always available. -static inline chpl_qthread_tls_t* chpl_qthread_get_tasklocal(void) -{ - chpl_qthread_tls_t* tls; - - if (chpl_qthread_done_initializing) { - tls = (chpl_qthread_tls_t*) - qthread_get_tasklocal(sizeof(chpl_qthread_tls_t)); - if (tls == NULL) { - pthread_t me = pthread_self(); - if (pthread_equal(me, chpl_qthread_comm_pthread)) - tls = &chpl_qthread_comm_task_tls; - else if (pthread_equal(me, chpl_qthread_process_pthread)) - tls = &chpl_qthread_process_tls; - } - assert(tls); - } - else - tls = NULL; +extern pthread_t null_thread; - return tls; -} +// Wrap qthread_get_tasklocal() and assert that it is always available. +extern chpl_atmi_tls_t* chpl_atmi_get_tasklocal(void); #ifdef CHPL_TASK_GET_PRVDATA_IMPL_DECL #error "CHPL_TASK_GET_PRVDATA_IMPL_DECL is already defined!" @@ -143,7 +127,7 @@ static inline chpl_qthread_tls_t* chpl_qthread_get_tasklocal(void) #endif static inline chpl_task_prvData_t* chpl_task_getPrvData(void) { - chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); if (data) { return &data->prvdata; } @@ -162,7 +146,11 @@ static inline chpl_task_prvData_t* chpl_task_getPrvData(void) static inline c_sublocid_t chpl_task_getSubloc(void) { - return (c_sublocid_t) qthread_shep(); + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + if (data) + return data->bundle->requestedSubloc; + else + return c_sublocid_any; } #ifdef CHPL_TASK_SETSUBLOC_IMPL_DECL @@ -173,8 +161,6 @@ c_sublocid_t chpl_task_getSubloc(void) static inline void chpl_task_setSubloc(c_sublocid_t subloc) { - qthread_shepherd_id_t curr_shep; - assert(subloc != c_sublocid_none); // Only change sublocales if the caller asked for a particular one, @@ -188,16 +174,10 @@ void chpl_task_setSubloc(c_sublocid_t subloc) // before tasking init and in any case would be done from the // main thread of execution, which doesn't have a shepherd. // The code below wouldn't work in that situation. - if ((curr_shep = qthread_shep()) != NO_SHEPHERD) { - chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); - if (data) { - data->bundle->requestedSubloc = subloc; - } - - if (subloc != c_sublocid_any && - (qthread_shepherd_id_t) subloc != curr_shep) { - qthread_migrate_to((qthread_shepherd_id_t) subloc); - } + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + if (data) { + data->bundle->requestedSubloc = subloc; + printf("Setting ATMI requested subloc to %d\n", subloc); } } @@ -209,10 +189,11 @@ void chpl_task_setSubloc(c_sublocid_t subloc) static inline c_sublocid_t chpl_task_getRequestedSubloc(void) { - chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); if (data) { return data->bundle->requestedSubloc; } + return c_sublocid_any; } diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c index a518957493..8f926c8738 100644 --- a/runtime/src/tasks/atmi/tasks-atmi.c +++ b/runtime/src/tasks/atmi/tasks-atmi.c @@ -66,6 +66,11 @@ #include #include +#define max_num_cpu_kernels 4096 +int cpu_kernels_initialized[max_num_cpu_kernels] = {0}; +atmi_kernel_t cpu_kernels[max_num_cpu_kernels]; + + #define OUTPUT_ATMI_STATUS(status, msg) \ { \ if (ATMI_STATUS_SUCCESS != (status)) { \ @@ -76,8 +81,6 @@ } \ } - - //#define SUPPORT_BLOCKREPORT //#define SUPPORT_TASKREPORT @@ -182,20 +185,22 @@ chpl_task_bundle_t chpl_qthread_comm_task_bundle = { .requested_fn = NULL, .id = chpl_nullTaskID }; -chpl_qthread_tls_t chpl_qthread_process_tls = { +chpl_atmi_tls_t chpl_qthread_process_tls = { .bundle = &chpl_qthread_process_bundle, .lock_filename = 0, .lock_lineno = 0 }; -chpl_qthread_tls_t chpl_qthread_comm_task_tls = { +chpl_atmi_tls_t chpl_qthread_comm_task_tls = { .bundle = &chpl_qthread_comm_task_bundle, .lock_filename = 0, .lock_lineno = 0 }; // -// chpl_qthread_get_tasklocal() is in tasks-qthreads.h +// chpl_atmi_get_tasklocal() is in tasks-qthreads.h // +pthread_key_t tls_cache; + static syncvar_t exit_ret = SYNCVAR_STATIC_EMPTY_INITIALIZER; static volatile chpl_bool canCountRunningTasks = false; @@ -203,11 +208,11 @@ static volatile chpl_bool canCountRunningTasks = false; void chpl_task_yield(void) { PROFILE_INCR(profile_task_yield,1); - if (qthread_shep() == NO_SHEPHERD) { + /*if (qthread_shep() == NO_SHEPHERD) { sched_yield(); } else { qthread_yield(); - } + }*/ } // Sync variables @@ -251,10 +256,37 @@ void chpl_sync_unlock(chpl_sync_aux_t *s) qthread_incr(&s->lockers_out, 1); } +inline chpl_atmi_tls_t* chpl_atmi_get_tasklocal(void) +{ + chpl_atmi_tls_t* tls = NULL; + + pthread_t me = pthread_self(); + if (pthread_equal(me, chpl_qthread_comm_pthread)) + tls = &chpl_qthread_comm_task_tls; + else if (pthread_equal(me, chpl_qthread_process_pthread)) + tls = &chpl_qthread_process_tls; + else { + atmi_task_handle_t t = get_atmi_task_handle(); + if(t != ATMI_NULL_TASK_HANDLE) { + tls = (chpl_atmi_tls_t *)pthread_getspecific(tls_cache); + if(tls == NULL) { + // FIXME: when to free? + tls = (chpl_atmi_tls_t *)chpl_mem_alloc(sizeof(chpl_atmi_tls_t), + CHPL_RT_MD_THREAD_PRV_DATA, + 0, 0); + pthread_setspecific(tls_cache, tls); + } + } + assert(tls != NULL); + } + + return tls; +} + static inline void about_to_block(int32_t lineno, int32_t filename) { - chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); assert(data); data->lock_lineno = lineno; @@ -342,7 +374,7 @@ static void chapel_display_thread(qt_key_t addr, void *tls, void *callarg) { - chpl_qthread_tls_t *rep = (chpl_qthread_tls_t *)tls; + chpl_atmi_tls_t *rep = (chpl_atmi_tls_t *)tls; if (rep) { if ((rep->lock_lineno > 0) && rep->lock_filename) { @@ -683,8 +715,8 @@ static void setupTasklocalStorage(void) { // Make sure Qthreads knows how much space we need for per-task // local storage. tasklocal_size = chpl_qt_getenv_num("TASKLOCAL_SIZE", 0); - if (tasklocal_size < sizeof(chpl_qthread_tls_t)) { - snprintf(newenv, sizeof(newenv), "%zu", sizeof(chpl_qthread_tls_t)); + if (tasklocal_size < sizeof(chpl_atmi_tls_t)) { + snprintf(newenv, sizeof(newenv), "%zu", sizeof(chpl_atmi_tls_t)); chpl_qt_setenv("TASKLOCAL_SIZE", newenv, 1); } } @@ -698,13 +730,28 @@ static void setupWorkStealing(void) { chpl_qt_setenv("STEAL_RATIO", "0", 0); } +static aligned_t chapel_wrapper(void *arg); +static aligned_t main_wrapper(void *arg); +// If we stored chpl_taskID_t in chpl_task_bundleData_t, +// this struct and the following function may not be necessary. +typedef void (*main_ptr_t)(void); +typedef struct { + chpl_task_bundle_t arg; + main_ptr_t chpl_main; +} main_wrapper_bundle_t; + void chpl_task_init(void) { atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL); - printf("ATMI task initialized\n"); if(st != ATMI_STATUS_SUCCESS) return; g_machine = atmi_machine_get_info(); + + pthread_key_create(&tls_cache, NULL); + //g_num_cpu_kernels = sizeof(chpl_ftable)/sizeof(chpl_ftable[0]); + size_t main_arg_size = sizeof(main_wrapper_bundle_t); + atmi_kernel_create_empty(&main_kernel, 1, &main_arg_size); + atmi_kernel_add_cpu_impl(main_kernel, (atmi_generic_fp)main_wrapper, CPU_FUNCTION_IMPL); int32_t commMaxThreads; int32_t hwpar; @@ -748,11 +795,18 @@ void chpl_task_init(void) void chpl_task_exit(void) { + //for(int i = 0; i < g_num_cpu_kernels; i++) { + for(int i = 0; i < max_num_cpu_kernels; i++) { + if(cpu_kernels_initialized[i]) + atmi_kernel_release(cpu_kernels[i]); + } + atmi_kernel_release(main_kernel); atmi_finalize(); #ifdef CHAPEL_PROFILE profile_print(); #endif /* CHAPEL_PROFILE */ + pthread_key_delete(tls_cache); if (qthread_shep() == NO_SHEPHERD) { /* sometimes, tasking is told to shutdown even though it hasn't been * told to start yet */ @@ -781,20 +835,12 @@ static inline void wrap_callbacks(chpl_task_cb_event_kind_t event_kind, } -// If we stored chpl_taskID_t in chpl_task_bundleData_t, -// this struct and the following function may not be necessary. -typedef void (*main_ptr_t)(void); -typedef struct { - chpl_task_bundle_t arg; - main_ptr_t chpl_main; -} main_wrapper_bundle_t; - static aligned_t main_wrapper(void *arg) { - chpl_qthread_tls_t *tls = chpl_qthread_get_tasklocal(); + chpl_atmi_tls_t *tls = chpl_atmi_get_tasklocal(); main_wrapper_bundle_t *m_bundle = (main_wrapper_bundle_t*) arg; chpl_task_bundle_t *bundle = &m_bundle->arg; - chpl_qthread_tls_t pv = {.bundle = bundle}; + chpl_atmi_tls_t pv = {.bundle = bundle}; *tls = pv; @@ -809,18 +855,17 @@ static aligned_t main_wrapper(void *arg) return 0; } - static aligned_t chapel_wrapper(void *arg) { - chpl_qthread_tls_t *tls = chpl_qthread_get_tasklocal(); + chpl_atmi_tls_t *tls = chpl_atmi_get_tasklocal(); chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg; - chpl_qthread_tls_t pv = {.bundle = bundle}; + chpl_atmi_tls_t pv = {.bundle = bundle}; *tls = pv; if (bundle->countRunning) chpl_taskRunningCntInc(0, 0); - + wrap_callbacks(chpl_task_cb_event_kind_begin, bundle); // launch GPU kernel here? @@ -869,8 +914,18 @@ void chpl_task_callMain(void (*chpl_main)(void)) wrap_callbacks(chpl_task_cb_event_kind_create, &arg.arg); - qthread_fork_syncvar_copyargs(main_wrapper, &arg, sizeof(arg), &exit_ret); - qthread_syncvar_readFF(NULL, &exit_ret); + int cpu_id = 0;//subloc; + ATMI_LPARM_CPU(lparm, cpu_id); + lparm->kernel_id = CPU_FUNCTION_IMPL; + lparm->synchronous = ATMI_TRUE; + void *kernargs[1]; + //void *arg1 = (void *)&arg; + kernargs[0] = (void *)&arg; + //atmi_task_launch(lparm, main_kernel, kernargs); + + main_wrapper(&arg); + //qthread_fork_syncvar_copyargs(main_wrapper, &arg, sizeof(arg), &exit_ret); + //qthread_syncvar_readFF(NULL, &exit_ret); } void chpl_task_stdModulesInitialized(void) @@ -911,6 +966,7 @@ void chpl_task_addToTaskList(chpl_fn_int_t fid, int lineno, int32_t filename) { + //printf("Adding %d fn to task list\n", fid); chpl_bool serial_state = chpl_task_getSerial(); chpl_fn_p requested_fn = chpl_ftable[fid]; @@ -935,12 +991,24 @@ void chpl_task_addToTaskList(chpl_fn_int_t fid, wrap_callbacks(chpl_task_cb_event_kind_create, arg); - if (subloc == c_sublocid_any) { + int cpu_id = 0;//subloc; + ATMI_LPARM_CPU(lparm, cpu_id); + lparm->kernel_id = CPU_FUNCTION_IMPL; + //lparm->synchronous = ATMI_TRUE; + void *kernargs[1]; + if(!cpu_kernels_initialized[fid]) { + atmi_kernel_create_empty(&cpu_kernels[fid], 1, &arg_size); + atmi_kernel_add_cpu_impl(cpu_kernels[fid], (atmi_generic_fp)chapel_wrapper, CPU_FUNCTION_IMPL); + cpu_kernels_initialized[fid] = 1; + } + kernargs[0] = (void *)arg; + atmi_task_launch(lparm, cpu_kernels[fid], kernargs); + /*if (subloc == c_sublocid_any) { qthread_fork_copyargs(chapel_wrapper, arg, arg_size, NULL); } else { qthread_fork_copyargs_to(chapel_wrapper, arg, arg_size, NULL, (qthread_shepherd_id_t) subloc); - } + }*/ } } @@ -954,6 +1022,7 @@ static inline void taskCallBody(chpl_fn_int_t fid, chpl_fn_name fname, chpl_fn_p c_sublocid_t subloc, chpl_bool serial_state, int lineno, int32_t filename) { + //printf("Adding %d fn to task call body\n", fid); chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg; bundle->serial_state = serial_state; @@ -969,12 +1038,24 @@ static inline void taskCallBody(chpl_fn_int_t fid, chpl_fn_name fname, chpl_fn_p wrap_callbacks(chpl_task_cb_event_kind_create, bundle); - if (subloc < 0) { + int cpu_id = 0;//subloc; + ATMI_LPARM_CPU(lparm, cpu_id); + lparm->kernel_id = CPU_FUNCTION_IMPL; + //lparm->synchronous = ATMI_TRUE; + void *kernargs[1]; + if(!cpu_kernels_initialized[fid]) { + atmi_kernel_create_empty(&cpu_kernels[fid], 1, &arg_size); + atmi_kernel_add_cpu_impl(cpu_kernels[fid], (atmi_generic_fp)chapel_wrapper, CPU_FUNCTION_IMPL); + cpu_kernels_initialized[fid] = 1; + } + kernargs[0] = (void *)arg; + atmi_task_launch(lparm, cpu_kernels[fid], kernargs); + /*if (subloc < 0) { qthread_fork_copyargs(chapel_wrapper, arg, arg_size, NULL); } else { qthread_fork_copyargs_to(chapel_wrapper, arg, arg_size, - NULL, (qthread_shepherd_id_t) subloc); - } + NULL, (qthread_shepherd_id_t) 0); + }*/ } void chpl_task_taskCallFTable(chpl_fn_int_t fid, @@ -1024,20 +1105,13 @@ void chpl_task_startMovedTask(chpl_fn_int_t fid, // Returns '(unsigned int)-1' if called outside of the tasking layer. chpl_taskID_t chpl_task_getId(void) { - chpl_qthread_tls_t *tls = chpl_qthread_get_tasklocal(); - chpl_taskID_t *id_ptr = NULL; - PROFILE_INCR(profile_task_getId,1); - if (tls == NULL) + atmi_task_handle_t t = get_atmi_task_handle(); + if(t == ATMI_NULL_TASK_HANDLE) return (chpl_taskID_t) -1; - id_ptr = &tls->bundle->id; - - if (*id_ptr == chpl_nullTaskID) - *id_ptr = qthread_incr(&next_task_id, 1); - - return *id_ptr; + return t; } void chpl_task_sleep(double secs) @@ -1077,18 +1151,27 @@ void chpl_task_sleep(double secs) /* The get- and setSerial() methods assume the beginning of the task-local * data segment holds a chpl_bool denoting the serial state. */ +#if 0 +chpl_bool get_serial(chpl_taskID_t id) { + if(g_task_map.find(id) == g_task_map.end()) + return true; + else + return g_task_map[id]; +} +#endif + chpl_bool chpl_task_getSerial(void) { - chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); PROFILE_INCR(profile_task_getSerial,1); - + return data->bundle->serial_state; } void chpl_task_setSerial(chpl_bool state) { - chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); data->bundle->serial_state = state; PROFILE_INCR(profile_task_setSerial,1); @@ -1096,14 +1179,14 @@ void chpl_task_setSerial(chpl_bool state) uint32_t chpl_task_getMaxPar(void) { //chpl_internal_error("Qthreads max tasks par asked\n"); - printf("Qthreads max task par asked\n"); + //printf("Qthreads max task par asked\n"); // // We assume here that the caller (in the LocaleModel module code) // is interested in the number of workers on the whole node, and // will decide itself how much parallelism to create across and // within sublocales, if there are any. // - return (uint32_t) qthread_num_workers(); + return (uint32_t) 8;//qthread_num_workers(); } c_sublocid_t chpl_task_getNumSublocales(void) @@ -1145,7 +1228,7 @@ int32_t chpl_task_getNumBlockedTasks(void) uint32_t chpl_task_getNumThreads(void) { - return (uint32_t)qthread_num_workers(); + return (uint32_t) 8;//qthread_num_workers(); } // Ew. Talk about excessive bookkeeping.