diff --git a/make/tasks/Makefile.atmi b/make/tasks/Makefile.atmi new file mode 100644 index 0000000000..00ca53fb62 --- /dev/null +++ b/make/tasks/Makefile.atmi @@ -0,0 +1,19 @@ +# Copyright 2004-2015 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +RUNTIME_INCLS += -I$(QTHREAD_INCLUDE_DIR) +CHPL_MAKE_THREADS=none diff --git a/runtime/Makefile.help b/runtime/Makefile.help index 00e96f04c4..a6d84c1ff5 100644 --- a/runtime/Makefile.help +++ b/runtime/Makefile.help @@ -168,4 +168,7 @@ include $(RUNTIME_ROOT)/make/Makefile.runtime.foot src/tasks/qthreads/Makefile.include: cd src/tasks/qthreads && $(MAKE) copy_qthread_files +src/tasks/atmi/Makefile.include: + cd src/tasks/atmi && $(MAKE) copy_qthread_files + .NOTPARALLEL: diff --git a/runtime/etc/Makefile.tasks-atmi b/runtime/etc/Makefile.tasks-atmi new file mode 100644 index 0000000000..ce328fd06b --- /dev/null +++ b/runtime/etc/Makefile.tasks-atmi @@ -0,0 +1,19 @@ +# Copyright 2004-2015 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
+# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +GEN_CFLAGS += -I$(QTHREAD_INCLUDE_DIR) +GEN_LFLAGS += -L$(QTHREAD_LIB_DIR) -Wl,-rpath,$(QTHREAD_LIB_DIR) diff --git a/runtime/include/tasks/atmi/tasks-atmi.h b/runtime/include/tasks/atmi/tasks-atmi.h new file mode 100644 index 0000000000..58dab0a982 --- /dev/null +++ b/runtime/include/tasks/atmi/tasks-atmi.h @@ -0,0 +1,445 @@ +/************************************************************************** +* Copyright 2011 Sandia Corporation. Under the terms of Contract +* DE-AC04-94AL85000, there is a non-exclusive license for use of this work by +* or on behalf of the U.S. Government. Export of this program may require a +* license from the United States Government +**************************************************************************/ + +/* + * Copyright 2004-2015 Cray Inc. + * Other additional copyright holders may be indicated within. + * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _tasks_qthreads_h_ +#define _tasks_qthreads_h_ + +#include + +#include +#include +#include +#include + +#include "chpltypes.h" +#include "chpl-tasks-prvdata.h" + +#define CHPL_COMM_YIELD_TASK_WHILE_POLLING +void chpl_task_yield(void); + +// For mutexes +// type(s) +// threadlayer_mutex_t +// threadlayer_mutex_p +// functions +// threadlayer_mutex_init() +// threadlayer_mutex_new() +// threadlayer_mutex_lock() +// threadlayer_mutex_unlock() +// +// For thread management +// type(s) +// +// functions +// threadlayer_thread_id() +// threadlayer_thread_cancel() +// threadlayer_thread_join() +// +// For sync variables +// type(s) +// threadlayer_sync_aux_t +// functions +// threadlayer_sync_suspend() +// threadlayer_sync_awaken() +// threadlayer_sync_init() +// threadlayer_sync_destroy() +// +// For task management +// type(s) +// +// functions +// threadlayer_init() +// threadlayer_thread_create() +// threadlayer_pool_suspend() +// threadlayer_pool_awaken() +// threadlayer_get_thread_private_data() +// threadlayer_set_thread_private_data() +// +// The types are declared in the threads-*.h file for each specific +// threading layer, and the callback functions are declared here. The +// interfaces and requirements for these other types and callback +// functions are described elsewhere in this file. +// +// Although the above list may seem long, in practice many of the +// functions are quite simple, and with luck also easily extrapolated +// from what is done for other threading layers. For an example of an +// implementation, see "pthreads" threading. +// + +// +// Type (and default value) used to communicate task identifiers +// between C code and Chapel code in the runtime. 
+// +typedef unsigned int chpl_taskID_t; +#define chpl_nullTaskID QTHREAD_NULL_TASK_ID + +// +// Mutexes +// +typedef syncvar_t chpl_mutex_t; + +// +// Sync variables +// +// The threading layer's threadlayer_sync_aux_t may include any +// additional members the layer needs to support the suspend/awaken +// callbacks efficiently. The FIFO tasking code itself does not +// refer to this type or the tl_aux member at all. +// +typedef struct { + aligned_t lockers_in; + aligned_t lockers_out; + uint_fast32_t uncontested_locks; + int is_full; + syncvar_t signal_full; + syncvar_t signal_empty; +} chpl_sync_aux_t; + +#define chpl_sync_reset(x) qthread_syncvar_empty(&(x)->sync_aux.signal_full) + +#define chpl_read_FE(x) ({ \ + uint64_t y; \ + qthread_syncvar_readFE(&y, &(x)->sync_aux.signal_full); \ + y; }) + +#define chpl_read_FF(x) ({ \ + uint64_t y; \ + qthread_syncvar_readFF(&y, &(x)->sync_aux.signal_full); \ + y; }) + +#define chpl_read_XX(x) ((x)->sync_aux.signal_full.u.s.data) + +#define chpl_write_EF(x, y) do { \ + uint64_t z = (uint64_t)(y); \ + qthread_syncvar_writeEF(&(x)->sync_aux.signal_full, &z); \ +} while(0) + +#define chpl_write_FF(x, y) do { \ + uint64_t z, dummy; \ + z = (uint64_t)(y); \ + qthread_syncvar_readFE(&dummy, &(x)->sync_aux.signal_full); \ + qthread_syncvar_writeF(&(x)->sync_aux.signal_full, &z); \ +} while(0) + +#define chpl_write_XF(x, y) do { \ + uint64_t z = (uint64_t)(y); \ + qthread_syncvar_writeF(&(x)->sync_aux.signal_full, &z); \ +} while(0) + +#define chpl_single_reset(x) qthread_syncvar_empty(&(x)->single_aux.signal_full) + +#define chpl_single_read_FF(x) ({ \ + uint64_t y; \ + qthread_syncvar_readFF(&y, &(x)->single_aux.signal_full); \ + y; }) + +#define chpl_single_write_EF(x, y) do { \ + uint64_t z = (uint64_t)(y); \ + qthread_syncvar_writeEF(&(x)->single_aux.signal_full, &z); \ +} while(0) + +#define chpl_single_read_XX(x) ((x)->single_aux.signal_full.u.s.data) + +// Tasks + +// +// Handy services for threading layer callback 
functions. +// +// The FIFO tasking implementation also provides the following service +// routines that can be used by threading layer callback functions. +// + +// +// The remaining declarations are all for callback functions to be +// provided by the threading layer. +// + +// +// These are called once each, from CHPL_TASKING_INIT() and +// CHPL_TASKING_EXIT(). +// +void threadlayer_init(void); +void threadlayer_exit(void); + +// +// Mutexes +// +/*void qthread_mutex_init(qthread_mutex_p); + * qthread_mutex_p qthread_mutex_new(void); + * void qthread_mutex_lock(qthread_mutex_p); + * void qthread_mutex_unlock(qthread_mutex_p);*/ + +// +// Sync variables +// +// The CHPL_SYNC_WAIT_{FULL,EMPTY}_AND_LOCK() functions should call +// threadlayer_sync_suspend() when a sync variable is not in the desired +// full/empty state. The call will be made with the sync variable's +// mutex held. (Thus, threadlayer_sync_suspend() can dependably tell +// that the desired state must be the opposite of the state it initially +// sees the variable in.) It should return (with the mutex again held) +// as soon as it can once either the sync variable changes to the +// desired state, or (if the given deadline pointer is non-NULL) the +// deadline passes. It can return also early, before either of these +// things occur, with no ill effects. If a deadline is given and it +// does pass, then threadlayer_sync_suspend() must return true; +// otherwise false. +// +// The less the function can execute while waiting for the sync variable +// to change state, and the quicker it can un-suspend when the variable +// does change state, the better overall performance will be. Obviously +// the sync variable's mutex must be unlocked while the routine waits +// for the variable to change state or the deadline to pass, or livelock +// may result. 
+// +// The CHPL_SYNC_MARK_AND_SIGNAL_{FULL,EMPTY}() functions will call +// threadlayer_sync_awaken() every time they are called, not just when +// they change the state of the sync variable. +// +// Threadlayer_sync_{init,destroy}() are called to initialize or +// destroy, respectively, the contents of the tl_aux member of the +// chpl_sync_aux_t for the specific threading layer. +// +/*chpl_bool threadlayer_sync_suspend(chpl_sync_aux_t *s, + * struct timeval *deadline); + * void threadlayer_sync_awaken(chpl_sync_aux_t *s); + * void threadlayer_sync_init(chpl_sync_aux_t *s); + * void threadlayer_sync_destroy(chpl_sync_aux_t *s);*/ + +// +// Task management +// + +// +// The interface for thread creation may need to be extended eventually +// to allow for specifying such things as stack sizes and/or locations. +// +/*int threadlayer_thread_create(threadlayer_threadID_t*, void*(*)(void*), void*);*/ + +// +// Threadlayer_pool_suspend() is called when a thread finds nothing in +// the pool of unclaimed tasks, and so has no work to do. The call will +// be made with the pointed-to mutex held. It should return (with the +// mutex again held) as soon as it can once either the task pool is no +// longer empty or (if the given deadline pointer is non-NULL) the +// deadline passes. It can return also early, before either of these +// things occur, with no ill effects. If a deadline is given and it +// does pass, then threadlayer_pool_suspend() must return true; +// otherwise false. +// +// The less the function can execute while waiting for the pool to +// become nonempty, and the quicker it can un-suspend when that happens, +// the better overall performance will be. +// +// The mutex passed to threadlayer_pool_suspend() is the one that +// provides mutual exclusion for changes to the task pool. Allowing +// access to this mutex simplifies the implementation for certain +// threading layers, such as those based on pthreads condition +// variables. 
However, it also introduces a complication in that it +// allows a threading layer to create deadlock or livelock situations if +// it is not careful. Certainly the mutex must be unlocked while the +// routine waits for the task pool to fill or the deadline to pass, or +// livelock may result. +// +/*chpl_bool threadlayer_pool_suspend(chpl_mutex_t*, struct timeval*); + * void threadlayer_pool_awaken(void);*/ + +// +// Thread private data +// +// These set and get a pointer to thread private data associated with +// each thread. This is for the use of the FIFO tasking implementation +// itself. If the threading layer also needs to store some data private +// to each thread, it must make other arrangements to do so. +// +/*void threadlayer_set_thread_private_data(void*); + * void* threadlayer_get_thread_private_data(void);*/ + + +#ifndef QTHREAD_MULTINODE +extern +#ifdef __cplusplus +"C" +#endif +volatile int chpl_qthread_done_initializing; +#endif + +typedef struct { + c_string task_filename; + int task_lineno; + chpl_taskID_t id; + chpl_bool is_executeOn; + c_sublocid_t requestedSubloc; // requested sublocal for task + chpl_task_prvData_t prvdata; +} chpl_task_prvDataImpl_t; + +// Define PRV_DATA_IMPL_VAL to set up a chpl_task_prvData_t. +#define PRV_DATA_IMPL_VAL(_fn, _ln, _id, _is_execOn, _subloc, _serial) \ + { .task_filename = _fn, \ + .task_lineno = _ln, \ + .id = _id, \ + .is_executeOn = _is_execOn, \ + .requestedSubloc = _subloc, \ + .prvdata = { .serial_state = _serial } } + +typedef struct { + void *fn; + void *args; + chpl_bool countRunning; + chpl_task_prvDataImpl_t chpl_data; +} chpl_qthread_wrapper_args_t; + +// Structure of task-local storage +typedef struct chpl_qthread_tls_s { + /* Task private data: serial state, etc. 
*/ + chpl_task_prvDataImpl_t chpl_data; + /* Reports */ + c_string lock_filename; + size_t lock_lineno; +} chpl_qthread_tls_t; + +extern pthread_t chpl_qthread_process_pthread; +extern pthread_t chpl_qthread_comm_pthread; + +extern chpl_qthread_tls_t chpl_qthread_process_tls; +extern chpl_qthread_tls_t chpl_qthread_comm_task_tls; + +#define CHPL_TASK_STD_MODULES_INITIALIZED chpl_task_stdModulesInitialized +void chpl_task_stdModulesInitialized(void); + +// Wrap qthread_get_tasklocal() and assert that it is always available. +static inline chpl_qthread_tls_t * chpl_qthread_get_tasklocal(void) +{ + chpl_qthread_tls_t* tls; + + if (chpl_qthread_done_initializing) { + tls = (chpl_qthread_tls_t *) + qthread_get_tasklocal(sizeof(chpl_qthread_tls_t)); + if (tls == NULL) { + pthread_t me = pthread_self(); + if (pthread_equal(me, chpl_qthread_comm_pthread)) + tls = &chpl_qthread_comm_task_tls; + else if (pthread_equal(me, chpl_qthread_process_pthread)) + tls = &chpl_qthread_process_tls; + } + assert(tls); + } + else + tls = NULL; + + return tls; +} + +#ifdef CHPL_TASK_GET_PRVDATA_IMPL_DECL +#error "CHPL_TASK_GET_PRVDATA_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_GET_PRVDATA_IMPL_DECL 1 +#endif +static inline chpl_task_prvData_t* chpl_task_getPrvData(void) +{ + chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + if (data) { + return &data->chpl_data.prvdata; + } + assert(data); + return NULL; +} + +#ifdef CHPL_TASK_GETSUBLOC_IMPL_DECL +#error "CHPL_TASK_GETSUBLOC_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_GETSUBLOC_IMPL_DECL 1 +#endif +static inline +c_sublocid_t chpl_task_getSubloc(void) +{ + return (c_sublocid_t) qthread_shep(); +} + +#ifdef CHPL_TASK_SETSUBLOC_IMPL_DECL +#error "CHPL_TASK_SETSUBLOC_IMPL_DECL is already defined!" 
+#else +#define CHPL_TASK_SETSUBLOC_IMPL_DECL 1 +#endif +static inline +void chpl_task_setSubloc(c_sublocid_t subloc) +{ + qthread_shepherd_id_t curr_shep; + + assert(subloc != c_sublocid_none); + + // Only change sublocales if the caller asked for a particular one, + // which is not the current one, and we're a (movable) task. + // + // Note: It's likely that this won't work in all cases where we need + // it. In particular, we envision needing to move execution + // from sublocale to sublocale while initializing the memory + // layer, in order to get the NUMA domain affinity right for + // the subparts of the heap. But this will be happening well + // before tasking init and in any case would be done from the + // main thread of execution, which doesn't have a shepherd. + // The code below wouldn't work in that situation. + if ((curr_shep = qthread_shep()) != NO_SHEPHERD) { + chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + if (data) { + data->chpl_data.requestedSubloc = subloc; + } + + if (subloc != c_sublocid_any && + (qthread_shepherd_id_t) subloc != curr_shep) { + qthread_migrate_to((qthread_shepherd_id_t) subloc); + } + } +} + +#ifdef CHPL_TASK_GETREQUESTEDSUBLOC_IMPL_DECL +#error "CHPL_TASK_GETREQUESTEDSUBLOC_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_GETREQUESTEDSUBLOC_IMPL_DECL 1 +#endif +static inline +c_sublocid_t chpl_task_getRequestedSubloc(void) +{ + chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + if (data) { + return data->chpl_data.requestedSubloc; + } + return c_sublocid_any; +} + +#ifdef CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL +#error "CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL is already defined!" 
+#else +#define CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL 1 +#endif +extern int chpl_qthread_supports_remote_cache; +static inline +int chpl_task_supportsRemoteCache(void) { + return chpl_qthread_supports_remote_cache; +} + +#endif // ifndef _tasks_qthreads_h_ +/* vim:set expandtab: */ diff --git a/runtime/src/tasks/atmi/Makefile b/runtime/src/tasks/atmi/Makefile new file mode 100644 index 0000000000..cb1bf64265 --- /dev/null +++ b/runtime/src/tasks/atmi/Makefile @@ -0,0 +1,39 @@ +# Copyright 2004-2015 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +RUNTIME_ROOT = ../../.. +RUNTIME_SUBDIR = src/tasks/$(CHPL_MAKE_TASKS) + +ifndef CHPL_MAKE_HOME +export CHPL_MAKE_HOME=$(shell pwd)/$(RUNTIME_ROOT)/.. +endif + +include $(RUNTIME_ROOT)/make/Makefile.runtime.head + +TASKS_OBJDIR = $(RUNTIME_OBJDIR) +include Makefile.share + +TARGETS = $(TASKS_OBJS) + +include $(RUNTIME_ROOT)/make/Makefile.runtime.subdirrules + +FORCE: + +# +# standard footer +# +include $(RUNTIME_ROOT)/make/Makefile.runtime.foot diff --git a/runtime/src/tasks/atmi/Makefile.include b/runtime/src/tasks/atmi/Makefile.include new file mode 100644 index 0000000000..a510dc9a60 --- /dev/null +++ b/runtime/src/tasks/atmi/Makefile.include @@ -0,0 +1,29 @@ +# Copyright 2004-2015 Cray Inc. +# Other additional copyright holders may be indicated within. 
+# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +TASKS_SUBDIR = src/tasks/$(CHPL_MAKE_TASKS) + +TASKS_OBJDIR = $(RUNTIME_ROOT)/$(TASKS_SUBDIR)/$(RUNTIME_OBJDIR) + +# +# point to sources under third-party, at least until they're +# contributed back to the usual local directory +# +ALL_SRCS += $(QTHREAD_SUBDIR)/src/interfaces/chapel/*.c \ + $(QTHREAD_SUBDIR)/src/interfaces/chapel/*.h + +include $(RUNTIME_ROOT)/$(TASKS_SUBDIR)/Makefile.share diff --git a/runtime/src/tasks/atmi/Makefile.share b/runtime/src/tasks/atmi/Makefile.share new file mode 100644 index 0000000000..374f9cc760 --- /dev/null +++ b/runtime/src/tasks/atmi/Makefile.share @@ -0,0 +1,25 @@ +# Copyright 2004-2015 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +TASKS_SRCS = + +SVN_SRCS = $(TASKS_SRCS) +SRCS = $(SVN_SRCS) + +TASKS_OBJS = $(TASKS_SRCS:%.c=$(TASKS_OBJDIR)/%.o) + +RUNTIME_CFLAGS += -I$(QTHREAD_INCLUDE) diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c new file mode 100644 index 0000000000..8902c2949e --- /dev/null +++ b/runtime/src/tasks/atmi/tasks-atmi.c @@ -0,0 +1,963 @@ +// +// Qthreads implementation of Chapel tasking interface +// +// Copyright 2011 Sandia Corporation. Under the terms of Contract +// DE-AC04-94AL85000, there is a non-exclusive license for use of this work by +// or on behalf of the U.S. Government. Export of this program may require a +// license from the United States Government +// + +/* + * Copyright 2004-2015 Cray Inc. + * Other additional copyright holders may be indicated within. + * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// For SVID definitions (setenv) +#define _SVID_SOURCE + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "chplrt.h" +#include "chplsys.h" +#include "tasks-qthreads.h" +//#include "tasks-atmi.h" +#include "chplcgfns.h" +#include "chpl-comm.h" +#include "chpl-locale-model.h" +#include "chpl-tasks.h" +#include "chpl-tasks-callbacks-internal.h" +#include "config.h" +#include "error.h" +#include "arg.h" +#include "signal.h" +#include "chplexit.h" +#include +#include +#include +#include +#include +#include +#include +#include + +//#include + +#if 1 +#include "qthread/qthread.h" +#include "qthread/qtimer.h" +#include "qt_feb.h" +#include "qt_syncvar.h" +#include "qt_hash.h" +#include "qt_atomics.h" +#include "qt_shepherd_innards.h" +#include "qt_envariables.h" +#include "qt_debug.h" + +#ifdef QTHREAD_MULTINODE +#include "qthread/spr.h" +#endif /* QTHREAD_MULTINODE */ + +#include +#endif +#ifdef CHAPEL_PROFILE +# define PROFILE_INCR(counter,count) do { (void)qthread_incr(&counter,count); } while (0) + +/* Tasks */ +static aligned_t profile_task_yield = 0; +static aligned_t profile_task_addToTaskList = 0; +static aligned_t profile_task_processTaskList = 0; +static aligned_t profile_task_executeTasksInList = 0; +static aligned_t profile_task_freeTaskList = 0; +static aligned_t profile_task_startMovedTask = 0; +static aligned_t profile_task_getId = 0; +static aligned_t profile_task_sleep = 0; +static aligned_t profile_task_getSerial = 0; +static aligned_t profile_task_setSerial = 0; +static aligned_t profile_task_getCallStackSize = 0; +/* Sync */ +static aligned_t profile_sync_lock= 0; +static aligned_t profile_sync_unlock= 0; +static aligned_t profile_sync_waitFullAndLock= 0; +static aligned_t profile_sync_waitEmptyAndLock= 0; +static aligned_t profile_sync_markAndSignalFull= 0; +static aligned_t profile_sync_markAndSignalEmpty= 0; +static aligned_t profile_sync_isFull= 0; +static aligned_t profile_sync_initAux= 0; +static aligned_t 
profile_sync_destroyAux= 0; + +static void profile_print(void) +{ + /* Tasks */ + fprintf(stderr, "task yield: %lu\n", (unsigned long)profile_task_yield); + fprintf(stderr, "task addToTaskList: %lu\n", (unsigned long)profile_task_addToTaskList); + fprintf(stderr, "task processTaskList: %lu\n", (unsigned long)profile_task_processTaskList); + fprintf(stderr, "task executeTasksInList: %lu\n", (unsigned long)profile_task_executeTasksInList); + fprintf(stderr, "task freeTaskList: %lu\n", (unsigned long)profile_task_freeTaskList); + fprintf(stderr, "task startMovedTask: %lu\n", (unsigned long)profile_task_startMovedTask); + fprintf(stderr, "task getId: %lu\n", (unsigned long)profile_task_getId); + fprintf(stderr, "task sleep: %lu\n", (unsigned long)profile_task_sleep); + fprintf(stderr, "task getSerial: %lu\n", (unsigned long)profile_task_getSerial); + fprintf(stderr, "task setSerial: %lu\n", (unsigned long)profile_task_setSerial); + fprintf(stderr, "task getCallStackSize: %lu\n", (unsigned long)profile_task_getCallStackSize); + /* Sync */ + fprintf(stderr, "sync lock: %lu\n", (unsigned long)profile_sync_lock); + fprintf(stderr, "sync unlock: %lu\n", (unsigned long)profile_sync_unlock); + fprintf(stderr, "sync waitFullAndLock: %lu\n", (unsigned long)profile_sync_waitFullAndLock); + fprintf(stderr, "sync waitEmptyAndLock: %lu\n", (unsigned long)profile_sync_waitEmptyAndLock); + fprintf(stderr, "sync markAndSignalFull: %lu\n", (unsigned long)profile_sync_markAndSignalFull); + fprintf(stderr, "sync markAndSignalEmpty: %lu\n", (unsigned long)profile_sync_markAndSignalEmpty); + fprintf(stderr, "sync isFull: %lu\n", (unsigned long)profile_sync_isFull); + fprintf(stderr, "sync initAux: %lu\n", (unsigned long)profile_sync_initAux); + fprintf(stderr, "sync destroyAux: %lu\n", (unsigned long)profile_sync_destroyAux); +} +#else +# define PROFILE_INCR(counter,count) +#endif /* CHAPEL_PROFILE */ + +#ifndef QTHREAD_MULTINODE +volatile int chpl_qthread_done_initializing; +#endif + +// 
Make qt env sizes uniform. Same as qt, but they use the literal everywhere +#define QT_ENV_S 100 + +// aka chpl_task_list_p +struct chpl_task_list { + chpl_fn_p fun; + void *arg; + c_string filename; + int lineno; + chpl_task_list_p next; +}; + +static aligned_t next_task_id = 1; + +pthread_t chpl_qthread_process_pthread; +pthread_t chpl_qthread_comm_pthread; + +chpl_qthread_tls_t chpl_qthread_process_tls = { + PRV_DATA_IMPL_VAL("
", 0, chpl_nullTaskID, false, + c_sublocid_any_val, false), + NULL, 0 }; + +chpl_qthread_tls_t chpl_qthread_comm_task_tls = { + PRV_DATA_IMPL_VAL("", 0, chpl_nullTaskID, false, + c_sublocid_any_val, false), + NULL, 0 }; + +// +// QTHREADS_SUPPORTS_REMOTE_CACHE is set in the Chapel Qthreads +// Makefile, based on the Qthreads scheduler configuration. +// +int chpl_qthread_supports_remote_cache = QTHREADS_SUPPORTS_REMOTE_CACHE; + +// +// structs chpl_task_prvDataImpl_t, chpl_qthread_wrapper_args_t and +// chpl_qthread_tls_t have been moved to tasks-qthreads.h +// + +// +// chpl_qthread_get_tasklocal() is in tasks-qthreads.h +// + +static syncvar_t exit_ret = SYNCVAR_STATIC_EMPTY_INITIALIZER; + +static volatile chpl_bool canCountRunningTasks = false; + +void chpl_task_yield(void) +{ + PROFILE_INCR(profile_task_yield,1); + if (qthread_shep() == NO_SHEPHERD) { + sched_yield(); + } else { + qthread_yield(); + } +} + +// Sync variables +void chpl_sync_lock(chpl_sync_aux_t *s) +{ + aligned_t l; + chpl_bool uncontested_lock = true; + + PROFILE_INCR(profile_sync_lock, 1); + + // + // To prevent starvation due to never switching away from a task that is + // spinning while doing readXX() on a sync variable, yield if this sync var + // has a "lot" of uncontested locks. Note that the uncontested locks do not + // have to be consecutive. Also note that the number of uncontested locks + // is a lossy counter. Currently a "lot" is defined as ~100 uncontested + // locks, with care taken to not yield on the first uncontested lock. + // + // If real qthreads sync vars were used, it's possible this wouldn't be + // needed. 
+ // + + l = qthread_incr(&s->lockers_in, 1); + + while (l != s->lockers_out) { + uncontested_lock = false; + qthread_yield(); + } + + if (uncontested_lock) { + if ((++s->uncontested_locks & 0x5F) == 0) { + qthread_yield(); + } + } +} + +void chpl_sync_unlock(chpl_sync_aux_t *s) +{ + PROFILE_INCR(profile_sync_unlock, 1); + + qthread_incr(&s->lockers_out, 1); +} + +static inline void about_to_block(int32_t lineno, + c_string filename) +{ + chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + assert(data); + + data->lock_lineno = lineno; + data->lock_filename = filename; +} + +void chpl_sync_waitFullAndLock(chpl_sync_aux_t *s, + int32_t lineno, + c_string filename) +{ + PROFILE_INCR(profile_sync_waitFullAndLock, 1); + + if (blockreport) { about_to_block(lineno, filename); } + chpl_sync_lock(s); + while (s->is_full == 0) { + chpl_sync_unlock(s); + qthread_syncvar_readFE(NULL, &(s->signal_full)); + chpl_sync_lock(s); + } +} + +void chpl_sync_waitEmptyAndLock(chpl_sync_aux_t *s, + int32_t lineno, + c_string filename) +{ + PROFILE_INCR(profile_sync_waitEmptyAndLock, 1); + + if (blockreport) { about_to_block(lineno, filename); } + chpl_sync_lock(s); + while (s->is_full != 0) { + chpl_sync_unlock(s); + qthread_syncvar_readFE(NULL, &(s->signal_empty)); + chpl_sync_lock(s); + } +} + +void chpl_sync_markAndSignalFull(chpl_sync_aux_t *s) // and unlock +{ + PROFILE_INCR(profile_sync_markAndSignalFull, 1); + + qthread_syncvar_fill(&(s->signal_full)); + s->is_full = 1; + chpl_sync_unlock(s); +} + +void chpl_sync_markAndSignalEmpty(chpl_sync_aux_t *s) // and unlock +{ + PROFILE_INCR(profile_sync_markAndSignalEmpty, 1); + + qthread_syncvar_fill(&(s->signal_empty)); + s->is_full = 0; + chpl_sync_unlock(s); +} + +chpl_bool chpl_sync_isFull(void *val_ptr, + chpl_sync_aux_t *s) +{ + PROFILE_INCR(profile_sync_isFull, 1); + + return s->is_full; +} + +void chpl_sync_initAux(chpl_sync_aux_t *s) +{ + PROFILE_INCR(profile_sync_initAux, 1); + + s->lockers_in = 0; + s->lockers_out = 0; 
+ s->is_full = 0; + s->signal_empty = SYNCVAR_EMPTY_INITIALIZER; + s->signal_full = SYNCVAR_EMPTY_INITIALIZER; +} + +void chpl_sync_destroyAux(chpl_sync_aux_t *s) +{ + PROFILE_INCR(profile_sync_destroyAux, 1); +} + +static void chapel_display_thread(qt_key_t addr, + qthread_f f, + void *arg, + void *retloc, + unsigned int thread_id, + void *tls, + void *callarg) +{ + chpl_qthread_tls_t *rep = (chpl_qthread_tls_t *)tls; + + if (rep) { + if ((rep->lock_lineno > 0) && rep->lock_filename) { + fprintf(stderr, "Waiting at: %s:%zu (task %s:%zu)\n", rep->lock_filename, rep->lock_lineno, rep->chpl_data.task_filename, rep->chpl_data.task_lineno); + } else if (rep->lock_lineno == 0 && rep->lock_filename) { + fprintf(stderr, "Waiting for more work (line 0? file:%s) (task %s:%zu)\n", rep->lock_filename, rep->chpl_data.task_filename, rep->chpl_data.task_lineno); + } else if (rep->lock_lineno == 0) { + fprintf(stderr, "Waiting for dependencies (uninitialized task %s:%zu)\n", rep->chpl_data.task_filename, rep->chpl_data.task_lineno); + } + fflush(stderr); + } +} + +static void report_locked_threads(void) +{ + qthread_feb_callback(chapel_display_thread, NULL); + qthread_syncvar_callback(chapel_display_thread, NULL); +} + +static void SIGINT_handler(int sig) +{ + signal(sig, SIG_IGN); + + if (blockreport) { + report_locked_threads(); + } + + if (taskreport) { + fprintf(stderr, "Taskreport is currently unsupported by the qthreads tasking layer.\n"); + // report_all_tasks(); + } + + chpl_exit_any(1); +} + +// Tasks + +#ifndef QTHREAD_MULTINODE +static syncvar_t canexit = SYNCVAR_STATIC_EMPTY_INITIALIZER; +static volatile int done_finalizing = 0; + +static void *initializer(void *junk) +{ + qthread_initialize(); + MACHINE_FENCE; + chpl_qthread_done_initializing = 1; + + qthread_syncvar_readFF(NULL, &canexit); + + qthread_finalize(); + MACHINE_FENCE; + done_finalizing = 1; + return NULL; +} +#endif /* ! QTHREAD_MULTINODE */ + +// Helper function to set a qthreads env var. 
This is meant to mirror setenv +// functionality, but qthreads has two environment variables for every setting: +// a QT_ and a QTHREAD_ version. We often forget to think about both so this +// wraps the overriding logic. In verbose mode it prints out if we overrode +// values, or if we were prevented from setting values because they existed +// (and override was 0.) +static void chpl_qt_setenv(char* var, char* val, int32_t override) { + char qt_env[QT_ENV_S] = { 0 }; + char qthread_env[QT_ENV_S] = { 0 }; + char *qt_val; + char *qthread_val; + chpl_bool eitherSet = false; + + strncpy(qt_env, "QT_", sizeof(qt_env)); + strncat(qt_env, var, sizeof(qt_env) - 1); + + strncpy(qthread_env, "QTHREAD_", sizeof(qthread_env)); + strncat(qthread_env, var, sizeof(qthread_env) - 1); + + qt_val = getenv(qt_env); + qthread_val = getenv(qthread_env); + eitherSet = (qt_val != NULL || qthread_val != NULL); + + if (override || !eitherSet) { + if (verbosity >= 2 && override && eitherSet) { + printf("QTHREADS: Overriding the value of %s and %s " + "with %s\n", qt_env, qthread_env, val); + } + (void) setenv(qt_env, val, 1); + (void) setenv(qthread_env, val, 1); + } else if (verbosity >= 2) { + char* set_env = NULL; + char* set_val = NULL; + if (qt_val != NULL) { + set_env = qt_env; + set_val = qt_val; + } else { + set_env = qthread_env; + set_val = qthread_val; + } + printf("QTHREADS: Not setting %s to %s because %s is set to %s and " + "overriding was not requested\n", qt_env, val, set_env, set_val); + } +} + +// Determine the number of workers based on environment settings. If a user set +// HWPAR, they are saying they want to use HWPAR many workers, but let the +// runtime figure out the details. If they explicitly set NUM_SHEPHERDS and/or +// NUM_WORKERS_PER_SHEPHERD then they must have specific reasons for doing so. 
// Returns 0 if no Qthreads env vars related to the number of threads were set,
// what HWPAR was set to if it was set, or -1 if NUM_SHEP and/or NUM_WPS were
// set since we can't figure out before Qthreads init what this will actually
// turn into without duplicating Qthreads logic (-1 is a sentinel for don't
// adjust the values, and give them as is to Qthreads.)
static int32_t chpl_qt_getenv_num_workers() {
    int32_t hwpar;
    int32_t num_wps;
    int32_t num_sheps;

    hwpar = qt_internal_get_env_num("HWPAR", 0, 0);
    num_wps = qt_internal_get_env_num("NUM_WORKERS_PER_SHEPHERD", 0, 0);
    num_sheps = qt_internal_get_env_num("NUM_SHEPHERDS", 0, 0);

    // HWPAR wins; a specific shepherd/worker layout is the "expert" sentinel.
    if (hwpar) {
        return hwpar;
    } else if (num_wps || num_sheps) {
        return -1;
    }

    return 0;
}


// Sets up and returns the amount of hardware parallelism to use, limited to
// maxThreads. Returns -1 if we did not setup parallelism because a user
// explicitly requested a specific layout from qthreads.
static int32_t setupAvailableParallelism(int32_t maxThreads) {
    int32_t numThreadsPerLocale;
    int32_t qtEnvThreads;
    int32_t hwpar;
    char newenv_workers[QT_ENV_S] = { 0 };

    // Experience has shown that Qthreads generally performs best with
    // num_workers = numCores (and thus worker_unit = core) but if the user has
    // explicitly requested more threads through the chapel or Qthread env
    // vars, we override the default.
    numThreadsPerLocale = chpl_task_getenvNumThreadsPerLocale();
    qtEnvThreads = chpl_qt_getenv_num_workers();
    hwpar = 0;

    // User set chapel level env var (CHPL_RT_NUM_THREADS_PER_LOCALE)
    // This is limited to the number of logical CPUs on the node.
    if (numThreadsPerLocale != 0) {
        int32_t numPUsPerLocale;

        hwpar = numThreadsPerLocale;

        numPUsPerLocale = chpl_getNumLogicalCpus(true);
        if (0 < numPUsPerLocale && numPUsPerLocale < hwpar) {
            if (verbosity >= 2) {
                printf("QTHREADS: Reduced numThreadsPerLocale=%d to %d "
                       "to prevent oversubscription of the system.\n",
                       hwpar, numPUsPerLocale);
            }

            // Do not oversubscribe the system, use all available resources.
            hwpar = numPUsPerLocale;
        }
    }
    // User set qthreads level env var
    // (HWPAR or (NUM_SHEPHERDS and NUM_WORKERS_PER_SHEPHERD))
    else if (qtEnvThreads != 0) {
        hwpar = qtEnvThreads;
    }
    // User did not set chapel or qthreads vars -- our default
    else {
        hwpar = chpl_getNumPhysicalCpus(true);
    }

    // hwpar will only be <= 0 if the user set QT_NUM_SHEPHERDS and/or
    // QT_NUM_WORKERS_PER_SHEPHERD in which case we assume as "expert" user and
    // don't impose any thread limits or set worker_unit.
    if (hwpar > 0) {
        // Limit the parallelism to the maximum imposed by the comm layer.
        if (0 < maxThreads && maxThreads < hwpar) {
            hwpar = maxThreads;
        }

        // If there is more parallelism requested than the number of cores, set the
        // worker unit to pu, otherwise core.
        if (hwpar > chpl_getNumPhysicalCpus(true)) {
            chpl_qt_setenv("WORKER_UNIT", "pu", 0);
        } else {
            chpl_qt_setenv("WORKER_UNIT", "core", 0);
        }

        // Unset relevant Qthreads environment variables.
        qt_internal_unset_envstr("HWPAR");
        qt_internal_unset_envstr("NUM_SHEPHERDS");
        qt_internal_unset_envstr("NUM_WORKERS_PER_SHEPHERD");

        snprintf(newenv_workers, sizeof(newenv_workers), "%i", (int)hwpar);
        // Under a single-worker threadqueue policy, express the parallelism
        // as N shepherds x 1 worker; otherwise just hand Qthreads HWPAR.
        if (THREADQUEUE_POLICY_TRUE == qt_threadqueue_policy(SINGLE_WORKER)) {
            chpl_qt_setenv("NUM_SHEPHERDS", newenv_workers, 1);
            chpl_qt_setenv("NUM_WORKERS_PER_SHEPHERD", "1", 1);
        } else {
            chpl_qt_setenv("HWPAR", newenv_workers, 1);
        }
    }
    return hwpar;
}

// Configure Qthreads guard pages and stack sizes from the Chapel environment.
// hwpar is the parallelism chosen by setupAvailableParallelism (or -1).
static void setupCallStacks(int32_t hwpar) {
    size_t callStackSize;

    // If the user compiled with no stack checks (either explicitly or
    // implicitly) turn off qthread guard pages. TODO there should also be a
    // chpl level env var backing this at runtime (can be the same var.)
    // Also turn off guard pages if the heap page size isn't the same as
    // the system page size, because when that's the case we can reliably
    // make the guard pages un-referenceable. (This typically arises when
    // the heap is on hugepages, as is often the case on Cray systems.)
    //
    // Note that we won't override an explicit setting of QT_GUARD_PAGES
    // in the former case, but we do in the latter case.
    if (CHPL_STACK_CHECKS == 0) {
        chpl_qt_setenv("GUARD_PAGES", "false", 0);
    }
    else if (chpl_getHeapPageSize() != chpl_getSysPageSize()) {
        chpl_qt_setenv("GUARD_PAGES", "false", 1);
    }

    // Precedence (high-to-low):
    // 1) Chapel environment (CHPL_RT_CALL_STACK_SIZE)
    // 2) QTHREAD_STACK_SIZE
    // 3) Chapel default
    if ((callStackSize = chpl_task_getEnvCallStackSize()) > 0 ||
        (qt_internal_get_env_num("STACK_SIZE", 0, 0) == 0 &&
         (callStackSize = chpl_task_getDefaultCallStackSize()) > 0)) {
        char newenv_stack[QT_ENV_S];
        snprintf(newenv_stack, sizeof(newenv_stack), "%zu", callStackSize);
        chpl_qt_setenv("STACK_SIZE", newenv_stack, 1);

        // Qthreads sets up memory pools expecting the item_size to be small.
        // Stacks are allocated in this manner too, but our default stack size
        // is quite large, so we limit the max memory allocated for a pool. We
        // default to a multiple of callStackSize and hwpar, with the thought
        // that available memory is generally proportional to the amount of
        // parallelism. For some architectures, this isn't true so we set a max
        // upper bound. And if the callStackSize is small, we don't want to
        // limit all qthreads pool allocations to a small value, so we have a
        // lower bound as well. Note that qthread stacks are slightly larger
        // than specified to store a book keeping structure and possibly guard
        // pages, so we thrown an extra MB.
        if (hwpar > 0) {
            const size_t oneMB = 1024 * 1024;
            const size_t allocSizeLowerBound = 33 * oneMB;
            const size_t allocSizeUpperBound = 513 * oneMB;
            size_t maxPoolAllocSize;
            char newenv_alloc[QT_ENV_S];

            // Clamp 2 * hwpar * stackSize + 1MB into [33MB, 513MB].
            maxPoolAllocSize = 2 * hwpar * callStackSize + oneMB;
            if (maxPoolAllocSize < allocSizeLowerBound) {
                maxPoolAllocSize = allocSizeLowerBound;
            } else if (maxPoolAllocSize > allocSizeUpperBound) {
                maxPoolAllocSize = allocSizeUpperBound;
            }
            snprintf(newenv_alloc, sizeof(newenv_alloc), "%zu", maxPoolAllocSize);
            chpl_qt_setenv("MAX_POOL_ALLOC_SIZE", newenv_alloc, 0);
        }
    }
}

// Tasking-layer startup: record the process pthread, size the worker pool and
// stacks from the environment, spin up Qthreads on a helper pthread (see
// initializer above), and wait until it reports initialization is complete.
void chpl_task_init(void)
{
    int32_t commMaxThreads;
    int32_t hwpar;
    pthread_t initer;

    chpl_qthread_process_pthread = pthread_self();
    chpl_qthread_process_tls.chpl_data.id = qthread_incr(&next_task_id, 1);

    commMaxThreads = chpl_comm_getMaxThreads();

    // Setup hardware parallelism, the stack size, and stack guards
    hwpar = setupAvailableParallelism(commMaxThreads);
    setupCallStacks(hwpar);

    if (verbosity >= 2) { chpl_qt_setenv("INFO", "1", 0); }

    // Initialize qthreads
    pthread_create(&initer, NULL, initializer, NULL);
    while (chpl_qthread_done_initializing == 0) SPINLOCK_BODY();

    // Now that Qthreads is up and running, do a sanity check and make sure
    // that the number of workers is less than any comm layer limit. This is
    // mainly need for the case where a user set QT_NUM_SHEPHERDS and/or
    // QT_NUM_WORKERS_PER_SHEPHERD in which case we don't impose any limits on
    // the number of threads qthreads creates beforehand
    assert(0 == commMaxThreads || qthread_num_workers() < commMaxThreads);

    if (blockreport || taskreport) {
        if (signal(SIGINT, SIGINT_handler) == SIG_ERR) {
            perror("Could not register SIGINT handler");
        }
    }
}

// Tasking-layer shutdown: from outside a shepherd, release the initializer
// pthread (fill 'canexit') and spin until finalization completes; from inside
// a task, just release the main task's exit syncvar.
void chpl_task_exit(void)
{
#ifdef CHAPEL_PROFILE
    profile_print();
#endif /* CHAPEL_PROFILE */

#ifdef QTHREAD_MULTINODE
#else
    if (qthread_shep() == NO_SHEPHERD) {
        /* sometimes, tasking is told to shutdown even though it hasn't been
         * told to start yet */
        if (chpl_qthread_done_initializing == 1) {
            qthread_syncvar_fill(&canexit);
            while (done_finalizing == 0) SPINLOCK_BODY();
        }
    } else {
        qthread_syncvar_fill(&exit_ret);
    }
#endif /* QTHREAD_MULTINODE */
}

// Fire the registered task callbacks for 'event_kind', lazily assigning a
// task id the first time any callback needs one.
static inline void wrap_callbacks(chpl_task_cb_event_kind_t event_kind,
                                  chpl_task_prvDataImpl_t *chpl_data) {
    if (chpl_task_have_callbacks(event_kind)) {
        if (chpl_data->id == chpl_nullTaskID)
            chpl_data->id = qthread_incr(&next_task_id, 1);
        chpl_task_do_callbacks(event_kind,
                               chpl_data->task_filename,
                               chpl_data->task_lineno,
                               chpl_data->id,
                               chpl_data->is_executeOn);
    }
}

// Qthread entry point for every Chapel task: install task-local data, run
// begin/end callbacks around the user function, and maintain the running-task
// count when the fork requested it.
static aligned_t chapel_wrapper(void *arg)
{
    chpl_qthread_wrapper_args_t *rarg = arg;
    chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();

    data->chpl_data = rarg->chpl_data;
    data->lock_filename = NULL;
    data->lock_lineno = 0;

    if (rarg->countRunning) {
        chpl_taskRunningCntInc(0, NULL);
    }

    wrap_callbacks(chpl_task_cb_event_kind_begin, &data->chpl_data);

    (*(chpl_fn_p)(rarg->fn))(rarg->args);

    wrap_callbacks(chpl_task_cb_event_kind_end, &data->chpl_data);

    if (rarg->countRunning) {
        chpl_taskRunningCntDec(0, NULL);
    }

    return 0;
}

// Function+argument pair handed to the comm task's pthread.
typedef struct {
    chpl_fn_p fn;
    void *arg;
}
comm_task_wrapper_info_t;

// pthread entry point for the communication task: pin to the last CPU and
// invoke the requested function.  Runs outside any Qthreads shepherd.
static void *comm_task_wrapper(void *arg)
{
    comm_task_wrapper_info_t *rarg = arg;
    chpl_moveToLastCPU();
    (*(chpl_fn_p)(rarg->fn))(rarg->arg);
    return 0;
}

// Start the main task.
//
// Warning: this method is not called within a Qthread task context. Do
// not use methods that require task context (e.g., task-local storage).
void chpl_task_callMain(void (*chpl_main)(void))
{
    // NOTE(review): the filename literal passed to PRV_DATA_IMPL_VAL below
    // appears truncated/garbled in this copy -- confirm against the original
    // qthreads tasking shim before relying on it.
    chpl_qthread_wrapper_args_t wrapper_args =
        {chpl_main, NULL, false,
         PRV_DATA_IMPL_VAL("", 0,
                           chpl_qthread_process_tls.chpl_data.id, false,
                           c_sublocid_any_val, false) };

    qthread_debug(CHAPEL_CALLS, "[%d] begin chpl_task_callMain()\n", chpl_nodeID);

#ifdef QTHREAD_MULTINODE
    qthread_debug(CHAPEL_BEHAVIOR, "[%d] calling spr_unify\n", chpl_nodeID);
    int const rc = spr_unify();
    assert(SPR_OK == rc);
#endif /* QTHREAD_MULTINODE */

    wrap_callbacks(chpl_task_cb_event_kind_create, &wrapper_args.chpl_data);

    // Fork the main task and block until it fills exit_ret (see
    // chpl_task_exit's in-task branch).
    qthread_fork_syncvar(chapel_wrapper, &wrapper_args, &exit_ret);
    qthread_syncvar_readFF(NULL, &exit_ret);

    qthread_debug(CHAPEL_BEHAVIOR, "[%d] main task finished\n", chpl_nodeID);
    qthread_debug(CHAPEL_CALLS, "[%d] end chpl_task_callMain()\n", chpl_nodeID);
}

void chpl_task_stdModulesInitialized(void)
{
    //
    // It's not safe to call the module code to count the main task as
    // running until after the modules have been initialized. That's
    // when this function is called, so now count the main task.
    //
    canCountRunningTasks = true;
    chpl_taskRunningCntInc(0, NULL);
}

// Create the communication task on its own pthread (it must not run inside a
// Qthreads shepherd).  Returns the pthread_create() result, or 0 in the
// multinode build where no comm task is created here.
int chpl_task_createCommTask(chpl_fn_p fn,
                             void *arg)
{
#ifndef QTHREAD_MULTINODE
    //
    // The wrapper info must be static because it won't be referred to
    // until the new pthread calls comm_task_wrapper(). And, it is
    // safe for it to be static because we will be called at most once
    // on each node.
    //
    static
    comm_task_wrapper_info_t wrapper_info;
    wrapper_info.fn = fn;
    wrapper_info.arg = arg;
    return pthread_create(&chpl_qthread_comm_pthread,
                          NULL, comm_task_wrapper, &wrapper_info);
#else
    return 0;
#endif
}

// Launch (or, under serial semantics, directly run) a task produced by a
// begin/cobegin/coforall statement.  The task_list machinery is unused by
// this layer; tasks are forked to Qthreads immediately.
void chpl_task_addToTaskList(chpl_fn_int_t fid,
                             void *arg,
                             c_sublocid_t subloc,
                             chpl_task_list_p *task_list,
                             int32_t task_list_locale,
                             chpl_bool is_begin_stmt,
                             int lineno,
                             c_string filename)
{
    chpl_bool serial_state = chpl_task_getSerial();

    assert(subloc != c_sublocid_none);

    PROFILE_INCR(profile_task_addToTaskList,1);

    if (serial_state) {
        // call the function directly.
+ (chpl_ftable[fid])(arg); + } else { + chpl_qthread_wrapper_args_t wrapper_args = + {chpl_ftable[fid], arg, false, + PRV_DATA_IMPL_VAL(filename, lineno, chpl_nullTaskID, false, + subloc, serial_state) }; + + wrap_callbacks(chpl_task_cb_event_kind_create, + &wrapper_args.chpl_data); + + if (subloc == c_sublocid_any) { + qthread_fork_copyargs(chapel_wrapper, &wrapper_args, + sizeof(chpl_qthread_wrapper_args_t), NULL); + } else { + qthread_fork_copyargs_to(chapel_wrapper, &wrapper_args, + sizeof(chpl_qthread_wrapper_args_t), NULL, + (qthread_shepherd_id_t) subloc); + } + } +} + +void chpl_task_processTaskList(chpl_task_list_p task_list) +{ + PROFILE_INCR(profile_task_processTaskList,1); +} + +void chpl_task_executeTasksInList(chpl_task_list_p task_list) +{ + PROFILE_INCR(profile_task_executeTasksInList,1); +} + +void chpl_task_freeTaskList(chpl_task_list_p task_list) +{ + PROFILE_INCR(profile_task_freeTaskList,1); +} + +void chpl_task_startMovedTask(chpl_fn_p fp, + void *arg, + c_sublocid_t subloc, + chpl_taskID_t id, + chpl_bool serial_state) +{ + assert(subloc != c_sublocid_none); + assert(id == chpl_nullTaskID); + + chpl_qthread_wrapper_args_t wrapper_args = + {fp, arg, canCountRunningTasks, + PRV_DATA_IMPL_VAL("", 0, chpl_nullTaskID, true, + subloc, serial_state) }; + + PROFILE_INCR(profile_task_startMovedTask,1); + + wrap_callbacks(chpl_task_cb_event_kind_create, &wrapper_args.chpl_data); + + if (subloc == c_sublocid_any) { + qthread_fork_copyargs(chapel_wrapper, &wrapper_args, + sizeof(chpl_qthread_wrapper_args_t), NULL); + } else { + qthread_fork_copyargs_to(chapel_wrapper, &wrapper_args, + sizeof(chpl_qthread_wrapper_args_t), NULL, + (qthread_shepherd_id_t) subloc); + } +} + +// +// chpl_task_getSubloc() is in tasks-qthreads.h +// + +// +// chpl_task_setSubloc() is in tasks-qthreads.h +// + +// +// chpl_task_getRequestedSubloc() is in tasks-qthreads.h +// + + +// Returns '(unsigned int)-1' if called outside of the tasking layer. 
+chpl_taskID_t chpl_task_getId(void) +{ + chpl_qthread_tls_t * tls = chpl_qthread_get_tasklocal(); + + PROFILE_INCR(profile_task_getId,1); + + if (tls == NULL) + return (chpl_taskID_t) -1; + + if (tls->chpl_data.id == chpl_nullTaskID) + tls->chpl_data.id = qthread_incr(&next_task_id, 1); + + return tls->chpl_data.id; +} + +void chpl_task_sleep(int secs) +{ + if (qthread_shep() == NO_SHEPHERD) { + sleep(secs); + } else { + qtimer_t t = qtimer_create(); + qtimer_start(t); + do { + qthread_yield(); + qtimer_stop(t); + } while (qtimer_secs(t) < secs); + } +} + +/* The get- and setSerial() methods assume the beginning of the task-local + * data segment holds a chpl_bool denoting the serial state. */ +chpl_bool chpl_task_getSerial(void) +{ + chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + + PROFILE_INCR(profile_task_getSerial,1); + + return data->chpl_data.prvdata.serial_state; +} + +void chpl_task_setSerial(chpl_bool state) +{ + chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + data->chpl_data.prvdata.serial_state = state; + + PROFILE_INCR(profile_task_setSerial,1); +} + +uint32_t chpl_task_getMaxPar(void) { + // + // We assume here that the caller (in the LocaleModel module code) + // is interested in the number of workers on the whole node, and + // will decide itself how much parallelism to create across and + // within sublocales, if there are any. + // + return (uint32_t) qthread_num_workers(); +} + +c_sublocid_t chpl_task_getNumSublocales(void) +{ + // FIXME: What we really want here is the number of NUMA + // sublocales we are supporting. For now we use the number of + // shepherds as a proxy for that. 
    return (c_sublocid_t) qthread_num_shepherds();
}

size_t chpl_task_getCallStackSize(void)
{
    PROFILE_INCR(profile_task_getCallStackSize,1);

    return qthread_readstate(STACK_SIZE);
}

// XXX: Should probably reflect all shepherds
uint32_t chpl_task_getNumQueuedTasks(void)
{
    return qthread_readstate(NODE_BUSYNESS);
}

// Not supported by this layer; querying it is an internal error.
uint32_t chpl_task_getNumRunningTasks(void)
{
    chpl_internal_error("chpl_task_getNumRunningTasks() called");
    return 1;
}

int32_t chpl_task_getNumBlockedTasks(void)
{
    // This isn't accurate, but in the absence of better information
    // it's the best we can do.
    return 0;
}

// Threads

uint32_t chpl_task_getNumThreads(void)
{
    return (uint32_t)qthread_num_workers();
}

// Ew. Talk about excessive bookkeeping.
uint32_t chpl_task_getNumIdleThreads(void)
{
    return 0;
}

/* vim:set expandtab: */
diff --git a/util/printchplenv b/util/printchplenv
index e66b1d2694..5ad79dff86 100755
--- a/util/printchplenv
+++ b/util/printchplenv
@@ -175,6 +175,8 @@ def print_mode(mode='list', anonymize=False):
                                                          mode, filters=('runtime',))
     if tasks == 'qthreads':
         link_args_3p.extend(chpl_3p_qthreads_configs.get_link_args())
+    if tasks == 'atmi':
+        link_args_3p.extend(chpl_3p_qthreads_configs.get_link_args())

     print_var(' CHPL_RE2_UNIQ_CFG_PATH', chpl_3p_re2_configs.get_uniq_cfg_path(),
diff --git a/util/setchplenv_hsa.bash b/util/setchplenv_hsa.bash
index 3625a4657f..0dee283cc9 100644
--- a/util/setchplenv_hsa.bash
+++ b/util/setchplenv_hsa.bash
@@ -61,8 +61,8 @@ export CHPL_COMM=none
 echo " to none"

 echo -n "Setting CHPL_TASKS"
-export CHPL_TASKS=qthreads
-echo " to qthreads"
+export CHPL_TASKS=atmi
+echo " to atmi"

 echo -n "Setting CHPL_ATOMICS"
 export CHPL_ATOMICS=intrinsics
@@ -92,6 +92,10 @@ echo -n "Disabling NUMA"
 export CHPL_HWLOC_CFG_OPTIONS=" --disable-libnuma"
 echo " done"

+echo -n "Setting CHPL_HWLOC"
+export CHPL_HWLOC=hwloc
+echo " to hwloc"
+
 echo -n "Disabling LLVM support"
 export CHPL_LLVM=none
 echo " done"