diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b8a60a1..814e6357 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ # CMake build for Cheetah. -cmake_minimum_required(VERSION 3.4.3) +cmake_minimum_required(VERSION 3.9) if(POLICY CMP0068) cmake_policy(SET CMP0068 NEW) @@ -16,11 +16,11 @@ set(CMAKE_MODULE_PATH # Check if cheetah is built as a standalone project. if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR CHEETAH_STANDALONE_BUILD) - project(cheetah CXX C) + project(Cheetah CXX C) set_property(GLOBAL PROPERTY USE_FOLDERS ON) - set(PACKAGE_NAME cheetah) - set(PACKAGE_VERSION 9.0.1) + set(PACKAGE_NAME Cheetah) + set(PACKAGE_VERSION 10.0.1) set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}") set(PACKAGE_BUGREPORT "bugs@opencilk.org") @@ -56,6 +56,7 @@ set(CHEETAH_SYSROOT "" CACHE PATH "Sysroot for cross compiling.") option(CHEETAH_ENABLE_SHARED "Build cheetah as a shared library." ON) option(CHEETAH_ENABLE_STATIC "Build cheetah as a static library." ON) +option(CHEETAH_ENABLE_BITCODE_ABI "Build cheetah ABI as LLVM bitcode." ON) cmake_dependent_option(CHEETAH_INSTALL_STATIC_LIBRARY "Install the static cheetah library." ON @@ -63,6 +64,9 @@ cmake_dependent_option(CHEETAH_INSTALL_STATIC_LIBRARY cmake_dependent_option(CHEETAH_INSTALL_SHARED_LIBRARY "Install the shared cheetah library." ON "CHEETAH_ENABLE_SHARED;CHEETAH_INSTALL_LIBRARY" OFF) +cmake_dependent_option(CHEETAH_INSTALL_BITCODE_ABI + "Install the cheetah ABI LLVM bitcode." ON + "CHEETAH_ENABLE_BITCODE_ABI;CHEETAH_INSTALL_LIBRARY" OFF) set(CHEETAH_ABI_VERSION "1" CACHE STRING "ABI version of cheetah. Defaults to 1.") @@ -71,9 +75,10 @@ if (NOT CHEETAH_ENABLE_SHARED AND NOT CHEETAH_ENABLE_STATIC) endif() # Target options -------------------------------------------------------------- -set(CHEETAH_SYSROOT "" CACHE STRING "Use alternate sysroot.") -set(CHEETAH_GCC_TOOLCHAIN "" CACHE STRING "Use alternate GCC toolchain.") -set(CHEETAH_MIN_OSX_VERSION 10.9) + +# Default minimum OSX version to support, if +# CMAKE_OSX_DEPLOYMENT_TARGET is not specified +set(CHEETAH_MIN_OSX_VERSION 10.14) #=============================================================================== # Configure System @@ -126,13 +131,15 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CHEETAH_LIBRARY_DIR}) set(CHEETAH_C_FLAGS "") set(CHEETAH_CXX_FLAGS "") set(CHEETAH_COMPILE_FLAGS "") -if (APPLE) - list(APPEND CHEETAH_COMPILE_FLAGS -mmacosx-version-min=${CHEETAH_MIN_OSX_VERSION}) -endif() set(CHEETAH_COMPILE_DEFS "") set(CHEETAH_LINK_FLAGS "") set(CHEETAH_LIBRARIES "") +if (APPLE AND NOT CMAKE_OSX_DEPLOYMENT_TARGET) + list(APPEND CHEETAH_COMPILE_FLAGS -mmacosx-version-min=${CHEETAH_MIN_OSX_VERSION}) + list(APPEND CHEETAH_LINK_FLAGS -mmacosx-version-min=${CHEETAH_MIN_OSX_VERSION}) +endif() + # Include macros for adding and removing cheetah flags. include(HandleCheetahFlags) @@ -164,6 +171,10 @@ endif() # Configure compiler. 
include(config-ix) +if (APPLE AND CHEETAH_HAS_APP_EXTENSION) + list(APPEND CHEETAH_LINK_FLAGS "-fapplication-extension") +endif() + if (CHEETAH_USE_COMPILER_RT) list(APPEND CHEETAH_LINK_FLAGS "--rtlib=compiler-rt") endif() diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake index aaa7039a..9c1a5714 100644 --- a/cmake/config-ix.cmake +++ b/cmake/config-ix.cmake @@ -2,6 +2,14 @@ include(CheckLibraryExists) include(CheckCCompilerFlag) include(CheckCXXCompilerFlag) +include(CMakePushCheckState) +function(check_linker_flag flag out_var) + cmake_push_check_state() + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${flag}") + check_cxx_compiler_flag("" ${out_var}) + cmake_pop_check_state() +endfunction() + check_library_exists(c fopen "" CHEETAH_HAS_C_LIB) if (NOT CHEETAH_USE_COMPILER_RT) check_library_exists(gcc_s __gcc_personality_v0 "" CHEETAH_HAS_GCC_S_LIB) @@ -14,3 +22,9 @@ check_library_exists(rt clock_gettime "" CHEETAH_HAS_RT_LIB) # Check compiler flags check_c_compiler_flag(-fomit-frame-pointer CHEETAH_HAS_FOMIT_FRAME_POINTER_FLAG) +check_c_compiler_flag("-mavx -Werror" CHEETAH_HAS_MAVX_FLAG) +check_c_compiler_flag("-march=sandybridge -Werror" CHEETAH_HAS_MARCH_SANDYBRIDGE_FLAG) + +if (APPLE) + check_linker_flag("-fapplication-extension" CHEETAH_HAS_APP_EXTENSION) +endif() diff --git a/config.mk b/config.mk index 1c2d3cd5..65fded51 100644 --- a/config.mk +++ b/config.mk @@ -2,6 +2,7 @@ COMPILER_BASE= CC=$(COMPILER_BASE)clang CXX=$(COMPILER_BASE)clang++ LINK_CC=$(CC) +LLVM_LINK=$(COMPILER_BASE)llvm-link # ABI_DEF=-DOPENCILK_ABI # If use cheetah diff --git a/handcomp_test/Makefile b/handcomp_test/Makefile index ef86e5b6..18bf66cd 100644 --- a/handcomp_test/Makefile +++ b/handcomp_test/Makefile @@ -19,7 +19,7 @@ TIMING_COUNT := 1 all: $(TESTS) -$(TESTS): %: %.o ktiming.o getoptions.o +$(TESTS): %: %.o ktiming.o getoptions.o ZERO.o $(CC) $^ -o $@ $(RTS_LIBS) -lrt -lpthread -lm %.o: %.c @@ -28,18 +28,18 @@ $(TESTS): %: %.o ktiming.o getoptions.o memcheck: $(MAKE) clean; $(MAKE) > /dev/null date - valgrind ./fib --cheetah-nproc 8 26 - valgrind ./mm_dac --cheetah-nproc 8 -n 512 - valgrind ./cilksort --cheetah-nproc 8 -n 3000000 - valgrind ./nqueens --cheetah-nproc 8 10 + CILK_NWORKERS=8 valgrind ./fib 26 + CILK_NWORKERS=8 valgrind ./mm_dac -n 512 + CILK_NWORKERS=8 valgrind ./cilksort -n 3000000 + CILK_NWORKERS=8 valgrind ./nqueens 10 date check: $(MAKE) clean; $(MAKE) TIMING_COUNT=5 > /dev/null - ./fib --cheetah-nproc $(MANYPROC) 40 - ./mm_dac --cheetah-nproc $(MANYPROC) -n 1024 -c - ./cilksort --cheetah-nproc $(MANYPROC) -n 30000000 -c - ./nqueens --cheetah-nproc $(MANYPROC) 14 + CILK_NWORKERS=$(MANYPROC) ./fib 40 + CILK_NWORKERS=$(MANYPROC) ./mm_dac -n 1024 -c + CILK_NWORKERS=$(MANYPROC) ./cilksort -n 30000000 -c + CILK_NWORKERS=$(MANYPROC) ./nqueens 14 clean: rm -f *.o *~ $(TESTS) core.* diff --git a/handcomp_test/cilksort.c b/handcomp_test/cilksort.c index 4b1e7fb9..8eab0774 100644 --- a/handcomp_test/cilksort.c +++ b/handcomp_test/cilksort.c @@ -59,10 +59,12 @@ #include #include "../runtime/cilk2c.h" +#include "../runtime/cilk2c_inlined.c" #include "ktiming.h" #include "getoptions.h" extern size_t ZERO; +void __attribute__((weak)) dummy(void *p) { return; } #ifndef TIMING_COUNT #define TIMING_COUNT 0 @@ -346,7 +348,7 @@ void cilkmerge(ELM *low1, ELM *high1, return; } - alloca(ZERO); + dummy(alloca(ZERO)); __cilkrts_stack_frame sf; __cilkrts_enter_frame(&sf); @@ -383,7 +385,8 @@ void cilkmerge(ELM *low1, ELM *high1, } __cilkrts_pop_frame(&sf); - __cilkrts_leave_frame(&sf); + if (0 != sf.flags) +
__cilkrts_leave_frame(&sf); return; } @@ -421,7 +424,7 @@ void cilksort(ELM *low, ELM *tmp, long size) { return; } - alloca(ZERO); + dummy(alloca(ZERO)); __cilkrts_stack_frame sf; __cilkrts_enter_frame(&sf); @@ -479,7 +482,8 @@ void cilksort(ELM *low, ELM *tmp, long size) { cilkmerge(tmpA, tmpC - 1, tmpC, tmpA + size - 1, A); __cilkrts_pop_frame(&sf); - __cilkrts_leave_frame(&sf); + if (0 != sf.flags) + __cilkrts_leave_frame(&sf); return; } @@ -527,7 +531,7 @@ int usage(void) { const char *specifiers[] = {"-n", "-c", "-h", 0}; int opt_types[] = {LONGARG, BOOLARG, BOOLARG, 0}; -int cilk_main(int argc, char **argv) { +int main(int argc, char **argv) { long size; ELM *array, *tmp; @@ -553,7 +557,7 @@ int cilk_main(int argc, char **argv) { begin = ktiming_getmark(); cilksort(array, tmp, size); end = ktiming_getmark(); - elapsed[i] = ktiming_diff_usec(&begin, &end); + elapsed[i] = ktiming_diff_nsec(&begin, &end); } print_runtime(elapsed, TIMING_COUNT); diff --git a/handcomp_test/fib.c b/handcomp_test/fib.c index 0519cf03..6c2283b3 100644 --- a/handcomp_test/fib.c +++ b/handcomp_test/fib.c @@ -2,6 +2,7 @@ #include #include "../runtime/cilk2c.h" +#include "../runtime/cilk2c_inlined.c" #include "ktiming.h" @@ -31,6 +32,7 @@ int fib(int n) { */ extern size_t ZERO; +void __attribute__((weak)) dummy(void *p) { return; } static void __attribute__ ((noinline)) fib_spawn_helper(int *x, int n); @@ -40,7 +42,7 @@ int fib(int n) { if(n < 2) return n; - alloca(ZERO); + dummy(alloca(ZERO)); __cilkrts_stack_frame sf; __cilkrts_enter_frame(&sf); @@ -62,7 +64,8 @@ int fib(int n) { _tmp = x + y; __cilkrts_pop_frame(&sf); - __cilkrts_leave_frame(&sf); + if (0 != sf.flags) + __cilkrts_leave_frame(&sf); return _tmp; } @@ -77,7 +80,7 @@ static void __attribute__ ((noinline)) fib_spawn_helper(int *x, int n) { __cilkrts_leave_frame(&sf); } -int cilk_main(int argc, char * args[]) { +int main(int argc, char * args[]) { int i; int n, res; clockmark_t begin, end; @@ -94,7 +97,7 @@ int cilk_main(int argc, char * args[]) { begin = ktiming_getmark(); res = fib(n); end = ktiming_getmark(); - running_time[i] = ktiming_diff_usec(&begin, &end); + running_time[i] = ktiming_diff_nsec(&begin, &end); } printf("Result: %d\n", res); print_runtime(running_time, TIMING_COUNT); diff --git a/handcomp_test/ktiming.c b/handcomp_test/ktiming.c index e565ffa7..830dcf22 100644 --- a/handcomp_test/ktiming.c +++ b/handcomp_test/ktiming.c @@ -1,16 +1,16 @@ /** * Copyright (c) 2012 MIT License by 6.172 Staff - * + * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: - * + * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -31,18 +31,18 @@ #include "./ktiming.h" +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <time.h> -#define USEC_TO_SEC(x) ((double)x*1.0e-9) - +#define NSEC_TO_SEC(x) ((double)(x)*1.0e-9) clockmark_t ktiming_getmark(void) { - struct timespec temp; + struct timespec temp = {0, 0}; uint64_t nanos; - int stat = clock_gettime(CLOCK_MONOTONIC , &temp); + int stat = clock_gettime(CLOCK_MONOTONIC, &temp); if (stat != 0) { perror("ktiming_getmark()"); exit(-1); @@ -52,43 +52,44 @@ clockmark_t ktiming_getmark(void) { return nanos; } -uint64_t ktiming_diff_usec(const clockmark_t* const - start, const clockmark_t* const end) { +uint64_t ktiming_diff_nsec(const clockmark_t *const start, + const clockmark_t *const end) { return *end - *start; } -double ktiming_diff_sec(const clockmark_t* const start, - const clockmark_t* const end) { - return ((double)ktiming_diff_usec(start, end)) / 1000000000.0f; +double ktiming_diff_sec(const clockmark_t *const start, + const clockmark_t *const end) { + return NSEC_TO_SEC(ktiming_diff_nsec(start, end)); } -static void -print_runtime_helper(uint64_t *usec_elapsed, int size, int summary) { +static void print_runtime_helper(uint64_t *nsec_elapsed, int size, + int summary) { - int i; + int i; uint64_t total = 0; double ave, std_dev = 0, dev_sq_sum = 0; for (i = 0; i < size; i++) { - total += usec_elapsed[i]; - if(!summary) { - printf("Running time %d: %gs\n", (i + 1), USEC_TO_SEC(usec_elapsed[i])); + total += nsec_elapsed[i]; + if (!summary) { + printf("Running time %d: %gs\n", (i + 1), + NSEC_TO_SEC(nsec_elapsed[i])); } } ave = total / size; - - if( size > 1 ) { + + if (size > 1) { for (i = 0; i < size; i++) { - dev_sq_sum += ( (ave - (double)usec_elapsed[i]) * - (ave - (double)usec_elapsed[i]) ); + double diff = (double)nsec_elapsed[i] - ave; + dev_sq_sum += diff * diff; } - std_dev = dev_sq_sum / (size-1); + std_dev = sqrt(dev_sq_sum / (size - 1)); } - printf("Running time average: %g s\n", USEC_TO_SEC(ave)); - if( std_dev != 0 ) { - printf( "Std. dev: %g s (%2.3f%%)\n", - USEC_TO_SEC(std_dev), 100.0*USEC_TO_SEC(std_dev/ave) ); + printf("Running time average: %g s\n", NSEC_TO_SEC(ave)); + if (std_dev != 0) { + printf("Std.
dev: %g s (%2.3f%%)\n", NSEC_TO_SEC(std_dev), + 100.0 * std_dev / ave); } } @@ -99,4 +100,3 @@ void print_runtime(uint64_t *tm_elapsed, int size) { void print_runtime_summary(uint64_t *tm_elapsed, int size) { print_runtime_helper(tm_elapsed, size, 1); } - diff --git a/handcomp_test/ktiming.h b/handcomp_test/ktiming.h index 81089161..b5249f70 100644 --- a/handcomp_test/ktiming.h +++ b/handcomp_test/ktiming.h @@ -28,7 +28,7 @@ typedef uint64_t clockmark_t; uint64_t -ktiming_diff_usec(const clockmark_t* const start, const clockmark_t* const end); +ktiming_diff_nsec(const clockmark_t* const start, const clockmark_t* const end); double ktiming_diff_sec(const clockmark_t* const start, const clockmark_t* const end); clockmark_t ktiming_getmark(void); diff --git a/handcomp_test/mm_dac.c b/handcomp_test/mm_dac.c index aab5557c..40f10cb7 100644 --- a/handcomp_test/mm_dac.c +++ b/handcomp_test/mm_dac.c @@ -2,6 +2,7 @@ #include #include "../runtime/cilk2c.h" +#include "../runtime/cilk2c_inlined.c" #include "ktiming.h" #include "getoptions.h" @@ -15,6 +16,7 @@ #define FALSE 0 extern size_t ZERO; +void __attribute__((weak)) dummy(void *p) { return; } unsigned int randomSeed = 1; @@ -79,7 +81,7 @@ static void mm_dac(int *C, const int *A, const int *B, int n, int length) { return; } - alloca(ZERO); + dummy(alloca(ZERO)); __cilkrts_stack_frame sf; __cilkrts_enter_frame(&sf); @@ -156,7 +158,8 @@ static void mm_dac(int *C, const int *A, const int *B, int n, int length) { } __cilkrts_pop_frame(&sf); - __cilkrts_leave_frame(&sf); + if (0 != sf.flags) + __cilkrts_leave_frame(&sf); } __attribute__((noinline)) @@ -205,7 +208,7 @@ static void test_mm(int n, int check) { begin = ktiming_getmark(); mm_dac(C, A, B, n, n); end = ktiming_getmark(); - running_time[i] = ktiming_diff_usec(&begin, &end); + running_time[i] = ktiming_diff_nsec(&begin, &end); } print_runtime(running_time, TIMING_COUNT); @@ -236,7 +239,7 @@ static int is_power_of_2(int n) { const char *specifiers[] = {"-n", "-c", "-h", 0}; int opt_types[] = {LONGARG, BOOLARG, BOOLARG, 0}; -int cilk_main(int argc, char *argv[]) { +int main(int argc, char *argv[]) { long size; int help, check; diff --git a/handcomp_test/nqueens.c b/handcomp_test/nqueens.c index 37eb51e7..c84089af 100644 --- a/handcomp_test/nqueens.c +++ b/handcomp_test/nqueens.c @@ -7,9 +7,11 @@ #include #include "../runtime/cilk2c.h" +#include "../runtime/cilk2c_inlined.c" #include "ktiming.h" extern size_t ZERO; +void __attribute__((weak)) dummy(void *p) { return; } // int * count; @@ -66,7 +68,7 @@ static int nqueens(int n, int j, char *a) { count = (int *) alloca(n * sizeof(int)); (void) memset(count, 0, n * sizeof (int)); - alloca(ZERO); + dummy(alloca(ZERO)); __cilkrts_stack_frame sf; __cilkrts_enter_frame(&sf); @@ -104,7 +106,8 @@ static int nqueens(int n, int j, char *a) { } __cilkrts_pop_frame(&sf); - __cilkrts_leave_frame(&sf); + if (0 != sf.flags) + __cilkrts_leave_frame(&sf); return solNum; } @@ -120,7 +123,7 @@ nqueens_spawn_helper(int *count, int n, int j, char *a) { __cilkrts_leave_frame(&sf); } -int cilk_main(int argc, char *argv[]) { +int main(int argc, char *argv[]) { int n = 13; char *a; @@ -146,7 +149,7 @@ int cilk_main(int argc, char *argv[]) { begin = ktiming_getmark(); res = nqueens(n, 0, a); end = ktiming_getmark(); - elapsed[i] = ktiming_diff_usec(&begin, &end); + elapsed[i] = ktiming_diff_nsec(&begin, &end); } print_runtime(elapsed, TIMING_COUNT); #else diff --git a/include/cilk/cilk_api.h b/include/cilk/cilk_api.h index fa44d3b6..28cb2bf9 100644 --- 
a/include/cilk/cilk_api.h +++ b/include/cilk/cilk_api.h @@ -4,12 +4,25 @@ extern "C" { #endif +extern int __cilkrts_is_initialized(void); extern int __cilkrts_atinit(void (*callback)(void)); extern int __cilkrts_atexit(void (*callback)(void)); extern unsigned __cilkrts_get_nworkers(void); extern unsigned __cilkrts_get_worker_number(void) __attribute__((deprecated)); struct __cilkrts_worker *__cilkrts_get_tls_worker(void); + +#if defined(__cilk_pedigrees__) || defined(ENABLE_CILKRTS_PEDIGREE) +#include <stdint.h> +typedef struct __cilkrts_pedigree { + uint64_t rank; + struct __cilkrts_pedigree *parent; +} __cilkrts_pedigree; +extern __cilkrts_pedigree __cilkrts_get_pedigree(void); +extern void __cilkrts_bump_worker_rank(void); +extern uint64_t __cilkrts_get_dprand(void); +#endif // defined(__cilk_pedigrees__) || defined(ENABLE_CILKRTS_PEDIGREE) + #undef VISIBILITY #ifdef __cplusplus diff --git a/include/cilk/holder.h b/include/cilk/holder.h index 9db0e302..80bd79da 100644 --- a/include/cilk/holder.h +++ b/include/cilk/holder.h @@ -542,7 +542,6 @@ namespace cilk { { // Called only by delete_self, which deleted the exemplar using an // allocator. - __CILKRTS_ASSERT(0 == m_exemplar); } template @@ -699,7 +698,6 @@ namespace cilk { { // Called only by delete_self, which deleted the functor using an // allocator. - __CILKRTS_ASSERT(0 == m_functor); } template @@ -917,7 +915,6 @@ namespace cilk { /// Return a pointer to size bytes of raw memory void* allocate(std::size_t s) const { - __CILKRTS_ASSERT(sizeof(Type) == s); return m_allocator.allocate(1); } @@ -931,7 +928,6 @@ namespace cilk { } void swap(holder_monoid& other) { - __CILKRTS_ASSERT(m_allocator == other.m_allocator); std::swap(m_initializer, other.m_initializer); } diff --git a/include/cilk/hyperobject_base.h b/include/cilk/hyperobject_base.h index 89858256..006aa721 100644 --- a/include/cilk/hyperobject_base.h +++ b/include/cilk/hyperobject_base.h @@ -9,13 +9,15 @@ extern "C" { #endif +struct __cilkrts_hyperobject_base; + /* Callback function signatures. The first argument always points to the * reducer itself and is commonly ignored. */ typedef void (*cilk_reduce_fn_t)(void *r, void *lhs, void *rhs); typedef void (*cilk_identity_fn_t)(void *r, void *view); typedef void (*cilk_destroy_fn_t)(void *r, void *view); -typedef void *(*cilk_allocate_fn_t)(void *r, size_t bytes); -typedef void (*cilk_deallocate_fn_t)(void *r, void *view); +typedef void *(*cilk_allocate_fn_t)(struct __cilkrts_hyperobject_base *r, size_t bytes); +typedef void (*cilk_deallocate_fn_t)(struct __cilkrts_hyperobject_base *r, void *view); /** Representation of the monoid */ typedef struct cilk_c_monoid { @@ -38,9 +40,12 @@ typedef struct __cilkrts_hyperobject_base { TODO: Add optimization hints like "strand pure" as in Cilk Plus.
*/ void __cilkrts_hyper_create(__cilkrts_hyperobject_base *key); void __cilkrts_hyper_destroy(__cilkrts_hyperobject_base *key); +#if defined __clang__ && defined __cilk && __cilk >= 300 +__attribute__((strand_pure, strand_malloc)) +#endif void *__cilkrts_hyper_lookup(__cilkrts_hyperobject_base *key); -void *__cilkrts_hyper_alloc(void *ignore, size_t bytes); -void __cilkrts_hyper_dealloc(void *ignore, void *view); +void *__cilkrts_hyper_alloc(__cilkrts_hyperobject_base *key, size_t bytes); +void __cilkrts_hyper_dealloc(__cilkrts_hyperobject_base *key, void *view); #ifdef __cplusplus } /* end extern "C" */ diff --git a/include/cilk/reducer_list.h b/include/cilk/reducer_list.h index d2058658..73ff2247 100644 --- a/include/cilk/reducer_list.h +++ b/include/cilk/reducer_list.h @@ -588,8 +588,6 @@ class op_list_append_view : public internal::list_view_base */ void reduce(op_list_append_view* right) { - __CILKRTS_ASSERT( - this->m_value.get_allocator() == right->m_value.get_allocator()); this->m_value.splice(end(), right->m_value); } }; @@ -737,8 +735,6 @@ class op_list_prepend_view : public internal::list_view_base */ void reduce(op_list_prepend_view* right) { - __CILKRTS_ASSERT( - this->m_value.get_allocator() == right->m_value.get_allocator()); this->m_value.splice(begin(), right->m_value); } }; diff --git a/include/cilk/reducer_min_max.h b/include/cilk/reducer_min_max.h index 266f03f2..947dad09 100644 --- a/include/cilk/reducer_min_max.h +++ b/include/cilk/reducer_min_max.h @@ -1090,7 +1090,6 @@ template class rhs_proxy { // Checks matching view, then return value (called from view_base::assign). value_type value(const typename View::base *view) const { - __CILKRTS_ASSERT(view == m_view); return m_value; } diff --git a/include/cilk/reducer_opadd.h b/include/cilk/reducer_opadd.h index ead0fd21..832ae356 100644 --- a/include/cilk/reducer_opadd.h +++ b/include/cilk/reducer_opadd.h @@ -349,7 +349,6 @@ class op_add_view : public scalar_view * @see rhs_proxy */ op_add_view& operator=(const rhs_proxy& rhs) { - __CILKRTS_ASSERT(this == rhs.m_view); this->m_value += rhs.m_value; return *this; } diff --git a/include/cilk/reducer_opand.h b/include/cilk/reducer_opand.h index 8daa332d..46c39fcd 100644 --- a/include/cilk/reducer_opand.h +++ b/include/cilk/reducer_opand.h @@ -282,7 +282,6 @@ class op_and_view : public scalar_view * @see rhs_proxy */ op_and_view& operator=(const rhs_proxy& rhs) { - __CILKRTS_ASSERT(this == rhs.m_view); this->m_value &= rhs.m_value; return *this; } diff --git a/include/cilk/reducer_opmul.h b/include/cilk/reducer_opmul.h index 8ac3e693..d2139b24 100644 --- a/include/cilk/reducer_opmul.h +++ b/include/cilk/reducer_opmul.h @@ -291,7 +291,6 @@ class op_mul_view : public scalar_view * @see rhs_proxy */ op_mul_view& operator=(const rhs_proxy& rhs) { - __CILKRTS_ASSERT(this == rhs.m_view); this->m_value *= rhs.m_value; return *this; } diff --git a/include/cilk/reducer_opor.h b/include/cilk/reducer_opor.h index bde361cf..20ae2d1e 100644 --- a/include/cilk/reducer_opor.h +++ b/include/cilk/reducer_opor.h @@ -279,7 +279,6 @@ class op_or_view : public scalar_view * @see rhs_proxy */ op_or_view& operator=(const rhs_proxy& rhs) { - __CILKRTS_ASSERT(this == rhs.m_view); this->m_value |= rhs.m_value; return *this; } diff --git a/include/cilk/reducer_opxor.h b/include/cilk/reducer_opxor.h index 3d33a49b..2e724a5c 100644 --- a/include/cilk/reducer_opxor.h +++ b/include/cilk/reducer_opxor.h @@ -278,7 +278,6 @@ class op_xor_view : public scalar_view * @see rhs_proxy */ op_xor_view& 
operator=(const rhs_proxy& rhs) { - __CILKRTS_ASSERT(this == rhs.m_view); this->m_value ^= rhs.m_value; return *this; } diff --git a/reducer_bench/Makefile b/reducer_bench/Makefile index 35ca1b8b..49473782 100644 --- a/reducer_bench/Makefile +++ b/reducer_bench/Makefile @@ -42,29 +42,29 @@ $(DIRTESTS): check: $(MAKE) TIMING_COUNT=5 $(TOPASS) - ./intlist --cheetah-nproc 2 40000000 - ./serialsum --cheetah-nproc 1 200000000 - ./intsum --cheetah-nproc 1 200000000 - ./intsum --cheetah-nproc $(MANY) 200000000 - ./multispawnsum --cheetah-nproc 2 100000000 - ./cppsum --cheetah-nproc 2 200000000 + CILK_NWORKERS=2 ./intlist 40000000 + CILK_NWORKERS=1 ./serialsum 200000000 + CILK_NWORKERS=1 ./intsum 200000000 + CILK_NWORKERS=$(MANY) ./intsum 200000000 + CILK_NWORKERS=2 ./multispawnsum 100000000 + CILK_NWORKERS=2 ./cppsum 200000000 $(MAKE) -C nqueens check $(TOPASS) if $(ENABLE_X11); then $(MAKE) -C quad_tree check $(TOPASS) ; else : ; fi stress: $(MAKE) TIMING_COUNT=5 $(TOPASS) - ./intlist --cheetah-nproc $(MANY) 40000000 - ./intsum --cheetah-nproc $(MANY) 200000000 + CILK_NWORKERS=$(MANY) ./intlist 40000000 + CILK_NWORKERS=$(MANY) ./intsum 200000000 if $(ENABLE_X11); then $(MAKE) -C quad_tree check $(TOPASS) ; else : ; fi # Assertion failure: SPA resize not supported yet! -# ./repeatedintsum --cheetah-nproc $(MANY) 10000000 +# CILK_NWORKERS=$(MANY) ./repeatedintsum 10000000 #redcheck: # $(MAKE) clean; $(MAKE) TIMING_COUNT=1 $(TOPASS) > /dev/null 2>&1 -# ./intlist --cheetah-nproc 2 2048 +# CILK_NWORKERS=2 ./intlist 2048 #redgdb: # $(MAKE) clean; $(MAKE) TIMING_COUNT=1 $(TOPASS) > /dev/null 2>&1 -# gdb --args ./intlist --cheetah-nproc 2 2048 +# CILK_NWORKERS=2 gdb --args ./intlist 2048 clean: rm -f *.o *~ $(CTESTS) $(CXXTESTS) core.* diff --git a/reducer_bench/intsum.c b/reducer_bench/intsum.c index bedb799a..dedb2552 100644 --- a/reducer_bench/intsum.c +++ b/reducer_bench/intsum.c @@ -22,6 +22,7 @@ void compute_sum(long limit, int scale) { } void test_reducer(long limit) { + #pragma GCC diagnostic ignored "-Wpass-failed" for (int t = 1; t < 100; ++t) { cilk_spawn compute_sum(limit, t); } diff --git a/reducer_bench/nqueens/Makefile b/reducer_bench/nqueens/Makefile index 6505ac9d..6518bc3c 100644 --- a/reducer_bench/nqueens/Makefile +++ b/reducer_bench/nqueens/Makefile @@ -2,17 +2,17 @@ include ../../config.mk ABI=opencilk -OPTIONS = -g -Wall $(OPT) -ftapir=$(ABI) +OPTIONS = $(OPT) $(DBG) $(ARCH) -Wall TIMING_COUNT := 1 -CFLAGS := -std=gnu99 -ftapir -Wall - ifneq ($(DEBUG),1) OPTIONS += -DNDEBUG endif -LDFLAGS := -lrt -lm -lcilkrts +CFLAGS := -std=gnu99 -fopencilk $(OPTIONS) + +LDFLAGS := -lrt -lm RTS_LIBS = @@ -28,12 +28,12 @@ nqueens: board.o nqueens.o ktiming.o # $(CC) -o queens board.o queens.o $(LDFLAGS) -%.o: %.c - $(CC) -c $(OPTIONS) -DTIMING_COUNT=$(TIMING_COUNT) -o $@ $< +#%.o: %.c +# $(CC) -c $(OPTIONS) -DTIMING_COUNT=$(TIMING_COUNT) -o $@ $< check: nqueens_int nqueens - ./nqueens_int --nproc 2 - ./nqueens --nproc 2 + CILK_NWORKERS=2 ./nqueens_int + CILK_NWORKERS=2 ./nqueens clean: rm -f nqueens_int nqueens *.o *~ core.* diff --git a/reducer_bench/nqueens/nqueens.c b/reducer_bench/nqueens/nqueens.c index 0c7abb33..57b6c11e 100644 --- a/reducer_bench/nqueens/nqueens.c +++ b/reducer_bench/nqueens/nqueens.c @@ -131,17 +131,21 @@ int run_queens(bool verbose) { } } CILK_C_UNREGISTER_REDUCER(X); + delete_nodes(&board_list); return num_solutions; } int main(int argc, char *argv[]) { - int i; clockmark_t begin, end; uint64_t running_time[TIMING_COUNT]; + int count = argc > 1 ? 
atoi(argv[1]) : -1; + if (count <= 0 || count > TIMING_COUNT) + count = TIMING_COUNT; + int num_solutions = 92, res = 0; - for (i = 0; i < TIMING_COUNT; i++) { + for (int i = 0; i < count; i++) { begin = ktiming_getmark(); int run_solutions = run_queens(false); @@ -149,13 +153,12 @@ end = ktiming_getmark(); running_time[i] = ktiming_diff_usec(&begin, &end); } - if (res == TIMING_COUNT) { + if (res == count) { printf("Success\n"); } else { - printf("Result: %d/%d successes (%d failures)\n", res, TIMING_COUNT, - TIMING_COUNT - res); + printf("Result: %d/%d successes (%d failures)\n", res, count, count - res); } - print_runtime(running_time, TIMING_COUNT); - return res != TIMING_COUNT; + print_runtime(running_time, count); + return res != count; } diff --git a/reducer_bench/quad_tree/Makefile b/reducer_bench/quad_tree/Makefile index b2a2c394..b729a290 100644 --- a/reducer_bench/quad_tree/Makefile +++ b/reducer_bench/quad_tree/Makefile @@ -61,7 +61,7 @@ screensaver: $(OBJS) graphic_stuff.o # $(CXX) $(CXXFLAGS) $(EXTRA_CXXFLAGS) -o $@ -c $< check: screensaver - ./screensaver --nproc 2 1000 + CILK_NWORKERS=2 ./screensaver 1000 clean: rm -f screensaver *.o *.out *~ core.* diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 43d0707a..5c185dea 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -4,23 +4,31 @@ set(CHEETAH_LIB_CMAKEFILES_DIR "${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}") set(CHEETAH_SOURCES c_reducers.c cilk2c.c + cilk2c_inlined.c cilkred_map.c closure.c - cmdline.c debug.c fiber.c fiber-pool.c global.c init.c internal-malloc.c - invoke-main.c mutex.c personality.c readydeque.c reducer_impl.c sched_stats.c scheduler.c - ZERO.c +) + +# We assume there is just one source file to compile for the cheetah +# ABI. +set(CHEETAH_ABI_SOURCE + cilk2c_inlined.c +) + +set(CHEETAH_PEDIGREE_GLOBALS_SOURCES + pedigree_globals.c ) set(CHEETAH_PERSONALITY_C_SOURCES @@ -42,9 +50,7 @@ else() endif() add_flags_if_supported(-g3) -add_flags_if_supported(-mavx) -add_flags_if_supported(-march=sandybridge) -add_flags_if_supported(-fPIC) +add_flags_if_supported(-Wno-covered-switch-default) if (CHEETAH_HAS_FOMIT_FRAME_POINTER_FLAG) set_source_files_properties(invoke-main.c PROPERTIES COMPILE_FLAGS -fno-omit-frame-pointer) endif() @@ -56,8 +62,28 @@ list(APPEND CHEETAH_COMPILE_DEFS OPENCILK_LIBRARY) set(CHEETAH_DEBUG_OPTIONS -Og) set(CHEETAH_RELEASE_OPTIONS -O3) +# Setup flags and defs for cheetah bitcode ABI build +set(CHEETAH_BITCODE_ABI_COMPILE_FLAGS ${CHEETAH_COMPILE_FLAGS} -emit-llvm) +set(CHEETAH_BITCODE_ABI_COMPILE_DEFS ${CHEETAH_COMPILE_DEFS} + "CHEETAH_API=" + "CHEETAH_INTERNAL=" + "CHEETAH_INTERNAL_NORETURN=__attribute__((noreturn))" + "CILK_DEBUG=0") +set(CHEETAH_BITCODE_PEDIGREE_ABI_COMPILE_DEFS ${CHEETAH_BITCODE_ABI_COMPILE_DEFS} + "ENABLE_CILKRTS_PEDIGREE=1") + +# Add compile flags for Cheetah-runtime compilation that should be +# excluded from bitcode compilation +if (CHEETAH_HAS_MAVX_FLAG) + list(APPEND CHEETAH_COMPILE_FLAGS -mavx) +endif() +if (CHEETAH_HAS_MARCH_SANDYBRIDGE_FLAG) + list(APPEND CHEETAH_COMPILE_FLAGS -march=sandybridge) +endif() + split_list(CHEETAH_COMPILE_FLAGS) split_list(CHEETAH_LINK_FLAGS) +split_list(CHEETAH_BITCODE_ABI_COMPILE_FLAGS) # Build the shared library.
if (CHEETAH_ENABLE_SHARED) @@ -90,6 +116,7 @@ if (CHEETAH_ENABLE_SHARED) OUTPUT_NAME "opencilk-personality-c" ) target_compile_options(cheetah_personality_c_shared PUBLIC "$<$<CONFIG:Debug>:${CHEETAH_DEBUG_OPTIONS}>") + target_compile_options(cheetah_personality_c_shared PUBLIC "$<$<CONFIG:Release>:${CHEETAH_RELEASE_OPTIONS}>") list(APPEND CHEETAH_BUILD_TARGETS "cheetah_personality_c_shared") if (CHEETAH_INSTALL_SHARED_LIBRARY) list(APPEND CHEETAH_INSTALL_TARGETS "cheetah_personality_c_shared") endif() @@ -105,11 +132,30 @@ if (CHEETAH_ENABLE_SHARED) OUTPUT_NAME "opencilk-personality-cpp" ) target_compile_options(cheetah_personality_cpp_shared PUBLIC "$<$<CONFIG:Debug>:${CHEETAH_DEBUG_OPTIONS}>") + target_compile_options(cheetah_personality_cpp_shared PUBLIC "$<$<CONFIG:Release>:${CHEETAH_RELEASE_OPTIONS}>") list(APPEND CHEETAH_BUILD_TARGETS "cheetah_personality_cpp_shared") if (CHEETAH_INSTALL_SHARED_LIBRARY) list(APPEND CHEETAH_INSTALL_TARGETS "cheetah_personality_cpp_shared") endif() + add_library(cheetah_pedigree_globals_shared SHARED ${CHEETAH_PEDIGREE_GLOBALS_SOURCES}) + target_link_libraries(cheetah_pedigree_globals_shared ${CHEETAH_LIBRARIES}) + set_target_properties(cheetah_pedigree_globals_shared + PROPERTIES + COMPILE_FLAGS "${CHEETAH_COMPILE_FLAGS}" + COMPILE_DEFINITIONS "${CHEETAH_COMPILE_DEFS}" + LINK_FLAGS "${CHEETAH_LINK_FLAGS}" + OUTPUT_NAME "opencilk-pedigrees" + VERSION "${CHEETAH_ABI_VERSION}.0" + SOVERSION "${CHEETAH_ABI_VERSION}" + ) + target_compile_options(cheetah_pedigree_globals_shared PUBLIC "$<$<CONFIG:Debug>:${CHEETAH_DEBUG_OPTIONS}>") + target_compile_options(cheetah_pedigree_globals_shared PUBLIC "$<$<CONFIG:Release>:${CHEETAH_RELEASE_OPTIONS}>") + list(APPEND CHEETAH_BUILD_TARGETS "cheetah_pedigree_globals_shared") + if (CHEETAH_INSTALL_SHARED_LIBRARY) + list(APPEND CHEETAH_INSTALL_TARGETS "cheetah_pedigree_globals_shared") + endif() + endif() # Build the static library. @@ -142,6 +188,7 @@ if (CHEETAH_ENABLE_STATIC) OUTPUT_NAME "opencilk-personality-c" ) target_compile_options(cheetah_personality_c_static PUBLIC "$<$<CONFIG:Debug>:${CHEETAH_DEBUG_OPTIONS}>") + target_compile_options(cheetah_personality_c_static PUBLIC "$<$<CONFIG:Release>:${CHEETAH_RELEASE_OPTIONS}>") list(APPEND CHEETAH_BUILD_TARGETS "cheetah_personality_c_static") if (CHEETAH_INSTALL_SHARED_LIBRARY) list(APPEND CHEETAH_INSTALL_TARGETS "cheetah_personality_c_static") @@ -157,17 +204,95 @@ if (CHEETAH_ENABLE_STATIC) OUTPUT_NAME "opencilk-personality-cpp" ) target_compile_options(cheetah_personality_cpp_static PUBLIC "$<$<CONFIG:Debug>:${CHEETAH_DEBUG_OPTIONS}>") + target_compile_options(cheetah_personality_cpp_static PUBLIC "$<$<CONFIG:Release>:${CHEETAH_RELEASE_OPTIONS}>") list(APPEND CHEETAH_BUILD_TARGETS "cheetah_personality_cpp_static") if (CHEETAH_INSTALL_SHARED_LIBRARY) list(APPEND CHEETAH_INSTALL_TARGETS "cheetah_personality_cpp_static") endif() endif() +# Build the Cheetah ABI as LLVM bitcode +if (CHEETAH_ENABLE_BITCODE_ABI AND (CMAKE_C_COMPILER_ID STREQUAL "Clang")) + set(CHEETAH_ABI_BITCODE_FILE "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopencilk-abi.bc") + # Commands to compile the source files for the ABI into bitcode. + add_library(cheetah_abi_bc_compile OBJECT ${CHEETAH_ABI_SOURCE}) + + set_target_properties(cheetah_abi_bc_compile + PROPERTIES + COMPILE_FLAGS "${CHEETAH_BITCODE_ABI_COMPILE_FLAGS}" + COMPILE_DEFINITIONS "${CHEETAH_BITCODE_ABI_COMPILE_DEFS}" + ) + target_compile_options(cheetah_abi_bc_compile PUBLIC "$<$<CONFIG:Debug>:${CHEETAH_DEBUG_OPTIONS}>") + target_compile_options(cheetah_abi_bc_compile PUBLIC "$<$<CONFIG:Release>:${CHEETAH_RELEASE_OPTIONS}>") + + # Custom command to generate the ABI bitcode file.
We assume that + # cheetah_abi_bc_compile generates just one output, so there's + # nothing to link together. + add_custom_command(OUTPUT ${CHEETAH_ABI_BITCODE_FILE} + DEPENDS cheetah_abi_bc_compile ${CHEETAH_ABI_SOURCE} + COMMAND cp $<TARGET_OBJECTS:cheetah_abi_bc_compile> ${CHEETAH_ABI_BITCODE_FILE} + COMMENT "Generating ${CHEETAH_ABI_BITCODE_FILE}" + VERBATIM + ) + + # Add a top-level custom target to drive the creation of the ABI + # bitcode file. + add_custom_target(cheetah_abi_bc ALL DEPENDS ${CHEETAH_ABI_BITCODE_FILE}) + list(APPEND CHEETAH_BUILD_TARGETS "cheetah_abi_bc") + # Because the ABI bitcode file is a custom target, we cannot use + # CMake's install(TARGETS) to install it. Instead, we use + # install(FILES) to install the bitcode ABI file. + if (CHEETAH_INSTALL_BITCODE_ABI) + list(APPEND CHEETAH_INSTALL_FILES ${CHEETAH_ABI_BITCODE_FILE}) + endif() +endif() + +# Build the Cheetah ABI as LLVM bitcode (for pedigrees) +if (CHEETAH_ENABLE_BITCODE_ABI AND (CMAKE_C_COMPILER_ID STREQUAL "Clang")) + set(CHEETAH_PEDIGREE_ABI_BITCODE_FILE "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopencilk-pedigrees-abi.bc") + # Commands to compile the source files for the ABI into bitcode. + add_library(cheetah_pedigree_abi_bc_compile OBJECT ${CHEETAH_ABI_SOURCE}) + + set_target_properties(cheetah_pedigree_abi_bc_compile + PROPERTIES + COMPILE_FLAGS "${CHEETAH_BITCODE_ABI_COMPILE_FLAGS}" + COMPILE_DEFINITIONS "${CHEETAH_BITCODE_PEDIGREE_ABI_COMPILE_DEFS}" + ) + target_compile_options(cheetah_pedigree_abi_bc_compile PUBLIC "$<$<CONFIG:Debug>:${CHEETAH_DEBUG_OPTIONS}>") + target_compile_options(cheetah_pedigree_abi_bc_compile PUBLIC "$<$<CONFIG:Release>:${CHEETAH_RELEASE_OPTIONS}>") + + # Custom command to generate the ABI bitcode file. We assume that + # cheetah_pedigree_abi_bc_compile generates just one output, so there's + # nothing to link together. + add_custom_command(OUTPUT ${CHEETAH_PEDIGREE_ABI_BITCODE_FILE} + DEPENDS cheetah_pedigree_abi_bc_compile ${CHEETAH_ABI_SOURCE} + COMMAND cp $<TARGET_OBJECTS:cheetah_pedigree_abi_bc_compile> ${CHEETAH_PEDIGREE_ABI_BITCODE_FILE} + COMMENT "Generating ${CHEETAH_PEDIGREE_ABI_BITCODE_FILE}" + VERBATIM + ) + + # Add a top-level custom target to drive the creation of the ABI + # bitcode file. + add_custom_target(cheetah_pedigree_abi_bc ALL DEPENDS ${CHEETAH_PEDIGREE_ABI_BITCODE_FILE}) + list(APPEND CHEETAH_BUILD_TARGETS "cheetah_pedigree_abi_bc") + # Because the ABI bitcode file is a custom target, we cannot use + # CMake's install(TARGETS) to install it. Instead, we use + # install(FILES) to install the bitcode ABI file. + if (CHEETAH_INSTALL_BITCODE_ABI) + list(APPEND CHEETAH_INSTALL_FILES ${CHEETAH_PEDIGREE_ABI_BITCODE_FILE}) + endif() +endif() + + + if (CHEETAH_INSTALL_LIBRARY) install(TARGETS ${CHEETAH_INSTALL_TARGETS} LIBRARY DESTINATION ${CHEETAH_LIBRARY_INSTALL_DIR} COMPONENT cheetah ARCHIVE DESTINATION ${CHEETAH_LIBRARY_INSTALL_DIR} COMPONENT cheetah ) + install(FILES ${CHEETAH_INSTALL_FILES} + DESTINATION ${CHEETAH_LIBRARY_INSTALL_DIR} COMPONENT cheetah + ) endif() # Add a meta-target for both libraries.
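
Aside on the new pedigree interface (illustrative sketch, not part of the patch): once the pedigree-enabled runtime (opencilk-pedigrees) and its bitcode ABI are built as above, client code can query pedigrees through the declarations this patch adds to include/cilk/cilk_api.h. The sketch below assumes ENABLE_CILKRTS_PEDIGREE is defined when compiling (or a compiler that defines __cilk_pedigrees__) and that the program links against the pedigree-enabled runtime; the function name print_current_pedigree is hypothetical.

    /* Sketch: walk the pedigree of the current strand.  The struct returned
     * by __cilkrts_get_pedigree() is the deepest pedigree term; enclosing
     * terms are reached through the parent pointers. */
    #include <stdio.h>
    #include <cilk/cilk_api.h>

    static void print_current_pedigree(void) {
        __cilkrts_pedigree ped = __cilkrts_get_pedigree();
        for (const __cilkrts_pedigree *p = &ped; p != NULL; p = p->parent)
            printf("%llu ", (unsigned long long)p->rank);
        printf("\n");
        __cilkrts_bump_worker_rank(); /* advance the rank of the current strand */
    }
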
diff --git a/runtime/Makefile b/runtime/Makefile index ae9c8e3c..764ba0ab 100644 --- a/runtime/Makefile +++ b/runtime/Makefile @@ -6,10 +6,11 @@ REDUCER_DEF = -DREDUCER_MODULE #ALERT_DEF = -DALERT_LVL=0x000 MAIN = $(RTS_LIB) +BITCODE_ABI = $(MAIN)-abi.bc SRCS = $(filter-out $(PERSON_C_SRC).c, $(filter-out $(PERSON_CPP_SRC).c, $(wildcard *.c))) HDRS = $(wildcard *.h) OBJS = $(patsubst %.c,./build/%.o,$(SRCS)) -INCLUDES = +INCLUDES = -I../include/ DEFINES = $(REDUCER_DEF) $(ABI_DEF) $(ALERT_DEF) OPTIONS = $(OPT) $(DBG) $(ARCH) -Werror -Wall -fpic $(DEFINES) $(INCLUDES) @@ -19,8 +20,8 @@ PERSON_C_SRC = personality-c PERSON_CPP_SRC = personality-cpp .PHONY: all clean build - -all: build $(MAIN).a $(MAIN).so $(PERSON_C).a $(PERSON_C).so $(PERSON_CPP).a $(PERSON_CPP).so + +all: build $(MAIN).a $(MAIN).so $(PERSON_C).a $(PERSON_C).so $(PERSON_CPP).a $(PERSON_CPP).so $(BITCODE_ABI) $(MAIN).a: $(OBJS) ar rcs $@ $^ @@ -46,5 +47,19 @@ build: build/%.o: %.c $(HDRS) $(CC) -c $(OPTIONS) -o $@ $< + +build/pedigree_globals.a: build/pedigree_globals.o + ar rcs $@ $^ + +build/libpedigree_globals.so: build/pedigree_globals.o + $(CC) -shared -o $@ $^ + +build/cilk2c_inlined.bc: cilk2c_inlined.c $(HDRS) + $(CC) -O3 -DCHEETAH_API="" -DCHEETAH_INTERNAL_NORETURN='__attribute__((noreturn))' -DCHEETAH_INTERNAL="" -DCILK_DEBUG=0 -DENABLE_CILKRTS_PEDIGREE=1 -c -emit-llvm $(INCLUDES) -o $@ $< + +$(BITCODE_ABI) : build/cilk2c_inlined.bc + cp $< $@ + clean: - rm -f $(OBJS) *.a *.so *~ + rm -f $(OBJS) $(BITCODE_ABI) *.a *.so *~ + rm -f build/cilk2c_inlined.bc diff --git a/runtime/TODOS.txt b/runtime/TODOS.txt index 28a51ee3..02dac39f 100644 --- a/runtime/TODOS.txt +++ b/runtime/TODOS.txt @@ -1,3 +1,62 @@ +Questions for TB / John: + +Q: Where are __default_cilkrts_startup and __default_cilkrts_shutdown inserted? + Are they inserted by the compiler as global constructors / destructors? +A: No, __default_cilkrts_startup/shutdown are not implicitly inserted by the +compiler. Because those functions are marked with __attribute__((constructor))/ +__attribute__((destructor)), the linker and loader know to run those functions +before/after the execution of main (just standard linker / loader features). + +Q: Are __cilkrts_atinit and __cilkrts_atexit currently used? +A: No. Plan to remove. + +Q: Using CMake in the runtime didn't work for me. + +Q: Why is there a __cilk_personality_c_v0 and a __cilk_personality_cpp_v0? +They both seem to call __cilk_personality_internal anyway? + +Q: There are various ifdefs used: CPU_SETSIZE, _SC_NPROCESSORS_ONLN, CHEETAH_SAVE_MXCSR. +Assume the compiler defines these: __i386__, __BSD__, __SSE__, __AVX__ + +Q: Is CHECK_CILK_FRAME_MAGIC still necessary now that we have moved to the +bitcode-based approach? + +Q: What is the purpose of #undef Closure_assert_ownership / Closure_assert_alienation? + +Q: Why do we need worker_state_change? Isn't that similar to the sched_state code? + +Q: The cilkified thread, after being woken up by the last worker leaving the cilkified +region, will execute leave_frame and Cilk_set_return ... why is this OK? + +Q: Do we want to improve the support for reducers? I.e., remove the cap limit and +also incur less overhead if the lookup succeeds (need to inline lookup in the +reducer header file), also make the vinfo and log array part of the cilkred_map +and make the map itself a linked list of pages (4K each) + +Q: In reducer_impl.c, there is a comment that says: +"Special case, if left is leftmost, then always merge into it. For C reducers this forces lazy creation of the leftmost views."
+What's that comment about? (is_leftmost always returns false.) +If the leftmost is just a pointer anyway, why does it matter? + +Q: Besides LONG_BIT, what else do we need from GNU_SOURCE? Should +we depend on GNU_SOURCE? (How does that work on Mac / Windows?) +(Maybe also __builtin_ctzl?) + +NOTE: +- Remove invoke-main.c entirely (or 'make' won't work) +- Remove cmdline.c entirely (deprecating cmdline arg) +- Currently need to copy over libopencilk-abi.bc into the build ... ideally +it would be nice to provide a compiler flag to specify the right ABI +- Tim is working on Pedigree + + +Check: +- Does __cilkrts_check_exception_raise only raise the exception from +a frame where the exception will be handled? (It retrieves t->user_exn +and raises that.) No, it doesn't seem so. It seems that this is always +called after a successful non-trivial sync. + - Change max_fibers to per-worker max fibers - Rethink fiber stats - Incorporate the timing stats into scheduler diff --git a/runtime/ZERO.c b/runtime/ZERO.c deleted file mode 100644 index 5e2cf9d4..00000000 --- a/runtime/ZERO.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "cilk2c.h" - -unsigned long cilkrts_zero = 0; diff --git a/runtime/cilk-internal.h b/runtime/cilk-internal.h index 6dd429ac..f4519e83 100644 --- a/runtime/cilk-internal.h +++ b/runtime/cilk-internal.h @@ -26,15 +26,12 @@ typedef struct local_state local_state; // Cilk stack frame related defs //=============================================== + + /** * Every spawning function has a frame descriptor. A spawning function * is a function that spawns or detaches. Only spawning functions * are visible to the Cilk runtime. - * - * NOTE: if you are using the Tapir compiler, you should not change - * these fields; ok to change for hand-compiled code. - * See Tapir compiler ABI: - * https://github.com/wsmoses/Tapir-LLVM/blob/cilkr/lib/Transforms/Tapir/CilkRABI.cpp */ struct __cilkrts_stack_frame { // Flags is a bitfield with values defined below. Client code @@ -52,7 +49,7 @@ struct __cilkrts_stack_frame { // The client copies the worker from TLS here when initializing // the structure. The runtime ensures that the field always points // to the __cilkrts_worker which currently "owns" the frame. - __cilkrts_worker *worker; + _Atomic(__cilkrts_worker *) worker; // Before every spawn and nontrivial sync the client function // saves its continuation here. @@ -71,6 +68,13 @@ struct __cilkrts_stack_frame { uint32_t reserved1; #endif #endif + +#ifdef ENABLE_CILKRTS_PEDIGREE + __cilkrts_pedigree pedigree; // Fields for pedigrees. + int64_t rank; + uint64_t dprng_dotproduct; + int64_t dprng_depth; +#endif }; //=========================================================== @@ -110,8 +114,30 @@ struct __cilkrts_stack_frame { // function.
#define CILK_FRAME_SYNC_READY 0x200 -#define GET_CILK_FRAME_MAGIC(F) ((F)->magic) -#define CHECK_CILK_FRAME_MAGIC(G, F) ((G)->frame_magic == (F)->magic) +static const uint32_t frame_magic = + ((((((((((((__CILKRTS_ABI_VERSION * 13) + + offsetof(struct __cilkrts_stack_frame, worker)) * + 13) + + offsetof(struct __cilkrts_stack_frame, ctx)) * + 13) + + offsetof(struct __cilkrts_stack_frame, magic)) * + 13) + + offsetof(struct __cilkrts_stack_frame, flags)) * + 13) + + offsetof(struct __cilkrts_stack_frame, call_parent)) +#if defined __i386__ || defined __x86_64__ + * 13) +#ifdef __SSE__ + + offsetof(struct __cilkrts_stack_frame, mxcsr)) +#else + + offsetof(struct __cilkrts_stack_frame, reserved1)) +#endif +#else + )) +#endif + ; + +#define CHECK_CILK_FRAME_MAGIC(G, F) (frame_magic == (F)->magic) //=========================================================== // Helper functions for the flags field in cilkrts_stack_frame @@ -167,21 +193,6 @@ enum __cilkrts_worker_state { WORKER_RUN }; -struct local_state { - __cilkrts_stack_frame **shadow_stack; - - unsigned short state; /* __cilkrts_worker_state */ - bool lock_wait; - bool provably_good_steal; - unsigned int rand_next; - - jmpbuf rts_ctx; - struct cilk_fiber_pool fiber_pool; - struct cilk_im_desc im_desc; - struct cilk_fiber *fiber_to_free; - struct sched_stats stats; -}; - /** * NOTE: if you are using the Tapir compiler, you should not change * these fields; ok to change for hand-compiled code. diff --git a/runtime/cilk2c.c b/runtime/cilk2c.c index 5933a56e..353472f2 100644 --- a/runtime/cilk2c.c +++ b/runtime/cilk2c.c @@ -19,6 +19,21 @@ CHEETAH_INTERNAL unsigned cilkg_nproc = 0; CHEETAH_INTERNAL struct cilkrts_callbacks cilkrts_callbacks = { 0, 0, false, {NULL}, {NULL}}; +// Internal method to get the Cilk worker ID. Intended for debugging purposes. +// +// TODO: Figure out how we want to support worker-local storage. +unsigned __cilkrts_get_worker_number(void) { + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + if (w) + return w->self; + // Use the last exiting worker from default_cilkrts instead + return default_cilkrts->exiting_worker; +} + +// Test if the Cilk runtime has been initialized. This method is intended to +// help initialization of libraries that depend on the OpenCilk runtime. +int __cilkrts_is_initialized(void) { return NULL != default_cilkrts; } + // These callback-registration methods can run before the runtime system has // started. // @@ -46,78 +61,6 @@ int __cilkrts_atexit(void (*callback)(void)) { return 0; } -// Internal method to get the Cilk worker ID. Intended for debugging purposes. -// -// TODO: Figure out how we want to support worker-local storage. -unsigned __cilkrts_get_worker_number(void) { - return __cilkrts_get_tls_worker()->self; -} - -#ifdef __linux__ /* This feature requires the GNU linker */ -CHEETAH_INTERNAL -const char get_workerwarn_msg[] - __attribute__((section(".gnu.warning.__cilkrts_get_worker_number"))) = - "__cilkrts_get_worker_number is deprecated"; -#endif - -// ================================================================ -// This file contains the compiler ABI, which corresponds to -// conceptually what the compiler generates to implement Cilk code. -// They are included here in part as documentation, and in part -// allow one to write and run "hand-compiled" Cilk code. 
-// ================================================================ - -// inlined by the compiler -void __cilkrts_enter_frame(__cilkrts_stack_frame *sf) { - __cilkrts_worker *w = __cilkrts_get_tls_worker(); - cilkrts_alert(ALERT_CFRAME, w, "__cilkrts_enter_frame %p", sf); - - sf->flags = 0; - sf->magic = w->g->frame_magic; - sf->call_parent = w->current_stack_frame; - sf->worker = w; - w->current_stack_frame = sf; - // WHEN_CILK_DEBUG(sf->magic = CILK_STACKFRAME_MAGIC); -} - -// inlined by the compiler; this implementation is only used in invoke-main.c -void __cilkrts_enter_frame_fast(__cilkrts_stack_frame *sf) { - __cilkrts_worker *w = __cilkrts_get_tls_worker(); - cilkrts_alert(ALERT_CFRAME, w, "__cilkrts_enter_frame_fast %p", sf); - - sf->flags = 0; - sf->magic = w->g->frame_magic; - sf->call_parent = w->current_stack_frame; - sf->worker = w; - w->current_stack_frame = sf; -} - -// inlined by the compiler; this implementation is only used in invoke-main.c -void __cilkrts_detach(__cilkrts_stack_frame *sf) { - struct __cilkrts_worker *w = sf->worker; - cilkrts_alert(ALERT_CFRAME, w, "__cilkrts_detach %p", sf); - - CILK_ASSERT(w, CHECK_CILK_FRAME_MAGIC(w->g, sf)); - CILK_ASSERT(w, sf->worker == __cilkrts_get_tls_worker()); - CILK_ASSERT(w, w->current_stack_frame == sf); - - struct __cilkrts_stack_frame *parent = sf->call_parent; - struct __cilkrts_stack_frame **tail = - atomic_load_explicit(&w->tail, memory_order_relaxed); - CILK_ASSERT(w, (tail + 1) < w->ltq_limit); - - // store parent at *tail, and then increment tail - *tail++ = parent; - sf->flags |= CILK_FRAME_DETACHED; - /* Release ordering ensures the two preceding stores are visible. */ - atomic_store_explicit(&w->tail, tail, memory_order_release); -} - -// inlined by the compiler; this implementation is only used in invoke-main.c -void __cilkrts_save_fp_ctrl_state(__cilkrts_stack_frame *sf) { - sysdep_save_fp_ctrl_state(sf); -} - // Called after a normal cilk_sync (i.e. not the cilk_sync called in the // personality function.) Checks if there is an exception that needs to be // propagated. This is called from the frame that will handle whatever exception @@ -199,6 +142,11 @@ void __cilkrts_cleanup_fiber(__cilkrts_stack_frame *sf, int32_t sel) { SP(sf) = (void *)t->parent_rsp; t->parent_rsp = NULL; + if (t->saved_throwing_fiber) { + cilk_fiber_deallocate_to_pool(w, t->saved_throwing_fiber); + t->saved_throwing_fiber = NULL; + } + deque_unlock_self(w); __builtin_longjmp(sf->ctx, 1); // Does not return return; @@ -221,26 +169,10 @@ void __cilkrts_sync(__cilkrts_stack_frame *sf) { } } -// inlined by the compiler; this implementation is only used in invoke-main.c -void __cilkrts_pop_frame(__cilkrts_stack_frame *sf) { - __cilkrts_worker *w = sf->worker; - cilkrts_alert(ALERT_CFRAME, w, "__cilkrts_pop_frame %p", sf); - - CILK_ASSERT(w, CHECK_CILK_FRAME_MAGIC(w->g, sf)); - CILK_ASSERT(w, sf->worker == __cilkrts_get_tls_worker()); - /* The inlined version in the Tapir compiler uses release - semantics for the store to call_parent, but relaxed - order may be acceptable for both. A thief can't see - these operations until the Dekker protocol with a - memory barrier has run. 
*/ - w->current_stack_frame = sf->call_parent; - sf->call_parent = 0; -} - void __cilkrts_pause_frame(__cilkrts_stack_frame *sf, char *exn) { __cilkrts_worker *w = sf->worker; - cilkrts_alert(ALERT_CFRAME, w, "__cilkrts_pause_frame %p", sf); + cilkrts_alert(CFRAME, w, "__cilkrts_pause_frame %p", (void *)sf); CILK_ASSERT(w, CHECK_CILK_FRAME_MAGIC(w->g, sf)); CILK_ASSERT(w, sf->worker == __cilkrts_get_tls_worker()); @@ -267,9 +199,9 @@ void __cilkrts_pause_frame(__cilkrts_stack_frame *sf, char *exn) { } void __cilkrts_leave_frame(__cilkrts_stack_frame *sf) { - __cilkrts_worker *w = sf->worker; - cilkrts_alert(ALERT_CFRAME, w, "__cilkrts_leave_frame %p", sf); + __cilkrts_worker *w = atomic_load_explicit(&sf->worker, memory_order_relaxed); + cilkrts_alert(CFRAME, w, "__cilkrts_leave_frame %p", (void *)sf); CILK_ASSERT(w, CHECK_CILK_FRAME_MAGIC(w->g, sf)); CILK_ASSERT(w, sf->worker == __cilkrts_get_tls_worker()); @@ -301,7 +232,7 @@ void __cilkrts_leave_frame(__cilkrts_stack_frame *sf) { // frame returning is done via a different protocol, which is // triggered in Cilk_exception_handler. if (sf->flags & CILK_FRAME_STOLEN) { // if this frame has a full frame - cilkrts_alert(ALERT_RETURN, w, + cilkrts_alert(RETURN, w, "__cilkrts_leave_frame parent is call_parent!"); // leaving a full frame; need to get the full frame of its call // parent back onto the deque diff --git a/runtime/cilk2c.h b/runtime/cilk2c.h index 62ac19d0..b24a9ae5 100644 --- a/runtime/cilk2c.h +++ b/runtime/cilk2c.h @@ -4,13 +4,9 @@ #include "cilk-internal.h" #include -// mainly used by invoke-main.c -CHEETAH_INTERNAL unsigned long cilkrts_zero; - -// These functions are mostly inlined by the compiler, except for -// __cilkrts_leave_frame. However, their implementations are also -// provided in cilk2c.c. The implementations in cilk2c.c are used -// by invoke-main.c and can be used to "hand compile" cilk code. +// ABI functions inlined by the compiler (provided as a bitcode file after +// compiling the runtime) are defined in cilk2c_inlined.c. +// ABI functions not inlined by the compiler are defined in cilk2c.c. CHEETAH_API void __cilkrts_enter_frame(__cilkrts_stack_frame *sf); CHEETAH_API void __cilkrts_enter_frame_fast(__cilkrts_stack_frame *sf); CHEETAH_API void __cilkrts_save_fp_ctrl_state(__cilkrts_stack_frame *sf); @@ -22,5 +18,8 @@ CHEETAH_API void __cilkrts_sync(__cilkrts_stack_frame *sf); CHEETAH_API void __cilkrts_pop_frame(__cilkrts_stack_frame *sf); CHEETAH_API void __cilkrts_pause_frame(__cilkrts_stack_frame *sf, char *exn); CHEETAH_API void __cilkrts_leave_frame(__cilkrts_stack_frame *sf); -CHEETAH_API unsigned __cilkrts_get_nworkers(void); +// Not marked as CHEETAH_API as it may be deprecated soon +unsigned __cilkrts_get_nworkers(void); +//CHEETAH_API int64_t* __cilkrts_get_pedigree(void); +//void __cilkrts_pedigree_bump_rank(void); #endif diff --git a/runtime/cilk2c_inlined.c b/runtime/cilk2c_inlined.c new file mode 100644 index 00000000..7fac080b --- /dev/null +++ b/runtime/cilk2c_inlined.c @@ -0,0 +1,243 @@ +// ================================================================ +// This file contains the compiler ABI, which corresponds to +// conceptually what the compiler generates to implement Cilk code. +// They are included here in part as documentation, and in part +// allow one to write and run "hand-compiled" Cilk code.
+// ================================================================ + +#include +#include +#include + +#include "cilk-internal.h" +#include "cilk2c.h" +#include "debug.h" +#include "fiber.h" +#include "global.h" +#include "init.h" +#include "readydeque.h" +#include "scheduler.h" + +#ifdef ENABLE_CILKRTS_PEDIGREE +extern __cilkrts_pedigree cilkrts_root_pedigree_node; +extern uint64_t DPRNG_PRIME; +extern uint64_t* dprng_m_array; +extern uint64_t dprng_m_X; + +uint64_t __cilkrts_dprng_swap_halves(uint64_t x); +uint64_t __cilkrts_dprng_mix(uint64_t x); +uint64_t __cilkrts_dprng_mix_mod_p(uint64_t x); +uint64_t __cilkrts_dprng_sum_mod_p(uint64_t a, uint64_t b); +void __cilkrts_init_dprng(void); + +uint64_t __cilkrts_get_dprand(void) { + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + __cilkrts_bump_worker_rank(); + return __cilkrts_dprng_mix_mod_p(w->current_stack_frame->dprng_dotproduct); +} + +#endif + +// Begin a Cilkified region. The routine runs on a Cilkifying thread to +// transfer the execution of this function to the workers in global_state g. +// This routine must be inlined for correctness. +static inline __attribute__((always_inline)) void +cilkify(global_state *g, __cilkrts_stack_frame *sf) { + // After inlining, orig_rsp will receive the stack pointer in the stack + // frame of the Cilk function instantiation on the Cilkifying thread. + void *orig_rsp = NULL; + ASM_GET_SP(orig_rsp); + +#ifdef ENABLE_CILKRTS_PEDIGREE + __cilkrts_init_dprng(); +#endif + + // After inlining, the setjmp saves the processor state, including the frame + // pointer, of the Cilk function. + if (__builtin_setjmp(sf->ctx) == 0) { + sysdep_save_fp_ctrl_state(sf); + invoke_cilkified_root(g, sf); + + wait_until_cilk_done(g); + + // At this point, some Cilk worker must have completed the Cilkified + // region and executed uncilkify at the end of the Cilk function. The + // longjmp will therefore jump to the end of the Cilk function. We need + // only restore the stack pointer to its original value on the + // Cilkifying thread's stack. + SP(sf) = orig_rsp; + sysdep_restore_fp_state(sf); + __builtin_longjmp(sf->ctx, 1); + } +} + +// End a Cilkified region. This routine runs on one worker in global_state g +// who finished executing the Cilkified region, in order to transfer control +// back to the original thread that began the Cilkified region. This routine +// must be inlined for correctness. +static inline __attribute__((always_inline)) void +uncilkify(global_state *g, __cilkrts_stack_frame *sf) { + // The setjmp will save the processor state at the end of the Cilkified + // region. The Cilkifying thread will longjmp to this point. + if (__builtin_setjmp(sf->ctx) == 0) { + sysdep_save_fp_ctrl_state(sf); + // Finish this Cilkified region, and transfer control back to the + // original thread that performed cilkify. 
+ exit_cilkified_root(g, sf); + } +} + +#ifdef ENABLE_CILKRTS_PEDIGREE +__attribute__((always_inline)) __cilkrts_pedigree __cilkrts_get_pedigree(void) { + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + if (w == NULL) { + return cilkrts_root_pedigree_node; + } else { + __cilkrts_pedigree ret_ped; + ret_ped.parent = &(w->current_stack_frame->pedigree); + ret_ped.rank = w->current_stack_frame->rank; + return ret_ped; + } +} + +__attribute__((always_inline)) void __cilkrts_bump_worker_rank(void) { + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + if (w == NULL) { + cilkrts_root_pedigree_node.rank++; + } else { + w->current_stack_frame->rank++; + w->current_stack_frame->dprng_dotproduct = __cilkrts_dprng_sum_mod_p( + w->current_stack_frame->dprng_dotproduct, + dprng_m_array[w->current_stack_frame->dprng_depth]); + } +} +#endif + +// Enter a new Cilk function, i.e., a function that contains a cilk_spawn. This +// function must be inlined for correctness. +__attribute__((always_inline)) void +__cilkrts_enter_frame(__cilkrts_stack_frame *sf) { + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + sf->flags = 0; + if (NULL == w) { + cilkify(default_cilkrts, sf); + w = __cilkrts_get_tls_worker(); + } + cilkrts_alert(CFRAME, w, "__cilkrts_enter_frame %p", (void *)sf); + + sf->magic = frame_magic; + sf->call_parent = w->current_stack_frame; + atomic_store_explicit(&sf->worker, w, memory_order_relaxed); + w->current_stack_frame = sf; + // WHEN_CILK_DEBUG(sf->magic = CILK_STACKFRAME_MAGIC); + +#ifdef ENABLE_CILKRTS_PEDIGREE + // Pedigree maintenance. + if (sf->call_parent != NULL && !(sf->flags & CILK_FRAME_LAST)) { + sf->pedigree.rank = sf->call_parent->rank++; + sf->pedigree.parent = &(sf->call_parent->pedigree); + sf->dprng_depth = sf->call_parent->dprng_depth + 1; + sf->call_parent->dprng_dotproduct = __cilkrts_dprng_sum_mod_p( + sf->call_parent->dprng_dotproduct, + dprng_m_array[sf->call_parent->dprng_depth]); + sf->dprng_dotproduct = sf->call_parent->dprng_dotproduct; + } else { + sf->pedigree.rank = 0; + sf->pedigree.parent = NULL; + sf->dprng_depth = 0; + sf->dprng_dotproduct = dprng_m_X; + } + sf->rank = 0; +#endif +} + +// Enter a spawn helper, i.e., a function containing code that was cilk_spawn'd. +// This function initializes worker and stack_frame structures. Because this +// routine will always be executed by a Cilk worker, it is optimized compared to +// its counterpart, __cilkrts_enter_frame. +__attribute__((always_inline)) void +__cilkrts_enter_frame_fast(__cilkrts_stack_frame *sf) { + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + cilkrts_alert(CFRAME, w, "__cilkrts_enter_frame_fast %p", (void *)sf); + + sf->flags = 0; + sf->magic = frame_magic; + sf->call_parent = w->current_stack_frame; + atomic_store_explicit(&sf->worker, w, memory_order_relaxed); + w->current_stack_frame = sf; + +#ifdef ENABLE_CILKRTS_PEDIGREE + // Pedigree maintenance.
+ if (sf->call_parent != NULL && !(sf->flags & CILK_FRAME_LAST)) { + sf->pedigree.rank = sf->call_parent->rank++; + sf->pedigree.parent = &(sf->call_parent->pedigree); + sf->dprng_depth = sf->call_parent->dprng_depth + 1; + sf->call_parent->dprng_dotproduct = __cilkrts_dprng_sum_mod_p( + sf->call_parent->dprng_dotproduct, + dprng_m_array[sf->call_parent->dprng_depth]); + sf->dprng_dotproduct = sf->call_parent->dprng_dotproduct; + } else { + sf->pedigree.rank = 0; + sf->pedigree.parent = NULL; + sf->dprng_depth = 0; + sf->dprng_dotproduct = dprng_m_X; + } + sf->rank = 0; +#endif +} + +// Detach the given Cilk stack frame, allowing other Cilk workers to steal the +// parent frame. +__attribute__((always_inline)) +void __cilkrts_detach(__cilkrts_stack_frame *sf) { + __cilkrts_worker *w = + atomic_load_explicit(&sf->worker, memory_order_relaxed); + cilkrts_alert(CFRAME, w, "__cilkrts_detach %p", (void *)sf); + + CILK_ASSERT(w, CHECK_CILK_FRAME_MAGIC(w->g, sf)); + CILK_ASSERT(w, sf->worker == __cilkrts_get_tls_worker()); + CILK_ASSERT(w, w->current_stack_frame == sf); + + struct __cilkrts_stack_frame *parent = sf->call_parent; + sf->flags |= CILK_FRAME_DETACHED; + struct __cilkrts_stack_frame **tail = + atomic_load_explicit(&w->tail, memory_order_relaxed); + CILK_ASSERT(w, (tail + 1) < w->ltq_limit); + + // store parent at *tail, and then increment tail + *tail++ = parent; + /* Release ordering ensures the two preceding stores are visible. */ + atomic_store_explicit(&w->tail, tail, memory_order_release); +} + +// inlined by the compiler; this implementation is only used in invoke-main.c +__attribute__((always_inline)) +void __cilkrts_save_fp_ctrl_state(__cilkrts_stack_frame *sf) { + sysdep_save_fp_ctrl_state(sf); +} + +// Pop this Cilk stack frame off of the bottom of the linked list of +// __cilkrts_stack_frames, and if popping the last Cilk stack frame, call +// uncilkify to terminate the Cilkified region. This function must be inlined +// for correctness. +__attribute__((always_inline)) +void __cilkrts_pop_frame(__cilkrts_stack_frame *sf) { + __cilkrts_worker *w = + atomic_load_explicit(&sf->worker, memory_order_relaxed); + cilkrts_alert(CFRAME, w, "__cilkrts_pop_frame %p", (void *)sf); + + CILK_ASSERT(w, CHECK_CILK_FRAME_MAGIC(w->g, sf)); + CILK_ASSERT(w, sf->worker == __cilkrts_get_tls_worker()); + /* The inlined version in the Tapir compiler uses release + semantics for the store to call_parent, but relaxed + order may be acceptable for both. A thief can't see + these operations until the Dekker protocol with a + memory barrier has run. */ + w->current_stack_frame = sf->call_parent; + sf->call_parent = NULL; + // Check if sf is the final stack frame, and if so, terminate the Cilkified + // region. 
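The comment in __cilkrts_detach above states the ordering contract: the parent frame is stored at *tail before the release store that bumps tail, so a thief that acquire-loads tail is guaranteed to see the frame it points past. A self-contained sketch of that publish/steal pairing, with illustrative names rather than the runtime's actual deque:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int slot;                    // payload, written before publication
static _Atomic(int *) tail = NULL;  // publication pointer, like w->tail

static void *thief(void *arg) {
    (void)arg;
    int *t;
    // Acquire load pairs with the release store in the victim.
    while ((t = atomic_load_explicit(&tail, memory_order_acquire)) == NULL)
        ;
    // The release/acquire pair guarantees this read observes 42.
    printf("stole %d\n", *t);
    return NULL;
}

int main(void) {
    pthread_t th;
    pthread_create(&th, NULL, thief, NULL);
    slot = 42;                                                  // store payload
    atomic_store_explicit(&tail, &slot, memory_order_release);  // then publish
    pthread_join(th, NULL);
    return 0;
}

By contrast, the pop_frame comment above notes that relaxed ordering may suffice for the call_parent store, because a thief cannot observe it until the Dekker-style protocol with a full barrier has run.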
+ if (sf->flags & CILK_FRAME_LAST) { + uncilkify(w->g, sf); + } +} diff --git a/runtime/cilkred_map.c b/runtime/cilkred_map.c index 202a25ca..902df357 100644 --- a/runtime/cilkred_map.c +++ b/runtime/cilkred_map.c @@ -1,5 +1,7 @@ #include "cilkred_map.h" +#include + // ================================================================= // small helper functions // ================================================================= @@ -99,22 +101,20 @@ ViewInfo *cilkred_map_lookup(cilkred_map *this_map, cilkred_map *cilkred_map_make_map(__cilkrts_worker *w, size_t size) { CILK_ASSERT_G(w); CILK_ASSERT(w, size > 0 && (hyper_id_t)size == size); - cilkrts_alert(ALERT_REDUCE, w, - "(cilkred_map_make_map) creating a cilkred_map size %u", - (unsigned int)size); - cilkred_map *h = (cilkred_map *)malloc(sizeof(*h)); + cilkred_map *h = + (cilkred_map *)cilk_internal_malloc(w, sizeof(*h), IM_REDUCER_MAP); // MAK: w is not NULL h->spa_cap = size; h->num_of_vinfo = 0; h->num_of_logs = 0; + h->merging = false; h->vinfo = (ViewInfo *)calloc(size, sizeof(ViewInfo)); h->log = (hyper_id_t *)calloc(size / 2, sizeof(hyper_id_t)); - h->merging = false; - cilkrts_alert(ALERT_REDUCE, w, - "(cilkred_map_make_map) created cilkred_map %p", h); + cilkrts_alert(REDUCE, w, "created reducer map size %zu %p", size, + (void *)h); return h; } @@ -130,23 +130,24 @@ void cilkred_map_destroy_map(__cilkrts_worker *w, cilkred_map *h) { if (!h) { return; } - cilkrts_alert(ALERT_REDUCE, w, - "(cilkred_map_destroy_map) freeing cilkred_map %p", h); + if (DEBUG_ENABLED(REDUCER)) { + for (hyper_id_t i = 0; i < h->spa_cap; ++i) + CILK_ASSERT(w, !h->vinfo[i].val); + } free(h->vinfo); h->vinfo = NULL; free(h->log); h->log = NULL; - free(h); + cilk_internal_free(w, h, sizeof(*h), IM_REDUCER_MAP); - cilkrts_alert(ALERT_REDUCE, w, - "(cilkred_map_destroy_map) freed cilkred_map %p\n", h); + cilkrts_alert(REDUCE, w, "freed reducer map %p", (void *)h); } +/* This function is responsible for freeing other_map. */ void cilkred_map_merge(cilkred_map *this_map, __cilkrts_worker *w, cilkred_map *other_map, merge_kind kind) { - cilkrts_alert(ALERT_REDUCE, w, - "(cilkred_map_merge) merging %p into %p, order %d", other_map, - this_map, kind); + cilkrts_alert(REDUCE, w, "merging reducer map %p into %p, order %d", + (void *)other_map, (void *)this_map, kind); // Remember the current stack frame. 
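The cilkred_map changes above swap plain malloc/free for cilk_internal_malloc/cilk_internal_free with a usage tag (IM_REDUCER_MAP here; IM_CLOSURE and IM_FIBER elsewhere in this patch). A sketch of what such tagging buys, assuming a simple per-tag byte counter (hypothetical structure; the real internal allocator also pools and recycles buckets):

#include <stdlib.h>

enum im_tag { IM_CLOSURE, IM_REDUCER_MAP, IM_FIBER, IM_NUM_TAGS };

struct im_stats { long allocated[IM_NUM_TAGS]; };

// Tagged wrapper in the spirit of cilk_internal_malloc(w, size, tag):
// the tag enables per-purpose accounting and leak checks at shutdown.
void *im_malloc(struct im_stats *s, size_t size, enum im_tag tag) {
    s->allocated[tag] += (long)size;
    return malloc(size);
}

void im_free(struct im_stats *s, void *p, size_t size, enum im_tag tag) {
    s->allocated[tag] -= (long)size;  // should return to zero per tag
    free(p);
}
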
// __cilkrts_stack_frame *current_sf = w->current_stack_frame; this_map->merging = true; @@ -157,8 +158,10 @@ void cilkred_map_merge(cilkred_map *this_map, __cilkrts_worker *w, // CILK_ASSERT(w, !other_map->is_leftmost /* || kind == MERGE_UNORDERED */); // bool merge_to_leftmost = (this_map->is_leftmost); - if (other_map->num_of_vinfo == 0) - return; // A no-op + if (other_map->num_of_vinfo == 0) { + cilkred_map_destroy_map(w, other_map); + return; + } if (other_map->num_of_logs <= (other_map->spa_cap / 2)) { hyper_id_t i; @@ -214,7 +217,7 @@ void cilkred_map_merge(cilkred_map *this_map, __cilkrts_worker *w, // this_map->is_leftmost = this_map->is_leftmost || other_map->is_leftmost; this_map->merging = false; other_map->merging = false; - // cilkred_map_destroy_map(w, other_map); + cilkred_map_destroy_map(w, other_map); return; } diff --git a/runtime/cilkred_map.h b/runtime/cilkred_map.h index 94041480..2d0d5717 100644 --- a/runtime/cilkred_map.h +++ b/runtime/cilkred_map.h @@ -75,6 +75,9 @@ cilkred_map *cilkred_map_make_map(__cilkrts_worker *w, size_t size); CHEETAH_INTERNAL void cilkred_map_destroy_map(__cilkrts_worker *w, cilkred_map *h); +/** + * Merge other_map into this_map and destroy other_map. + */ CHEETAH_INTERNAL void cilkred_map_merge(cilkred_map *this_map, __cilkrts_worker *w, cilkred_map *other_map, merge_kind kind); diff --git a/runtime/closure.c b/runtime/closure.c index 03ddd40b..41085837 100644 --- a/runtime/closure.c +++ b/runtime/closure.c @@ -29,6 +29,25 @@ void Closure_checkmagic(__cilkrts_worker *const w, Closure *t) { } } +const char *Closure_status_to_str(enum ClosureStatus status) { + switch (status) { + case CLOSURE_RUNNING: + return "running"; + case CLOSURE_SUSPENDED: + return "suspended"; + case CLOSURE_RETURNING: + return "returning"; + case CLOSURE_READY: + return "ready"; + case CLOSURE_PRE_INVALID: + return "pre-invalid"; + case CLOSURE_POST_INVALID: + return "post-invalid"; + default: + return "unknown"; + } +} + void clear_closure_exception(struct closure_exception *exn) { exn->exn = NULL; } void Closure_change_status(__cilkrts_worker *const w, Closure *t, @@ -98,8 +117,8 @@ static inline void Closure_init(Closure *t) { t->status = CLOSURE_PRE_INVALID; t->lock_wait = false; t->has_cilk_callee = false; - t->join_counter = 0; t->simulated_stolen = false; + t->join_counter = 0; t->frame = NULL; t->fiber = NULL; @@ -124,6 +143,7 @@ static inline void Closure_init(Closure *t) { clear_closure_exception(&(t->user_exn)); t->reraise_cfa = NULL; t->parent_rsp = NULL; + t->saved_throwing_fiber = NULL; atomic_store_explicit(&t->child_rmap, NULL, memory_order_relaxed); atomic_store_explicit(&t->right_rmap, NULL, memory_order_relaxed); @@ -132,11 +152,14 @@ static inline void Closure_init(Closure *t) { Closure *Closure_create(__cilkrts_worker *const w) { /* cilk_internal_malloc returns sufficiently aligned memory */ - Closure *new_closure = cilk_internal_malloc(w, sizeof(*new_closure)); + Closure *new_closure = + cilk_internal_malloc(w, sizeof(*new_closure), IM_CLOSURE); CILK_ASSERT(w, new_closure != NULL); Closure_init(new_closure); + cilkrts_alert(CLOSURE, w, "Allocate closure %p", (void *)new_closure); + return new_closure; } @@ -305,7 +328,7 @@ void Closure_suspend_victim(__cilkrts_worker *thief, __cilkrts_worker *victim, Closure_assert_ownership(thief, cl); deque_assert_ownership(thief, victim->self); - CILK_ASSERT(thief, cl == thief->g->invoke_main || cl->spawn_parent || + CILK_ASSERT(thief, cl == thief->g->root_closure || cl->spawn_parent || cl->call_parent); 
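As the hunk above shows, cilkred_map_merge now frees other_map on every path, including the empty-map early return, and it chooses between two traversals: when at most half the slots were ever written it follows the log of touched indices, otherwise it scans the whole array. A sketch of that log-versus-scan choice over a sparse accumulator (hypothetical types):

#include <stddef.h>

struct spa {
    size_t cap;       // number of slots
    size_t num_logs;  // count of touched slots, while the log is kept
    size_t *log;      // indices of touched slots
    void **slot;      // cap entries, mostly NULL
};

// Visit every live view, choosing the cheaper traversal.
void spa_for_each(struct spa *m, void (*visit)(void *)) {
    if (m->num_logs <= m->cap / 2) {
        for (size_t i = 0; i < m->num_logs; i++)  // sparse: follow the log
            if (m->slot[m->log[i]])
                visit(m->slot[m->log[i]]);
    } else {
        for (size_t i = 0; i < m->cap; i++)       // dense: scan all slots
            if (m->slot[i])
                visit(m->slot[i]);
    }
}
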
Closure_change_status(thief, cl, CLOSURE_RUNNING, CLOSURE_SUSPENDED); @@ -321,14 +344,14 @@ void Closure_suspend(__cilkrts_worker *const w, Closure *cl) { CILK_ASSERT(w, !cl->user_rmap); - cilkrts_alert(ALERT_SCHED, w, "Closure_suspend %p", cl); + cilkrts_alert(SCHED, w, "Closure_suspend %p", (void *)cl); Closure_checkmagic(w, cl); Closure_assert_ownership(w, cl); deque_assert_ownership(w, w->self); - CILK_ASSERT(w, - cl == w->g->invoke_main || cl->spawn_parent || cl->call_parent); + CILK_ASSERT(w, cl == w->g->root_closure || cl->spawn_parent || + cl->call_parent); CILK_ASSERT(w, cl->frame != NULL); CILK_ASSERT(w, __cilkrts_stolen(cl->frame)); CILK_ASSERT(w, cl->frame->worker->self == w->self); @@ -380,8 +403,19 @@ void Closure_destroy_main(Closure *t) { * pool) */ void Closure_destroy(struct __cilkrts_worker *const w, Closure *t) { + cilkrts_alert(CLOSURE, w, "Deallocate closure %p", (void *)t); Closure_checkmagic(w, t); t->status = CLOSURE_POST_INVALID; Closure_clean(w, t); - cilk_internal_free(w, t, sizeof(*t)); + cilk_internal_free(w, t, sizeof(*t), IM_CLOSURE); +} + +/* Destroy the closure and internally free it (put back to global pool), after + workers have been terminated. + */ +void Closure_destroy_global(struct global_state *const g, Closure *t) { + cilkrts_alert(CLOSURE, NULL, "Deallocate closure %p", (void *)t); + t->status = CLOSURE_POST_INVALID; + Closure_clean(NULL, t); + cilk_internal_free_global(g, t, sizeof(*t), IM_CLOSURE); } diff --git a/runtime/closure.h b/runtime/closure.h index 7dbb2b99..9ea14dcb 100644 --- a/runtime/closure.h +++ b/runtime/closure.h @@ -22,17 +22,7 @@ enum ClosureStatus { CLOSURE_POST_INVALID /* after destruction */ }; -static inline const char *Closure_status_to_str(enum ClosureStatus status) { - switch(status) { - case CLOSURE_RUNNING: return "running"; - case CLOSURE_SUSPENDED: return "suspended"; - case CLOSURE_RETURNING: return "returning"; - case CLOSURE_READY: return "ready"; - case CLOSURE_PRE_INVALID: return "pre-invalid"; - case CLOSURE_POST_INVALID: return "post-invalid"; - default: return "unknown"; - } -} +CHEETAH_INTERNAL const char *Closure_status_to_str(enum ClosureStatus status); #if CILK_DEBUG #define Closure_assert_ownership(w, t) Closure_assert_ownership(w, t) @@ -68,14 +58,11 @@ struct Closure { worker_id owner_ready_deque; /* debug only */ worker_id mutex_owner; /* debug only */ - enum ClosureStatus status : 16; /* doubles as magic number */ - bool lock_wait; + enum ClosureStatus status : 8; /* doubles as magic number */ bool has_cilk_callee; + bool lock_wait; + bool simulated_stolen; unsigned int join_counter; /* number of outstanding spawned children */ - bool simulated_stolen; /* ANGE XXX: Sorry, I probably messed - up the alignment; should we update join_counter - to be a short instead? 
*/ - char *orig_rsp; /* the rsp one should use when sync successfully */ Closure *callee; @@ -119,6 +106,7 @@ struct Closure { char *reraise_cfa; char *parent_rsp; + struct cilk_fiber *saved_throwing_fiber; // cilkred_map *children_reducer_map; // cilkred_map *right_reducer_map; @@ -189,4 +177,6 @@ CHEETAH_INTERNAL void Closure_suspend(__cilkrts_worker *const w, Closure *cl); CHEETAH_INTERNAL void Closure_make_ready(Closure *cl); CHEETAH_INTERNAL void Closure_destroy(__cilkrts_worker *const w, Closure *t); CHEETAH_INTERNAL void Closure_destroy_main(Closure *t); +CHEETAH_INTERNAL void Closure_destroy_global(struct global_state *const g, + Closure *t); #endif diff --git a/runtime/cmdline.c b/runtime/cmdline.c deleted file mode 100644 index c882cc1f..00000000 --- a/runtime/cmdline.c +++ /dev/null @@ -1,164 +0,0 @@ -#include -#include -#include -#include - -#include "cilk-internal.h" -#include "cmdline.h" -#include "debug.h" -#include "global.h" - -enum { - NONE, - NPROC, - DEQ_DEPTH, - STACK_SIZE, - FIBER_POOL_CAP, - VERSION, - HELP, - END_OPTIONS -}; - -static const char *option_prefix = "cheetah-"; - -// TODO: Incorporate option_prefix into all of the optarray help -// entries in place of 'cheetah-'. -CHEETAH_INTERNAL -static struct options { - const char *string; - int option; - const char *help; -} optarray[] = { - {"", END_OPTIONS, "--cheetah- : end of option parsing"}, - {"nproc", NPROC, "--cheetah-nproc : set number of processors"}, - {"deqdepth", DEQ_DEPTH, - "--cheetah-deqdepth : set number of entries per deque"}, - {"stacksize", STACK_SIZE, - "--cheetah-stacksize : set the size of a fiber"}, - {"fiber-pool", FIBER_POOL_CAP, - "--cheetah-fiber-pool : set the per-worker fiber pool capacity"}, - {"version", VERSION, "--cheetah-version: print version of the runtime"}, - {"help", HELP, "--cheetah-help : print this message"}, - {(char *)0, NONE, ""}}; - -static void print_help(void) { - struct options *p; - fprintf(stderr, "cheetah runtime options:\n"); - for (p = optarray + 1; p->string; ++p) - if (p->help) - fprintf(stderr, " %s\n", p->help); - fprintf(stderr, "\n"); -} - -static void print_version(void) { - int debug = 0, stats = 0; - WHEN_CILK_DEBUG(debug = 1); -#if CILK_STATS - stats = 1; -#endif - fprintf(stderr, "version %d.%d\n", __CILKRTS_VERSION, - __CILKRTS_ABI_VERSION); - fprintf(stderr, "compilation options: "); - if (debug) - fprintf(stderr, "CILK_DEBUG "); - if (stats) - fprintf(stderr, "CILK_STATS "); - if (!(debug | stats)) - fprintf(stderr, "none"); - fprintf(stderr, "\n"); -} - -/* look for a given string in the option table */ -static struct options *parse_option(char *s) { - struct options *p; - for (p = optarray; p->string; ++p) - if (strncmp(s, p->string, strlen(p->string) + 1) == 0) - break; - return p; -} - -static unsigned long parse_unsigned(const char *s, unsigned long min, - unsigned long max) { - unsigned long val = strtoul(s, 0, 0); - if (val < min) - return min; - if (val > max) - return max; - return val; -} - -#define CHECK(cond, complaint) \ - if (!(cond)) { \ - fprintf(stderr, "Bad option argument for -%s: %s\n", p->string, \ - complaint); \ - return 1; \ - } - -CHEETAH_INTERNAL int parse_command_line(struct rts_options *options, int *argc, - char *argv[]) { - struct options *p; - /* gcc allows to write directly into *options, but other compilers - * only allow you to initialize this way. 
- */
-    struct rts_options default_options = DEFAULT_OPTIONS;
-
-    /* default options */
-    *options = default_options;
-
-    int j = 1;
-    for (int i = 1; i < *argc; ++i) {
-        if (argv[i][0] == '-' && argv[i][1] == '-' &&
-            strncmp(argv[i] + 2, option_prefix, strlen(option_prefix)) == 0) {
-            p = parse_option(argv[i] + 2 + strlen(option_prefix));
-
-            switch (p->option) {
-            case NPROC:
-                ++i;
-                CHECK(i < *argc, "argument missing");
-                options->nproc = parse_unsigned(argv[i], 0, 9999);
-                break;
-
-            case DEQ_DEPTH:
-                ++i;
-                CHECK(i < *argc, "argument missing");
-                options->deqdepth = parse_unsigned(argv[i], 1, 99999);
-                break;
-
-            case STACK_SIZE:
-                ++i;
-                CHECK(i < *argc, "argument missing");
-                options->stacksize =
-                    parse_unsigned(argv[i], 16384, 100 * 1024 * 1024);
-                break;
-
-            case VERSION:
-                print_version();
-                return 1;
-                break;
-
-            case HELP:
-                print_help();
-                return 1;
-                break;
-
-            case FIBER_POOL_CAP:
-                ++i;
-                CHECK(i < *argc, "argument missing");
-                options->fiber_pool_cap = parse_unsigned(argv[i], 8, 999999);
-                break;
-
-            default:
-                fprintf(stderr, "Unrecognized options.\n");
-                print_help();
-                return 1;
-                break;
-            }
-        } else {
-            assert(j <= i);
-            argv[j++] = argv[i]; // keep it
-        }
-    }
-    *argc = j;
-
-    return 0;
-}
diff --git a/runtime/cmdline.h b/runtime/cmdline.h
deleted file mode 100644
index 2d1e4ac0..00000000
--- a/runtime/cmdline.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "rts-config.h"
-
-struct rts_options;
-
-CHEETAH_INTERNAL int parse_command_line(struct rts_options *options, int *argc,
-                                        char *argv[]);
diff --git a/runtime/debug.c b/runtime/debug.c
index 413904fc..623fb867 100644
--- a/runtime/debug.c
+++ b/runtime/debug.c
@@ -6,65 +6,129 @@
 #include
 #include
 #include
+#include
 
-CHEETAH_INTERNAL unsigned int alert_level = ALERT_LVL;
+CHEETAH_INTERNAL unsigned int alert_level = 0;
+CHEETAH_INTERNAL unsigned int debug_level = 0;
+
+/* To reduce the overhead of logging, messages are accumulated into memory
+   and written to stderr in batches of about 5,000 bytes. */
+static size_t alert_log_size = 0, alert_log_offset = 0;
+static char *alert_log = NULL;
+
+void set_alert_level(unsigned int level) {
+    alert_level = level;
+    if (level == 0) {
+        flush_alert_log();
+        return;
+    }
+    if (level & 0x80000000) {
+        return;
+    }
+    if (alert_log == NULL) {
+        alert_log_size = 5000;
+        alert_log = malloc(alert_log_size);
+        if (alert_log) {
+            memset(alert_log, ' ', alert_log_size);
+        }
+    }
+}
+
+void set_debug_level(unsigned int level) { debug_level = level; }
 
 const char *const __cilkrts_assertion_failed =
     "%s:%d: cilk assertion failed: %s\n";
 
-void cilk_die_internal(struct global_state *const g, const char *complain) {
+void cilk_die_internal(struct global_state *const g, const char *fmt, ...) {
+    fflush(stdout);
+    va_list l;
+    va_start(l, fmt);
     cilk_mutex_lock(&(g->print_lock));
-    fprintf(stderr, "Fatal error: %s\n", complain);
+    flush_alert_log();
+    fprintf(stderr, "Fatal error: ");
+    vfprintf(stderr, fmt, l);
+    fputc('\n', stderr);
+    fflush(stderr);
     cilk_mutex_unlock(&(g->print_lock));
     exit(1);
 }
 
 CHEETAH_INTERNAL_NORETURN
 void cilkrts_bug(__cilkrts_worker *w, const char *fmt, ...) {
+    fflush(NULL);
     if (w) {
+        cilk_mutex_lock(&(w->g->print_lock));
+        flush_alert_log();
+        cilk_mutex_unlock(&(w->g->print_lock));
         fprintf(stderr, "[W%02u]: ", w->self);
     }
-
-    /* To reduce user confusion, write all user-generated output
-       before the system-generated error message.
*/ + /* Without a worker there is no safe way to flush the log */ va_list l; - fflush(NULL); va_start(l, fmt); vfprintf(stderr, fmt, l); va_end(l); fputc('\n', stderr); fflush(stderr); - abort(); // generate core file } -#if CILK_DEBUG +void flush_alert_log() { + if (ALERT_LVL == 0) + return; + if (alert_log == NULL) { + return; + } + if (alert_log_offset > 0) { + fflush(stdout); + fwrite(alert_log, 1, alert_log_offset, stderr); + alert_log_offset = 0; + } + alert_log_size = 0; + free(alert_log); + alert_log = NULL; +} + #undef cilkrts_alert +static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; + CHEETAH_INTERNAL void cilkrts_alert(const int lvl, __cilkrts_worker *w, const char *fmt, ...) { - /* To reduce user confusion, write all user-generated output - before the system-generated error message. */ -#ifndef ALERT_LVL + if (ALERT_LVL == 0) + return; + char prefix[10], body[200]; + size_t size1 = 0, size2 = 0; if (w) { - fprintf(stderr, "[W%02u]: ", w->self); + size1 = snprintf(prefix, sizeof prefix, "[W%02u]: ", w->self); + assert(size1 >= 7 && size1 < 10); } - va_list l; - va_start(l, fmt); - vfprintf(stderr, fmt, l); - va_end(l); - fputc('\n', stderr); -#else - if (lvl & ALERT_LVL) { - if (w) { - fprintf(stderr, "[W%02u]: ", w->self); - } + { va_list l; va_start(l, fmt); - vfprintf(stderr, fmt, l); + int tmp = vsnprintf(body, sizeof body, fmt, l); + assert(tmp >= 0); + size2 = tmp; + if (size2 > sizeof body - 1) + size2 = sizeof body - 1; va_end(l); - fputc('\n', stderr); } -#endif + + pthread_mutex_lock(&lock); + if (alert_log) { + if (alert_log_offset + size1 + size2 + 1 >= alert_log_size) { + fwrite(alert_log, 1, alert_log_offset, stderr); + memset(alert_log, ' ', alert_log_offset); + alert_log_offset = 0; + } + memcpy(alert_log + alert_log_offset, prefix, size1); + memcpy(alert_log + alert_log_offset + size1, body, size2); + alert_log[alert_log_offset + size1 + size2] = '\n'; + alert_log_offset += size1 + size2 + 1; + } else { + if (w) + fprintf(stderr, "%s%s\n", prefix, body); + else + fprintf(stderr, "%s\n", body); + } + pthread_mutex_unlock(&lock); } -#endif diff --git a/runtime/debug.h b/runtime/debug.h index 035c0710..6edf6c59 100644 --- a/runtime/debug.h +++ b/runtime/debug.h @@ -9,41 +9,68 @@ struct global_state; struct __cilkrts_worker; -#define CILK_CHECK(g, cond, complain) \ - ((cond) ? (void)0 : cilk_die_internal(g, complain)) +#define CILK_CHECK(g, cond, complain, ...) \ + ((cond) ? 
(void)0 : cilk_die_internal(g, complain, __VA_ARGS__)) #ifndef ALERT_LVL -#define ALERT_LVL 0 +#define ALERT_LVL 0x3d03 #endif #define ALERT_NONE 0x0 -#define ALERT_FIBER 0x1 -#define ALERT_SYNC 0x2 -#define ALERT_SCHED 0x4 -#define ALERT_STEAL 0x8 -#define ALERT_EXCEPT 0x10 -#define ALERT_RETURN 0x20 -#define ALERT_BOOT 0x40 -#define ALERT_CFRAME 0x80 -#define ALERT_REDUCE 0x100 -#define ALERT_START 0x200 -#define ALERT_REDUCE_ID 0x400 +#define ALERT_FIBER 0x001 +#define ALERT_FIBER_SUMMARY 0x002 +#define ALERT_MEMORY 0x004 +#define ALERT_SYNC 0x010 +#define ALERT_SCHED 0x020 +#define ALERT_STEAL 0x040 +#define ALERT_RETURN 0x080 +#define ALERT_EXCEPT 0x100 +#define ALERT_CFRAME 0x200 +#define ALERT_REDUCE 0x400 +#define ALERT_REDUCE_ID 0x800 +#define ALERT_BOOT 0x1000 +#define ALERT_START 0x2000 +#define ALERT_CLOSURE 0x4000 extern CHEETAH_INTERNAL unsigned int alert_level; +#define ALERT_ENABLED(flag) (alert_level & (ALERT_LVL & ALERT_##flag)) + +#ifndef DEBUG_LVL +#define DEBUG_LVL 0xff +#endif + +#define DEBUG_MEMORY 0x01 +#define DEBUG_MEMORY_SLOW 0x02 +#define DEBUG_FIBER 0x04 +#define DEBUG_REDUCER 0x08 +extern CHEETAH_INTERNAL unsigned int debug_level; +#define DEBUG_ENABLED(flag) (debug_level & (DEBUG_LVL & DEBUG_##flag)) +#define DEBUG_ENABLED_STATIC(flag) (DEBUG_LVL & DEBUG_##flag) // Unused: compiler inlines the stack frame creation // #define CILK_STACKFRAME_MAGIC 0xCAFEBABE +CHEETAH_INTERNAL void set_alert_level(unsigned int); +CHEETAH_INTERNAL void set_debug_level(unsigned int); +CHEETAH_INTERNAL void flush_alert_log(); + +__attribute__((__format__(__printf__, 2, 3))) CHEETAH_INTERNAL_NORETURN void +cilkrts_bug(struct __cilkrts_worker *w, const char *fmt, ...); CHEETAH_INTERNAL_NORETURN -void cilkrts_bug(struct __cilkrts_worker *w, const char *fmt, ...); -CHEETAH_INTERNAL_NORETURN -void cilk_die_internal(struct global_state *const g, const char *complain); +void cilk_die_internal(struct global_state *const g, const char *fmt, ...); + +#if ALERT_LVL != 0 +__attribute__((__format__(__printf__, 3, 4))) void +cilkrts_alert(int lvl, struct __cilkrts_worker *w, const char *fmt, ...); +#pragma GCC diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" +#define cilkrts_alert(CODE, W, FMT, ...) \ + (alert_level & ((ALERT_##CODE) & ALERT_LVL)) \ + ? cilkrts_alert(ALERT_##CODE, W, FMT, ##__VA_ARGS__) \ + : (void)0 +#else +#define cilkrts_alert(lvl, fmt, ...) +#endif #if CILK_DEBUG -void cilkrts_alert(int lvl, struct __cilkrts_worker *w, const char *fmt, ...); -#define cilkrts_alert(LVL, W, FMT, ...) \ - (alert_level & (LVL)&ALERT_LVL) \ - ? cilkrts_alert(LVL, W, FMT, ##__VA_ARGS__) \ - : (void)0 #define WHEN_CILK_DEBUG(ex) ex @@ -56,12 +83,40 @@ CHEETAH_INTERNAL extern const char *const __cilkrts_assertion_failed; : cilkrts_bug(w, __cilkrts_assertion_failed, __FILE__, __LINE__, \ #ex)) +#define CILK_ASSERT_POINTER_EQUAL(w, P1, P2) \ + ({ void *_t1 = (P1), *_t2 = (P2); __builtin_expect(_t1 == _t2, 1) \ + ? (void)0 \ + : cilkrts_bug(w, "%s: %d: cilk_assertion failed: %s (%p) == %s (%p)", \ + __FILE__, __LINE__, #P1, _t1, #P2, _t2);}) + +#define CILK_ASSERT_ZERO(w, ex, FMT) \ + (__builtin_expect(!(ex), 1) \ + ? (void)0 \ + : cilkrts_bug(w, "%s: %d: cilk_assertion failed: %s (" FMT ") == 0", \ + __FILE__, __LINE__, #ex, ex)) + +#define CILK_ASSERT_INDEX_ZERO(w, LEFT, I, RIGHT, FMT) \ + (__builtin_expect(!(LEFT[I] RIGHT), 1) \ + ? 
(void)0 \ + : cilkrts_bug(w, \ + "%s: %d: cilk_assertion failed: %s[%u]%s = " FMT \ + " should be 0", \ + __FILE__, __LINE__, #LEFT, I, #RIGHT, LEFT[I] RIGHT)) + #define CILK_ASSERT_G(ex) \ (__builtin_expect((ex) != 0, 1) \ ? (void)0 \ : cilkrts_bug(NULL, __cilkrts_assertion_failed, __FILE__, __LINE__, \ #ex)) +#define CILK_ASSERT_G_LE(A, B, FMT) \ + (__builtin_expect(((A) <= (B)) != 0, 1) \ + ? (void)0 \ + : cilkrts_bug(NULL, \ + "%s: %d: cilk assertion failed: %s (" FMT \ + ") <= %s " FMT ")", \ + __FILE__, __LINE__, #A, A, #B, B)) + #define CILK_ABORT(w, msg) \ cilkrts_bug(w, __cilkrts_assertion_failed, __FILE__, __LINE__, msg) @@ -69,7 +124,6 @@ CHEETAH_INTERNAL extern const char *const __cilkrts_assertion_failed; cilkrts_bug(NULL, __cilkrts_assertion_failed_g, __FILE__, __LINE__, msg) #else -#define cilkrts_alert(lvl, fmt, ...) #define CILK_ASSERT(w, ex) #define CILK_ASSERT_G(ex) #define CILK_ABORT(w, msg) diff --git a/runtime/fiber-pool.c b/runtime/fiber-pool.c index 6fe8055a..a9dedeed 100644 --- a/runtime/fiber-pool.c +++ b/runtime/fiber-pool.c @@ -1,3 +1,4 @@ +#include /* PRIu32 */ #include #include @@ -5,6 +6,7 @@ #include "debug.h" #include "fiber.h" #include "global.h" +#include "local.h" #include "mutex.h" // Whent the pool becomes full (empty), free (allocate) this fraction @@ -35,46 +37,28 @@ // Private helper functions for maintaining pool stats //========================================================= -#if FIBER_STATS static void fiber_pool_stat_init(struct cilk_fiber_pool *pool) { pool->stats.in_use = 0; pool->stats.max_in_use = 0; pool->stats.max_free = 0; } -static void fiber_pool_stat_print(struct global_state *g) { +#define POOL_FMT "size %3u, %4d used %4d max used %4u max free" -#define HDR_DESC "%15s" -#define WORKER_HDR_DESC "%10s %3u:" // two char short compared to HDR_DESC -#define FIELD_STR_DESC "%10s" -#define FIELD_DESC "%10d" - - fprintf(stderr, "\nFIBER POOL STATS\n"); - fprintf(stderr, HDR_DESC, "Fiber stats:"); - fprintf(stderr, - FIELD_STR_DESC FIELD_STR_DESC FIELD_STR_DESC FIELD_STR_DESC "\n", - "in-use", "max in-use", "curr-free", "max-free"); - fprintf(stderr, "-------------------------------------------" - "-----------------------------\n"); - fprintf(stderr, HDR_DESC, "Global:"); - fprintf(stderr, FIELD_DESC FIELD_DESC FIELD_DESC FIELD_DESC "\n", - g->fiber_pool.stats.in_use, g->fiber_pool.stats.max_in_use, - g->fiber_pool.size, g->fiber_pool.stats.max_free); - - for (unsigned int i = 0; i < g->options.nproc; i++) { - __cilkrts_worker *w = g->workers[i]; - fprintf(stderr, WORKER_HDR_DESC, "Worker", w->self); - fprintf(stderr, FIELD_DESC FIELD_DESC FIELD_DESC FIELD_DESC "\n", - w->l->fiber_pool.stats.in_use, - w->l->fiber_pool.stats.max_in_use, w->l->fiber_pool.size, - w->l->fiber_pool.stats.max_free); - } +static void fiber_pool_stat_print_worker(__cilkrts_worker *w, void *data) { + FILE *fp = (FILE *)data; + fprintf(fp, "[W%02" PRIu32 "] " POOL_FMT "\n", w->self, + w->l->fiber_pool.size, w->l->fiber_pool.stats.in_use, + w->l->fiber_pool.stats.max_in_use, w->l->fiber_pool.stats.max_free); +} + +static void fiber_pool_stat_print(struct global_state *g) { + fprintf(stderr, "\nFIBER POOL STATS\n[G ] " POOL_FMT "\n", + g->fiber_pool.size, g->fiber_pool.stats.in_use, + g->fiber_pool.stats.max_in_use, g->fiber_pool.stats.max_free); + for_each_worker(g, &fiber_pool_stat_print_worker, stderr); fprintf(stderr, "\n"); } -#else -#define fiber_pool_stat_init(pool) -#define fiber_pool_stat_print(g) -#endif // FIBER_STATS 
//========================================================= // Private helper functions @@ -92,13 +76,9 @@ static void fiber_pool_free_batch(__cilkrts_worker *w, static void fiber_pool_init(struct cilk_fiber_pool *pool, size_t stacksize, unsigned int bufsize, struct cilk_fiber_pool *parent, int is_shared) { - if (is_shared) { - pool->lock = malloc(sizeof(*pool->lock)); - cilk_mutex_init(pool->lock); - pool->mutex_owner = NO_WORKER; - } else { - pool->lock = NULL; - } + cilk_mutex_init(&pool->lock); + pool->mutex_owner = NO_WORKER; + pool->shared = is_shared; pool->stack_size = stacksize; pool->parent = parent; pool->capacity = bufsize; @@ -108,13 +88,8 @@ static void fiber_pool_init(struct cilk_fiber_pool *pool, size_t stacksize, /* Helper function for destroying fiber pool */ static void fiber_pool_destroy(struct cilk_fiber_pool *pool) { - - if (pool->lock) { - CILK_ASSERT_G(pool->mutex_owner == NO_WORKER); - cilk_mutex_destroy(pool->lock); - free(pool->lock); - pool->lock = NULL; - } + CILK_ASSERT_G(pool->size == 0); + cilk_mutex_destroy(&pool->lock); free(pool->fibers); pool->parent = NULL; pool->fibers = NULL; @@ -122,29 +97,31 @@ static void fiber_pool_destroy(struct cilk_fiber_pool *pool) { static inline void fiber_pool_assert_ownership(__cilkrts_worker *w, struct cilk_fiber_pool *pool) { - CILK_ASSERT(w, pool->lock == NULL || pool->mutex_owner == w->self); + if (pool->shared) + CILK_ASSERT(w, pool->mutex_owner == w->self); } static inline void fiber_pool_assert_alienation(__cilkrts_worker *w, struct cilk_fiber_pool *pool) { - CILK_ASSERT(w, pool->lock == NULL || pool->mutex_owner != w->self); + if (pool->shared) + CILK_ASSERT(w, pool->mutex_owner != w->self); } static inline void fiber_pool_lock(__cilkrts_worker *w, struct cilk_fiber_pool *pool) { - if (pool->lock) { + if (pool->shared) { fiber_pool_assert_alienation(w, pool); - cilk_mutex_lock(pool->lock); + cilk_mutex_lock(&pool->lock); pool->mutex_owner = w->self; } } static inline void fiber_pool_unlock(__cilkrts_worker *w, struct cilk_fiber_pool *pool) { - if (pool->lock) { + if (pool->shared) { fiber_pool_assert_ownership(w, pool); pool->mutex_owner = NO_WORKER; - cilk_mutex_unlock(pool->lock); + cilk_mutex_unlock(&pool->lock); } } @@ -158,8 +135,13 @@ static void fiber_pool_increase_capacity(__cilkrts_worker *w, unsigned int new_size) { fiber_pool_assert_ownership(w, pool); + if (pool->capacity < new_size) { - pool->fibers = realloc(pool->fibers, new_size * sizeof(*pool->fibers)); + struct cilk_fiber **larger = + realloc(pool->fibers, new_size * sizeof(*pool->fibers)); + if (!larger) + CILK_ABORT(w, "out of fiber memory"); + pool->fibers = larger; pool->capacity = new_size; } } @@ -182,9 +164,12 @@ fiber_pool_decrease_capacity(__cilkrts_worker *w, struct cilk_fiber_pool *pool, CILK_ASSERT(w, pool->size == new_size); } if (pool->capacity > new_size) { - pool->fibers = (struct cilk_fiber **)realloc( + struct cilk_fiber **smaller = (struct cilk_fiber **)realloc( pool->fibers, new_size * sizeof(struct cilk_fiber *)); - pool->capacity = new_size; + if (smaller) { + pool->fibers = smaller; + pool->capacity = new_size; + } } } @@ -207,25 +192,22 @@ static void fiber_pool_allocate_batch(__cilkrts_worker *w, for (unsigned int i = 0; i < from_parent; i++) { pool->fibers[pool->size++] = parent->fibers[--parent->size]; } -#if FIBER_STATS // update parent pool stats before releasing the lock on it parent->stats.in_use += from_parent; if (parent->stats.in_use > parent->stats.max_in_use) { parent->stats.max_in_use = parent->stats.in_use; } 
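Note that fiber_pool_increase_capacity and fiber_pool_decrease_capacity now store realloc's result in a temporary before committing it, so a failed realloc no longer overwrites (and leaks) the old array. The idiom in isolation:

#include <stdlib.h>

// Grow an array without losing it if realloc fails.
int grow(int **arr, size_t new_count) {
    int *larger = realloc(*arr, new_count * sizeof **arr);
    if (!larger)
        return -1;  // *arr is still valid and unchanged
    *arr = larger;
    return 0;
}

On the growth path the pool aborts when memory is exhausted; on the shrink path a failed realloc is harmless, so the smaller capacity is simply not recorded.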
-#endif fiber_pool_unlock(w, parent); } - if (batch_size - from_parent) { // if we need more still + if (batch_size > from_parent) { // if we need more still for (unsigned int i = from_parent; i < batch_size; i++) { - pool->fibers[pool->size++] = cilk_fiber_allocate(w); + pool->fibers[pool->size++] = + cilk_fiber_allocate(w, pool->stack_size); } } -#if FIBER_STATS if (pool->size > pool->stats.max_free) { pool->stats.max_free = pool->size; } -#endif } /** @@ -251,12 +233,10 @@ static void fiber_pool_free_batch(__cilkrts_worker *w, parent->fibers[parent->size++] = pool->fibers[--pool->size]; } CILK_ASSERT(w, parent->size <= parent->capacity); -#if FIBER_STATS parent->stats.in_use -= to_parent; if (parent->size > parent->stats.max_free) { parent->stats.max_free = parent->size; } -#endif fiber_pool_unlock(w, parent); } if ((batch_size - to_parent) > 0) { // still need to free more @@ -265,7 +245,6 @@ static void fiber_pool_free_batch(__cilkrts_worker *w, cilk_fiber_deallocate(w, fiber); } } - CILK_ASSERT(w, pool->size >= 0); } //========================================================= @@ -287,15 +266,20 @@ void cilk_fiber_pool_global_init(global_state *g) { * stats and print them out (if FIBER_STATS is set) */ void cilk_fiber_pool_global_terminate(global_state *g) { - fiber_pool_stat_print(g); + struct cilk_fiber_pool *pool = &g->fiber_pool; + cilk_mutex_lock(&pool->lock); /* probably not needed */ + while (pool->size > 0) { + struct cilk_fiber *fiber = pool->fibers[--pool->size]; + cilk_fiber_deallocate_global(g, fiber); + } + cilk_mutex_unlock(&pool->lock); + if (ALERT_ENABLED(FIBER_SUMMARY)) + fiber_pool_stat_print(g); } /* Global fiber pool clean up. */ void cilk_fiber_pool_global_destroy(global_state *g) { - - struct cilk_fiber_pool *pool = &(g->fiber_pool); - CILK_ASSERT_G(pool->size == 0); // worker 0 should have freed everything - fiber_pool_destroy(pool); + fiber_pool_destroy(&g->fiber_pool); // worker 0 should have freed everything } /** @@ -319,28 +303,20 @@ void cilk_fiber_pool_per_worker_init(__cilkrts_worker *w) { * stats and print them out (if FIBER_STATS is set) */ void cilk_fiber_pool_per_worker_terminate(__cilkrts_worker *w) { - // nothing to do at the moment + struct cilk_fiber_pool *pool = &(w->l->fiber_pool); + while (pool->size > 0) { + unsigned index = --pool->size; + struct cilk_fiber *fiber = pool->fibers[index]; + pool->fibers[index] = NULL; + cilk_fiber_deallocate(w, fiber); + } } /* Per-worker fiber pool clean up. */ void cilk_fiber_pool_per_worker_destroy(__cilkrts_worker *w) { struct cilk_fiber_pool *pool = &(w->l->fiber_pool); - for (unsigned i = 0; i < pool->size; i++) { - cilk_fiber_deallocate(w, pool->fibers[i]); - } fiber_pool_destroy(pool); - - // ANGE FIXME: is there a better way to do this? 
- // worker 0 responsible for freeing fibers in global pool - // need to do this here, since we can't free fibers into internal malloc - // without having a worker pointer - if (w->self == 0) { - struct cilk_fiber_pool *parent = &(w->g->fiber_pool); - fiber_pool_lock(w, parent); - fiber_pool_free_batch(w, parent, parent->size); - fiber_pool_unlock(w, parent); - } } /** @@ -352,14 +328,11 @@ struct cilk_fiber *cilk_fiber_allocate_from_pool(__cilkrts_worker *w) { if (pool->size == 0) { fiber_pool_allocate_batch(w, pool, pool->capacity / BATCH_FRACTION); } - CILK_ASSERT(w, pool->size > 0); struct cilk_fiber *ret = pool->fibers[--pool->size]; -#if WHEN_FIBER_STATS pool->stats.in_use++; if (pool->stats.in_use > pool->stats.max_in_use) { pool->stats.max_in_use = pool->stats.in_use; } -#endif CILK_ASSERT(w, ret); return ret; } @@ -378,11 +351,10 @@ void cilk_fiber_deallocate_to_pool(__cilkrts_worker *w, } if (fiber_to_return) { pool->fibers[pool->size++] = fiber_to_return; -#if WHEN_FIBER_STATS pool->stats.in_use--; if (pool->size > pool->stats.max_free) { pool->stats.max_free = pool->size; } -#endif + fiber_to_return = NULL; } } diff --git a/runtime/fiber.c b/runtime/fiber.c index 592f3e0c..a3e890c7 100644 --- a/runtime/fiber.c +++ b/runtime/fiber.c @@ -10,6 +10,16 @@ #include "fiber.h" #include "init.h" +#include /* DEBUG */ + +struct cilk_fiber { + char *alloc_low; // first byte of mmap-ed region + char *stack_low; // lowest usable byte of stack + char *stack_high; // one byte above highest usable byte of stack + char *alloc_high; // last byte of mmap-ed region + __cilkrts_worker *owner; // worker using this fiber +}; + #ifndef MAP_GROWSDOWN /* MAP_GROWSDOWN is implied on BSD */ #define MAP_GROWSDOWN 0 @@ -19,6 +29,9 @@ #define MAP_STACK 0 #endif +#define LOW_GUARD_PAGES 1 +#define HIGH_GUARD_PAGES 1 + //=============================================================== // This file maintains fiber-related function that requires // the internals of a fiber. The management of the fiber pools @@ -35,6 +48,7 @@ static void make_stack(struct cilk_fiber *f, size_t stack_size) { const size_t page_size = 1U << page_shift; size_t stack_pages = (stack_size + page_size - 1) >> cheetah_page_shift; + stack_pages += LOW_GUARD_PAGES + HIGH_GUARD_PAGES; /* Stacks must be at least MIN_NUM_PAGES_PER_STACK pages, a count which includes two guard pages. */ @@ -43,95 +57,71 @@ static void make_stack(struct cilk_fiber *f, size_t stack_size) { } else if (stack_pages > MAX_NUM_PAGES_PER_STACK) { stack_pages = MAX_NUM_PAGES_PER_STACK; } - char *stack_high; - char *stack_low = (char *)mmap( + char *alloc_low = (char *)mmap( 0, stack_pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK | MAP_GROWSDOWN, -1, 0); - if (MAP_FAILED == stack_low) { + if (MAP_FAILED == alloc_low) { cilkrts_bug(NULL, "Cilk: stack mmap failed"); /* Currently unreached. TODO: Investigate more graceful error handling. */ - stack_low = NULL; - stack_high = NULL; - } else { - stack_high = stack_low + (stack_pages - 1) * page_size; - // mprotect guard pages. 
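The new make_stack reserves LOW_GUARD_PAGES and HIGH_GUARD_PAGES extra pages and mprotects both ends PROT_NONE, so running off either end of a fiber stack faults immediately instead of silently corrupting a neighboring mapping. A standalone sketch of that layout (error handling abbreviated):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void) {
    size_t page = (size_t)sysconf(_SC_PAGESIZE);
    size_t usable = 16 * page;
    // One extra page below and one above the usable region.
    char *lo = mmap(0, usable + 2 * page, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (lo == MAP_FAILED)
        return 1;
    mprotect(lo, page, PROT_NONE);                  // low guard page
    mprotect(lo + page + usable, page, PROT_NONE);  // high guard page
    char *stack_low = lo + page;            // first usable byte
    char *stack_high = lo + page + usable;  // one past last usable byte
    printf("stack [%p, %p)\n", (void *)stack_low, (void *)stack_high);
    munmap(lo, usable + 2 * page);
    return 0;
}
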
- mprotect(stack_low, page_size, PROT_NONE); - mprotect(stack_high, page_size, PROT_NONE); - stack_low += page_size; + f->alloc_low = NULL; + f->stack_low = NULL; + f->stack_high = NULL; + f->alloc_high = NULL; + return; } - // m_stack points to the first usable byte - // m_stack_base points after the last usable byte - f->m_stack = stack_low; - f->m_stack_base = stack_high; + char *alloc_high = alloc_low + stack_pages * page_size; + char *stack_high = alloc_high - page_size; + char *stack_low = alloc_low + page_size; + // mprotect guard pages. + mprotect(alloc_low, page_size, PROT_NONE); + mprotect(stack_high, page_size, PROT_NONE); + f->alloc_low = alloc_low; + f->stack_low = stack_low; + f->stack_high = stack_high; + f->alloc_high = alloc_high; + if (DEBUG_ENABLED(MEMORY_SLOW)) + memset(stack_low, 0x11, stack_size); } static void free_stack(struct cilk_fiber *f) { - if (f->m_stack) { - const size_t page_size = 1U << cheetah_page_shift; - char *padded_low = f->m_stack - page_size; - char *padded_high = f->m_stack_base + page_size; - if (munmap(padded_low, padded_high - padded_low) < 0) + if (f->alloc_low) { + if (DEBUG_ENABLED(MEMORY_SLOW)) + memset(f->stack_low, 0xbb, f->stack_high - f->stack_low); + if (munmap(f->alloc_low, f->alloc_high - f->alloc_low) < 0) cilkrts_bug(NULL, "Cilk: stack munmap failed"); + f->alloc_low = NULL; + f->stack_low = NULL; + f->stack_high = NULL; + f->alloc_high = NULL; } } static void fiber_init(struct cilk_fiber *fiber) { - fiber->m_stack = NULL; - fiber->m_stack_base = NULL; + fiber->alloc_low = NULL; + fiber->stack_low = NULL; + fiber->stack_high = NULL; + fiber->alloc_high = NULL; fiber->owner = NULL; } -/* - * Restore the floating point state that is stored in a stack frame at each - * spawn. This should be called each time a frame is resumed. OpenCilk - * only saves MXCSR. The 80387 status word is obsolete. - */ -static void sysdep_restore_fp_state(__cilkrts_stack_frame *sf) { - /* TODO: Find a way to do this only when using floating point. */ -#ifdef CHEETAH_SAVE_MXCSR -#if 1 - asm volatile("ldmxcsr %0" : : "m"(sf->mxcsr)); -#else - /* Disabled because LLVM's implementation is bad. */ - __builtin_ia32_ldmxcsr(sf->mxcsr); /* aka _mm_getcsr */ -#endif -#endif - -#ifdef __AVX__ - /* VZEROUPPER improves performance when mixing SSE and AVX code. - VZEROALL would work as well here because vector registers are - dead but takes about 10 cycles longer. */ - __builtin_ia32_vzeroupper(); -#endif -} //=============================================================== // Supported public functions //=============================================================== -void sysdep_save_fp_ctrl_state(__cilkrts_stack_frame *sf) { -#ifdef CHEETAH_SAVE_MXCSR -#if 1 - asm("stmxcsr %0" : "=m"(sf->mxcsr)); -#else - /* Disabled because LLVM's implementation is bad. */ - sf->mxcsr = __builtin_ia32_stmxcsr(); /* aka _mm_setcsr */ -#endif -#endif -} char *sysdep_reset_stack_for_resume(struct cilk_fiber *fiber, __cilkrts_stack_frame *sf) { CILK_ASSERT_G(fiber); - /* m_stack_base of the new fiber is aligned to a page size + /* stack_high of the new fiber is aligned to a page size boundary just after usable memory. */ /* JFC: This may need to be more than 256 if the stolen function has more than 256 bytes of outgoing arguments. I think Cilk++ looked at fp-sp in the stolen function. It should also exceed frame_size in init_fiber_run. */ size_t align = MAX_STACK_ALIGN > 256 ? 
MAX_STACK_ALIGN : 256; - char *sp = fiber->m_stack_base - align; + char *sp = fiber->stack_high - align; SP(sf) = sp; /* Debugging: make sure stack is accessible. */ @@ -142,7 +132,7 @@ char *sysdep_reset_stack_for_resume(struct cilk_fiber *fiber, CHEETAH_INTERNAL_NORETURN void sysdep_longjmp_to_sf(__cilkrts_stack_frame *sf) { - cilkrts_alert(ALERT_FIBER, sf->worker, "longjmp to sf, BP/SP/PC: %p/%p/%p", + cilkrts_alert(FIBER, sf->worker, "longjmp to sf, BP/SP/PC: %p/%p/%p", FP(sf), SP(sf), PC(sf)); #if defined CHEETAH_SAVE_MXCSR @@ -153,76 +143,53 @@ void sysdep_longjmp_to_sf(__cilkrts_stack_frame *sf) { __builtin_longjmp(sf->ctx, 1); } -CHEETAH_INTERNAL_NORETURN -void init_fiber_run(__cilkrts_worker *w, struct cilk_fiber *fiber, - __cilkrts_stack_frame *sf) { - // owner of fiber not set at the moment - cilkrts_alert(ALERT_FIBER, w, "(cilk_fiber_run) starting fiber %p", fiber); - - /* The if-else block is a longwinded way of changing the stack pointer - onto the fiber. A single assembly instruction would be sufficient - if the compiler understood it could not save stack addresses in - registers across the operation. - - TODO 1: It would probably be simpler to write a little assembly - language for each target. - TODO 2: A comment in the old Cilk code said longjmp should not - be used to return to a setjmp in the same function. */ - if (__builtin_setjmp(sf->ctx) == 0) { - size_t frame_size = (size_t)FP(sf) - (size_t)SP(sf); - /* This should not be needed if the original frame pointer - is aligned, but the old Cilk code aligned the stack and - doing it doesn't cost much. */ - frame_size = - (frame_size + MAX_STACK_ALIGN - 1) & ~(MAX_STACK_ALIGN - 1); - - /* The stack frame should be small. If it exceeeds 1000 bytes - there is probably a bug in the frame size calculation, e.g. - the compiler may have eliminated the frame pointer. */ - CILK_ASSERT_G(frame_size <= 1000); - - /* Switch to the fiber reserving frame_size bytes for this - function's stack. 
*/ - SP(sf) = fiber->m_stack_base - frame_size; - __builtin_longjmp(sf->ctx, 1); - } else { - // fiber is set up; now we longjmp into invoke_main; switch sched_stats - CILK_STOP_TIMING(w, INTERVAL_SCHED); - CILK_START_TIMING(w, INTERVAL_WORK); - invoke_main(); - } - CILK_ASSERT_G(0); // should never get here -} - -struct cilk_fiber *cilk_fiber_allocate(__cilkrts_worker *w) { - struct cilk_fiber *fiber = cilk_internal_malloc(w, sizeof(*fiber)); +struct cilk_fiber *cilk_fiber_allocate(__cilkrts_worker *w, size_t stacksize) { + struct cilk_fiber *fiber = + cilk_internal_malloc(w, sizeof(*fiber), IM_FIBER); fiber_init(fiber); - make_stack(fiber, DEFAULT_STACK_SIZE); // default ~1MB stack - cilkrts_alert(ALERT_FIBER, w, "Allocate fiber %p [%p--%p]", fiber, - fiber->m_stack_base, fiber->m_stack); + make_stack(fiber, stacksize); + cilkrts_alert(FIBER, w, "Allocate fiber %p [%p--%p]", (void *)fiber, + (void *)fiber->stack_low, (void *)fiber->stack_high); return fiber; } void cilk_fiber_deallocate(__cilkrts_worker *w, struct cilk_fiber *fiber) { - cilkrts_alert(ALERT_FIBER, w, "Deallocate fiber %p [%p--%p]", fiber, - fiber->m_stack_base, fiber->m_stack); + cilkrts_alert(FIBER, w, "Deallocate fiber %p [%p--%p]", (void *)fiber, + (void *)fiber->stack_low, (void *)fiber->stack_high); + if (DEBUG_ENABLED_STATIC(FIBER)) + CILK_ASSERT(w, !in_fiber(fiber, w->current_stack_frame)); free_stack(fiber); - cilk_internal_free(w, fiber, sizeof(*fiber)); + cilk_internal_free(w, fiber, sizeof(*fiber), IM_FIBER); +} + +void cilk_fiber_deallocate_global(struct global_state *g, + struct cilk_fiber *fiber) { + cilkrts_alert(FIBER, NULL, "Deallocate fiber %p [%p--%p]", (void *)fiber, + (void *)fiber->stack_low, (void *)fiber->stack_high); + free_stack(fiber); + cilk_internal_free_global(g, fiber, sizeof(*fiber), IM_FIBER); } struct cilk_fiber *cilk_main_fiber_allocate() { struct cilk_fiber *fiber = malloc(sizeof(*fiber)); fiber_init(fiber); make_stack(fiber, DEFAULT_STACK_SIZE); // default ~1MB stack - cilkrts_alert(ALERT_FIBER, NULL, "[?]: Allocate main fiber %p [%p--%p]", - fiber, fiber->m_stack_base, fiber->m_stack); + cilkrts_alert(FIBER, NULL, "[?]: Allocate main fiber %p [%p--%p]", + (void *)fiber, (void *)fiber->stack_low, + (void *)fiber->stack_high); return fiber; } void cilk_main_fiber_deallocate(struct cilk_fiber *fiber) { - cilkrts_alert(ALERT_FIBER, NULL, "[?]: Deallocate main fiber %p [%p--%p]", - fiber, fiber->m_stack_base, fiber->m_stack); + cilkrts_alert(FIBER, NULL, "[?]: Deallocate main fiber %p [%p--%p]", + (void *)fiber, (void *)fiber->stack_low, + (void *)fiber->stack_high); free_stack(fiber); free(fiber); } + +int in_fiber(struct cilk_fiber *fiber, void *p) { + void *low = fiber->stack_low, *high = fiber->stack_high; + return p >= low && p < high; +} diff --git a/runtime/fiber.h b/runtime/fiber.h index 077e47e8..454c1040 100644 --- a/runtime/fiber.h +++ b/runtime/fiber.h @@ -8,29 +8,21 @@ #include -#define FIBER_STATS CILK_STATS - -#if FIBER_STATS -#define WHEN_FIBER_STATS(ex) ex -#else -#define WHEN_FIBER_STATS(ex) -#endif - //=============================================================== // Struct defs used by fibers, fiber pools //=============================================================== // Statistics on active fibers that were allocated from this pool, struct fiber_pool_stats { - unsigned int - in_use; // number of fibers allocated - freed from / into the pool - unsigned int max_in_use; // high watermark for in_use - unsigned int - max_free; // high watermark for number of free fibers 
in the pool + int in_use; // number of fibers allocated - freed from / into the pool + int max_in_use; // high watermark for in_use + unsigned max_free; // high watermark for number of free fibers in the pool }; struct cilk_fiber_pool { - cilk_mutex *lock; // Mutual exclusion for pool operations + cilk_mutex lock; + worker_id mutex_owner; + int shared; size_t stack_size; // Size of stacks for fibers in this pool. struct cilk_fiber_pool *parent; // Parent pool. // If this pool is empty, get from parent @@ -38,35 +30,58 @@ struct cilk_fiber_pool { struct cilk_fiber **fibers; // Array of max_size fiber pointers unsigned int capacity; // Limit on number of fibers in pool unsigned int size; // Number of fibers currently in the pool - worker_id mutex_owner; - WHEN_FIBER_STATS(struct fiber_pool_stats stats); + struct fiber_pool_stats stats; }; -struct cilk_fiber { - char *m_stack; // lowest usable byte of stack - char *m_stack_base; // one byte above highest usable byte of stack - __cilkrts_worker *owner; // worker using this fiber -}; +struct cilk_fiber; // opaque type //=============================================================== // Supported functions //=============================================================== -static inline void cilk_fiber_set_owner(struct cilk_fiber *fiber, - __cilkrts_worker *owner) { - fiber->owner = owner; +static inline __attribute__((always_inline)) void +sysdep_save_fp_ctrl_state(__cilkrts_stack_frame *sf) { +#ifdef CHEETAH_SAVE_MXCSR +#if 1 + __asm__("stmxcsr %0" : "=m"(sf->mxcsr)); +#else + /* Disabled because LLVM's implementation is bad. */ + sf->mxcsr = __builtin_ia32_stmxcsr(); /* aka _mm_setcsr */ +#endif +#endif +} + +/* + * Restore the floating point state that is stored in a stack frame at each + * spawn. This should be called each time a frame is resumed. OpenCilk + * only saves MXCSR. The 80387 status word is obsolete. + */ +static inline +__attribute__((always_inline)) +void sysdep_restore_fp_state(__cilkrts_stack_frame *sf) { + /* TODO: Find a way to do this only when using floating point. */ +#ifdef CHEETAH_SAVE_MXCSR +#if 1 + __asm__ volatile("ldmxcsr %0" : : "m"(sf->mxcsr)); +#else + /* Disabled because LLVM's implementation is bad. */ + __builtin_ia32_ldmxcsr(sf->mxcsr); /* aka _mm_getcsr */ +#endif +#endif + +#ifdef __AVX__ + /* VZEROUPPER improves performance when mixing SSE and AVX code. + VZEROALL would work as well here because vector registers are + dead but takes about 10 cycles longer. 
*/ + __builtin_ia32_vzeroupper(); +#endif } -CHEETAH_INTERNAL -void sysdep_save_fp_ctrl_state(__cilkrts_stack_frame *sf); CHEETAH_INTERNAL char *sysdep_reset_stack_for_resume(struct cilk_fiber *fiber, __cilkrts_stack_frame *sf); CHEETAH_INTERNAL_NORETURN void sysdep_longjmp_to_sf(__cilkrts_stack_frame *sf); -CHEETAH_INTERNAL_NORETURN -void init_fiber_run(__cilkrts_worker *w, struct cilk_fiber *fiber, - __cilkrts_stack_frame *sf); CHEETAH_INTERNAL void cilk_fiber_pool_global_init(global_state *g); CHEETAH_INTERNAL void cilk_fiber_pool_global_terminate(global_state *g); @@ -77,9 +92,11 @@ CHEETAH_INTERNAL void cilk_fiber_pool_per_worker_destroy(__cilkrts_worker *w); // allocate / deallocate one fiber from / back to OS CHEETAH_INTERNAL -struct cilk_fiber *cilk_fiber_allocate(__cilkrts_worker *w); +struct cilk_fiber *cilk_fiber_allocate(__cilkrts_worker *w, size_t stacksize); CHEETAH_INTERNAL void cilk_fiber_deallocate(__cilkrts_worker *w, struct cilk_fiber *fiber); +CHEETAH_INTERNAL +void cilk_fiber_deallocate_global(global_state *, struct cilk_fiber *fiber); // allocate / deallocate fiber from / back to OS for the invoke-main CHEETAH_INTERNAL struct cilk_fiber *cilk_main_fiber_allocate(); @@ -92,4 +109,6 @@ CHEETAH_INTERNAL void cilk_fiber_deallocate_to_pool(__cilkrts_worker *w, struct cilk_fiber *fiber); +CHEETAH_INTERNAL int in_fiber(struct cilk_fiber *, void *); + #endif diff --git a/runtime/global.c b/runtime/global.c index e1448195..b277391a 100644 --- a/runtime/global.c +++ b/runtime/global.c @@ -1,4 +1,6 @@ +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif #include #include @@ -6,16 +8,25 @@ #include #include /* _SC_NPROCESSORS_ONLN */ -#include "cmdline.h" #include "debug.h" #include "global.h" #include "init.h" #include "readydeque.h" #include "reducer_impl.h" +global_state *default_cilkrts; + +extern CHEETAH_INTERNAL unsigned cilkg_nproc; + +static void set_alert_debug_level() { + /* Only the bits also set in ALERT_LVL are used. */ + set_alert_level(env_get_int("CILK_ALERT")); + /* Only the bits also set in DEBUG_LVL are used. */ + set_debug_level(env_get_int("CILK_DEBUG")); +} + static global_state *global_state_allocate() { - parse_environment(); /* sets alert level */ - cilkrts_alert(ALERT_BOOT, NULL, + cilkrts_alert(BOOT, NULL, "(global_state_init) Allocating global state"); global_state *g = (global_state *)cilk_aligned_alloc( __alignof(global_state), sizeof(global_state)); @@ -24,28 +35,69 @@ static global_state *global_state_allocate() { cilk_mutex_init(&g->im_lock); cilk_mutex_init(&g->print_lock); + // TODO: Convert to cilk_* equivalents + pthread_mutex_init(&g->cilkified_lock, NULL); + pthread_cond_init(&g->cilkified_cond_var, NULL); + pthread_mutex_init(&g->start_lock, NULL); + pthread_cond_init(&g->start_cond_var, NULL); + return g; } -global_state *global_state_init(int argc, char *argv[]) { - cilkrts_alert(ALERT_BOOT, NULL, - "(global_state_init) Initializing global state"); +// Methods for setting runtime options. +static void set_stacksize(global_state *g, size_t stacksize) { + // TODO: Verify that g has not yet been initialized. + CILK_ASSERT_G(!g->workers_started); + CILK_ASSERT_G(stacksize >= 16384); + CILK_ASSERT_G(stacksize <= 100 * 1024 * 1024); + g->options.stacksize = stacksize; +} -#ifdef DEBUG - setlinebuf(stderr); -#endif +static void set_deqdepth(global_state *g, unsigned int deqdepth) { + // TODO: Verify that g has not yet been initialized. 
+ CILK_ASSERT_G(!g->workers_started); + CILK_ASSERT_G(deqdepth >= 1); + CILK_ASSERT_G(deqdepth <= 99999); + g->options.deqdepth = deqdepth; +} - global_state *g = global_state_allocate(); +static void set_fiber_pool_cap(global_state *g, unsigned int fiber_pool_cap) { + // TODO: Verify that g has not yet been initialized. + CILK_ASSERT_G(!g->workers_started); + CILK_ASSERT_G(fiber_pool_cap >= 8); + CILK_ASSERT_G(fiber_pool_cap <= 999999); + g->options.fiber_pool_cap = fiber_pool_cap; +} - if (parse_command_line(&g->options, &argc, argv)) { - // user invoked --help; quit - free(g); - exit(0); - } +// not marked as static as it's called by __cilkrts_internal_set_nworkers +// used by Cilksan to set nworker to 1 +void set_nworkers(global_state *g, unsigned int nworkers) { + CILK_ASSERT_G(!g->workers_started); + CILK_ASSERT_G(nworkers <= g->options.nproc); + CILK_ASSERT_G(nworkers > g->exiting_worker); + g->nworkers = nworkers; +} - parse_environment(); +// not marked as static as it's called by __cilkrts_internal_set_force_reduce +// used by Cilksan to set force reduction +void set_force_reduce(global_state *g, unsigned int force_reduce) { + CILK_ASSERT_G(!g->workers_started); + g->options.force_reduce = force_reduce; +} - int proc_override = env_get_int("CILK_NWORKERS"); +// Set global RTS options from environment variables. +static void parse_rts_environment(global_state *g) { + size_t stacksize = env_get_int("CILK_STACKSIZE"); + if (stacksize > 0) + set_stacksize(g, stacksize); + unsigned int deqdepth = env_get_int("CILK_DEQDEPTH"); + if (deqdepth > 0) + set_deqdepth(g, deqdepth); + unsigned int fiber_pool_cap = env_get_int("CILK_FIBER_POOL"); + if (fiber_pool_cap > 0) + set_fiber_pool_cap(g, fiber_pool_cap); + + long proc_override = env_get_int("CILK_NWORKERS"); if (g->options.nproc == 0) { // use the number of cores online right now int available_cores = 0; @@ -78,8 +130,8 @@ global_state *global_state_init(int argc, char *argv[]) { // an environment variable indicating whether we are running a bench // with cilksan and should check for reducer race. g->options.force_reduce = env_get_int("CILK_FORCE_REDUCE"); - if(g->options.force_reduce != 0) { - if(proc_override != 1) { + if (g->options.force_reduce != 0) { + if (proc_override != 1) { printf("CILK_FORCE_REDUCE is set to non-zero\n" "but CILK_NWORKERS is not set to 1. 
Running normally.\n"); g->options.force_reduce = 0; @@ -91,14 +143,35 @@ global_state *global_state_init(int argc, char *argv[]) { } fflush(stdout); } +} + +global_state *global_state_init(int argc, char *argv[]) { + cilkrts_alert(BOOT, NULL, + "(global_state_init) Initializing global state"); + +#ifdef DEBUG + setlinebuf(stderr); +#endif + + set_alert_debug_level(); // alert / debug used by global_state_allocate + global_state *g = global_state_allocate(); + + g->options = (struct rts_options)DEFAULT_OPTIONS; + parse_rts_environment(g); unsigned active_size = g->options.nproc; CILK_ASSERT_G(active_size > 0); + g->nworkers = active_size; cilkg_nproc = active_size; - g->invoke_main_initialized = false; + g->workers_started = false; + g->root_closure_initialized = false; atomic_store_explicit(&g->start, 0, memory_order_relaxed); atomic_store_explicit(&g->done, 0, memory_order_relaxed); + atomic_store_explicit(&g->cilkified, 0, memory_order_relaxed); + g->terminate = false; + g->exiting_worker = 0; + atomic_store_explicit(&g->reducer_map_count, 0, memory_order_relaxed); g->workers = (__cilkrts_worker **)calloc(active_size, sizeof(__cilkrts_worker *)); @@ -109,34 +182,20 @@ global_state *global_state_init(int argc, char *argv[]) { cilk_fiber_pool_global_init(g); cilk_global_sched_stats_init(&(g->stats)); - g->cilk_main_argc = argc; - g->cilk_main_args = argv; - g->id_manager = NULL; - /* This must match the compiler */ - uint32_t hash = __CILKRTS_ABI_VERSION; - - hash *= 13; - hash += offsetof(struct __cilkrts_stack_frame, worker); - hash *= 13; - hash += offsetof(struct __cilkrts_stack_frame, ctx); - hash *= 13; - hash += offsetof(struct __cilkrts_stack_frame, magic); - hash *= 13; - hash += offsetof(struct __cilkrts_stack_frame, flags); - hash *= 13; - hash += offsetof(struct __cilkrts_stack_frame, call_parent); -#if defined __i386__ || defined __x86_64__ - hash *= 13; -#ifdef __SSE__ - hash += offsetof(struct __cilkrts_stack_frame, mxcsr); -#else - hash += offsetof(struct __cilkrts_stack_frame, reserved1); -#endif -#endif + return g; +} - g->frame_magic = hash; +void for_each_worker(global_state *g, void (*fn)(__cilkrts_worker *, void *), + void *data) { + for (unsigned i = 0; i < g->options.nproc; ++i) + fn(g->workers[i], data); +} - return g; +void for_each_worker_rev(global_state *g, + void (*fn)(__cilkrts_worker *, void *), void *data) { + unsigned i = g->options.nproc; + while (i-- > 0) + fn(g->workers[i], data); } diff --git a/runtime/global.h b/runtime/global.h index 0ddc4ee1..fc982652 100644 --- a/runtime/global.h +++ b/runtime/global.h @@ -9,7 +9,7 @@ #include "debug.h" #include "fiber.h" -#include "internal-malloc.h" +#include "internal-malloc-impl.h" #include "mutex.h" #include "rts-config.h" #include "sched_stats.h" @@ -32,46 +32,70 @@ struct Closure; // clang-format on struct rts_options { - size_t stacksize; - unsigned int nproc; + size_t stacksize; /* can be set via env variable CILK_STACKSIZE */ + unsigned int nproc; /* can be set via env variable CILK_NWORKERS */ unsigned int reducer_cap; - unsigned int deqdepth; - unsigned int fiber_pool_cap; - unsigned int force_reduce; + unsigned int deqdepth; /* can be set via env variable CILK_DEQDEPTH */ + unsigned int fiber_pool_cap; /* can be set via env variable CILK_FIBER_POOL */ + unsigned int force_reduce; /* can be set via env variable CILK_FORCE_REDUCE */ }; struct global_state { /* globally-visible options (read-only after init) */ struct rts_options options; + unsigned int nworkers; /* size of next 3 arrays */ + struct 
__cilkrts_worker **workers; /* dynamically-allocated array of deques, one per processor */ struct ReadyDeque *deques; - struct __cilkrts_worker **workers; pthread_t *threads; - struct Closure *invoke_main; + struct Closure *root_closure; struct cilk_fiber_pool fiber_pool __attribute__((aligned(CILK_CACHE_LINE))); struct global_im_pool im_pool __attribute__((aligned(CILK_CACHE_LINE))); struct cilk_im_desc im_desc __attribute__((aligned(CILK_CACHE_LINE))); cilk_mutex im_lock; // lock for accessing global im_desc - uint32_t frame_magic; - - volatile bool invoke_main_initialized; + volatile bool workers_started; + volatile bool root_closure_initialized; volatile atomic_bool start; volatile atomic_bool done; - volatile atomic_int cilk_main_return; + volatile atomic_bool cilkified; + volatile bool terminate; + volatile worker_id exiting_worker; + volatile atomic_uint reducer_map_count; cilk_mutex print_lock; // global lock for printing messages - int cilk_main_argc; - char **cilk_main_args; + pthread_mutex_t cilkified_lock; + pthread_cond_t cilkified_cond_var; + pthread_mutex_t start_lock; + pthread_cond_t start_cond_var; struct reducer_id_manager *id_manager; /* null while Cilk is running */ struct global_sched_stats stats; }; +extern global_state *default_cilkrts; + +CHEETAH_INTERNAL void set_nworkers(global_state *g, unsigned int nworkers); +CHEETAH_INTERNAL void set_force_reduce(global_state *g, + unsigned int force_reduce); CHEETAH_INTERNAL global_state *global_state_init(int argc, char *argv[]); +CHEETAH_INTERNAL void for_each_worker(global_state *, + void (*)(__cilkrts_worker *, void *), + void *data); +CHEETAH_INTERNAL void for_each_worker_rev(global_state *, + void (*)(__cilkrts_worker *, void *), + void *data); + +// util functions used by both init.c and global.c +inline static long env_get_int(char const *var) { + const char *envstr = getenv(var); + if (envstr) + return strtol(envstr, NULL, 0); + return 0; +} #endif /* _CILK_GLOBAL_H */ diff --git a/runtime/init.c b/runtime/init.c index 8b960bc1..4b36ca57 100644 --- a/runtime/init.c +++ b/runtime/init.c @@ -1,4 +1,6 @@ +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif #include #include #include @@ -21,6 +23,7 @@ #include "fiber.h" #include "global.h" #include "init.h" +#include "local.h" #include "readydeque.h" #include "sched_stats.h" #include "scheduler.h" @@ -34,22 +37,10 @@ extern void cleanup_invoke_main(Closure *invoke_main); typedef cpuset_t cpu_set_t; #endif -long env_get_int(char const *var) { - const char *envstr = getenv(var); - if (envstr) - return strtol(envstr, NULL, 0); - return 0; -} - -void parse_environment() { - // ANGE: I don't think we should expose this ... 
-    // alert_level = env_get_int("CILK_ALERT");
-}
-
 static local_state *worker_local_init(global_state *g) {
-    local_state *l = (local_state *)malloc(sizeof(local_state));
-    l->shadow_stack = (__cilkrts_stack_frame **)malloc(
-        g->options.deqdepth * sizeof(struct __cilkrts_stack_frame *));
+    local_state *l = (local_state *)calloc(1, sizeof(local_state));
+    l->shadow_stack = (__cilkrts_stack_frame **)calloc(
+        g->options.deqdepth, sizeof(struct __cilkrts_stack_frame *));
     for (int i = 0; i < JMPBUF_SIZE; i++) {
         l->rts_ctx[i] = NULL;
     }
@@ -64,7 +55,7 @@
 }

 static void deques_init(global_state *g) {
-    cilkrts_alert(ALERT_BOOT, NULL, "(deques_init) Initializing deques");
+    cilkrts_alert(BOOT, NULL, "(deques_init) Initializing deques");
     for (unsigned int i = 0; i < g->options.nproc; i++) {
         g->deques[i].top = NULL;
         g->deques[i].bottom = NULL;
@@ -74,10 +65,9 @@
 }

 static void workers_init(global_state *g) {
-    cilkrts_alert(ALERT_BOOT, NULL, "(workers_init) Initializing workers");
+    cilkrts_alert(BOOT, NULL, "(workers_init) Initializing workers");
     for (unsigned int i = 0; i < g->options.nproc; i++) {
-        cilkrts_alert(ALERT_BOOT, NULL, "(workers_init) Initializing worker %u",
-                      i);
+        cilkrts_alert(BOOT, NULL, "(workers_init) Initializing worker %u", i);
         __cilkrts_worker *w = (__cilkrts_worker *)cilk_aligned_alloc(
             __alignof__(__cilkrts_worker), sizeof(__cilkrts_worker));
         w->self = i;
@@ -100,33 +90,57 @@ static void workers_init(global_state *g) {

 static void *scheduler_thread_proc(void *arg) {
     __cilkrts_worker *w = (__cilkrts_worker *)arg;
-    cilkrts_alert(ALERT_BOOT, w, "scheduler_thread_proc");
+    cilkrts_alert(BOOT, w, "scheduler_thread_proc");
     __cilkrts_set_tls_worker(w);

     worker_id self = w->self;

-    /* This is a simple way to give the first thread a head start
-       so other threads don't spin waiting for it. */
-
-    int delay = 1 + self;
+    do {
+        // Wait for g->start == 1 to start executing the work-stealing loop.  We
+        // use a condition variable to wait on g->start, because this approach
+        // seems to result in better performance.
+        pthread_mutex_lock(&w->g->start_lock);
+        while (!atomic_load_explicit(&w->g->start, memory_order_acquire)) {
+            pthread_cond_wait(&w->g->start_cond_var, &w->g->start_lock);
+        }
+        pthread_mutex_unlock(&w->g->start_lock);

-    while (!atomic_load_explicit(&w->g->start, memory_order_acquire)) {
-        usleep(delay);
-        if (delay < 64) {
-            delay *= 2;
+        // Check if we should exit this scheduling function.
+        if (w->g->terminate) {
+            return 0;
         }
-    }

-    /* TODO: Maybe import reducers here?  They must be imported
-       before user code runs. */
+        /* TODO: Maybe import reducers here?  They must be imported
+           before user code runs. */
+
+        // Start the new Cilkified region using the last worker that finished a
+        // Cilkified region.  This approach ensures that the new Cilkified
+        // region starts on an available worker with the worker state that was
+        // updated by any operations that occurred outside of Cilkified regions.
+        // Such operations might, for example, have updated the left-most view
+        // of a reducer.
+        if (self == w->g->exiting_worker) {
+            worker_scheduler(w, w->g->root_closure);
+        } else {
+            worker_scheduler(w, NULL);
+        }

-    if (self == 0) {
-        worker_scheduler(w, w->g->invoke_main);
-    } else {
-        worker_scheduler(w, NULL);
-    }
+        // At this point, some worker will have finished the Cilkified region,
+        // meaning it recorded its ID in g->exiting_worker and set g->done = 1.
+        // That worker's state accurately reflects the execution of the
+        // Cilkified region, including all updates to reducers.  Wait for that
+        // worker to exit the work-stealing loop, and use it to wake up the
+        // original Cilkifying thread.
+        if (self == w->g->exiting_worker) {
+            // Mark the computation as no longer cilkified, to signal the thread
+            // that originally cilkified the execution.
+            pthread_mutex_lock(&(w->g->cilkified_lock));
+            atomic_store_explicit(&w->g->cilkified, 0, memory_order_release);
+            pthread_cond_signal(&w->g->cilkified_cond_var);
+            pthread_mutex_unlock(&(w->g->cilkified_lock));
+        }

-    return 0;
+    } while (true);
 }

 #ifdef CPU_SETSIZE
@@ -179,11 +193,12 @@ static void threads_init(global_state *g) {
             break;
         }
 #endif
-    int n_threads = g->options.nproc;
+    int n_threads = g->nworkers;
+    CILK_ASSERT_G(n_threads > 0);

     /* TODO: Apple supports thread affinity using a different interface. */

-    cilkrts_alert(ALERT_BOOT, NULL, "(threads_init) Setting up threads");
+    cilkrts_alert(BOOT, NULL, "(threads_init) Setting up threads");

 #ifdef CPU_SETSIZE
     /* Three cases: core count at least twice worker count, allocate
@@ -224,8 +239,8 @@ static void threads_init(global_state *g) {
                 ++cpu;
             }

-            cilkrts_alert(ALERT_BOOT, NULL, "Bind worker %u to core %d of %d",
-                          w, cpu, available_cores);
+            cilkrts_alert(BOOT, NULL, "Bind worker %u to core %d of %d", w, cpu,
+                          available_cores);

             CPU_CLR(cpu, &process_mask);

             cpu_set_t worker_mask;
@@ -234,8 +249,7 @@ static void threads_init(global_state *g) {
             int off;
             for (off = 1; off < group_size; ++off) {
                 move_bit(cpu + off * step_in, &worker_mask, &process_mask);
-                cilkrts_alert(ALERT_BOOT, NULL,
-                              "Bind worker %u to core %d of %d", w,
+                cilkrts_alert(BOOT, NULL, "Bind worker %u to core %d of %d", w,
                               cpu + off * step_in, available_cores);
             }
             cpu += step_out;
@@ -249,35 +263,199 @@ static void threads_init(global_state *g) {
         usleep(10);
 }

-global_state *__cilkrts_init(int argc, char *argv[]) {
-    cilkrts_alert(ALERT_BOOT, NULL, "(__cilkrts_init)");
+global_state *__cilkrts_startup(int argc, char *argv[]) {
+    cilkrts_alert(BOOT, NULL, "(__cilkrts_startup) argc %d", argc);
     global_state *g = global_state_init(argc, argv);
     reducers_init(g);
     __cilkrts_init_tls_variables();
     workers_init(g);
     deques_init(g);
-    reducers_import(g, g->workers[0]);
-    threads_init(g);
+    CILK_ASSERT_G(0 == g->exiting_worker);
+    reducers_import(g, g->workers[g->exiting_worker]);
+
+    // Create the root closure and a fiber to go with it.  Use worker 0 to
+    // allocate the closure and fiber.
+    Closure *t = Closure_create(g->workers[g->exiting_worker]);
+    struct cilk_fiber *fiber = cilk_fiber_allocate(
+        g->workers[g->exiting_worker], g->options.stacksize);
+    t->fiber = fiber;
+    g->root_closure = t;

     return g;
 }

+// Global constructor for starting up the default cilkrts.
+__attribute__((constructor)) void __default_cilkrts_startup() {
+    default_cilkrts = __cilkrts_startup(0, NULL);
+
+    for (unsigned i = 0; i < cilkrts_callbacks.last_init; ++i)
+        cilkrts_callbacks.init[i]();
+
+    /* Any attempt to register more initializers should fail. */
+    cilkrts_callbacks.after_init = true;
+}
+
+void __cilkrts_internal_set_nworkers(unsigned int nworkers) {
+    set_nworkers(default_cilkrts, nworkers);
+}
+
+void __cilkrts_internal_set_force_reduce(unsigned int force_reduce) {
+    set_force_reduce(default_cilkrts, force_reduce);
+}
+
+// Start the Cilk workers in g, for example, by creating their underlying
+// Pthreads.
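+// Each Pthread created by threads_init runs scheduler_thread_proc (above) and
+// parks on g->start_cond_var.  Roughly, each Cilkified region then follows the
+// handshake sketched below, using only the functions defined in this file:
+//
+//     invoke_cilkified_root(g, sf);  // Cilkifying thread: cilkified = 1,
+//                                    // done = 0, start = 1, broadcast
+//     wait_until_cilk_done(g);       // Cilkifying thread: sleep on cilkified
+//     ...                            // workers: run worker_scheduler()
+//     exit_cilkified_root(g, sf);    // some worker: start = 0, done = 1
+//
+// after which the exiting worker signals cilkified_cond_var (see
+// scheduler_thread_proc) to wake the Cilkifying thread.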
+static void __cilkrts_start_workers(global_state *g) {
+    threads_init(g);
+    g->workers_started = true;
+}
+
+// Stop the Cilk workers in g, for example, by joining their underlying Pthreads.
+static void __cilkrts_stop_workers(global_state *g) {
+    CILK_ASSERT_G(!atomic_load_explicit(&g->start, memory_order_acquire));
+    CILK_ASSERT_G(CLOSURE_READY != g->root_closure->status);
+
+    // Set g->start and g->terminate, to allow the workers to exit their
+    // outermost scheduling loop.  Wake up any workers waiting on g->start.
+    g->terminate = true;
+    pthread_mutex_lock(&(g->start_lock));
+    atomic_store_explicit(&g->start, 1, memory_order_release);
+    pthread_cond_broadcast(&g->start_cond_var);
+    pthread_mutex_unlock(&(g->start_lock));
+
+    // Join the worker pthreads
+    for (unsigned int i = 0; i < g->nworkers; i++) {
+        int status = pthread_join(g->threads[i], NULL);
+        if (status != 0)
+            cilkrts_bug(NULL, "Cilk runtime error: thread join (%u) failed: %d",
+                        i, status);
+    }
+    cilkrts_alert(BOOT, NULL, "(threads_join) All workers joined!");
+    g->workers_started = false;
+}
+
+// Set up runtime structures to start a new Cilkified region.  Executed by the
+// Cilkifying thread in cilkify().
+void invoke_cilkified_root(global_state *g, __cilkrts_stack_frame *sf) {
+    CILK_ASSERT_G(!__cilkrts_get_tls_worker());
+
+    // Start the workers if necessary
+    if (!g->workers_started)
+        __cilkrts_start_workers(g);
+
+    // Mark the root closure as not initialized
+    g->root_closure_initialized = false;
+
+    // Mark the root closure as ready
+    Closure_make_ready(g->root_closure);
+
+    // Set up the stack pointer to point at the root closure's fiber.
+    void *new_rsp =
+        (void *)sysdep_reset_stack_for_resume(g->root_closure->fiber, sf);
+    USE_UNUSED(new_rsp);
+    CILK_ASSERT_G(SP(sf) == new_rsp);
+
+    // Mark that this root frame is last (meaning, at the top of the stack)
+    sf->flags |= CILK_FRAME_LAST;
+    // Mark this frame as stolen, to maintain invariants in the scheduler
+    __cilkrts_set_stolen(sf);
+
+    // Associate sf with this root closure
+    g->root_closure->frame = sf;
+
+    // Now we kick off execution of the Cilkified region by setting appropriate
+    // flags:
+
+    // Set g->cilkified = 1, so the Cilkifying thread will wait for the
+    // Cilkified region to finish.
+    atomic_store_explicit(&g->cilkified, 1, memory_order_release);
+    // Set g->done = 0, so Cilk workers will continue trying to steal.
+    atomic_store_explicit(&g->done, 0, memory_order_release);
+    // Set g->start = 1 to unleash workers to enter the work-stealing loop.
+    // Wake up any workers waiting for this flag.
+    pthread_mutex_lock(&(g->start_lock));
+    atomic_store_explicit(&g->start, 1, memory_order_release);
+    pthread_cond_broadcast(&g->start_cond_var);
+    pthread_mutex_unlock(&(g->start_lock));
+}
+
+// Block until signaled that the Cilkified region is done.  Executed by the
+// Cilkifying thread.
+void wait_until_cilk_done(global_state *g) {
+    // Wait on g->cilkified to be set to 0, indicating the end of the Cilkified
+    // region.  We use a condition variable to wait on g->cilkified, because
+    // this approach seems to result in better performance.
+
+    // TODO: Convert pthread_mutex_lock, pthread_mutex_unlock, and
+    // pthread_cond_wait to cilk_* equivalents.
+    pthread_mutex_lock(&(g->cilkified_lock));
+
+    // There may be a *very unlikely* scenario where the Cilk computation has
+    // already been completed before even starting to wait.  In that case, do
+    // not wait and continue directly.  Also handle spurious wakeups with a
+    // 'while' instead of an 'if'.
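+    // The matching lock-protected release store of g->cilkified is performed
+    // in scheduler_thread_proc by the exiting worker, so observing 0 here
+    // implies that the Cilkified region's effects are visible to this thread.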
+ while (atomic_load_explicit(&g->cilkified, memory_order_acquire)) { + pthread_cond_wait(&(g->cilkified_cond_var), &(g->cilkified_lock)); + } + + pthread_mutex_unlock(&(g->cilkified_lock)); +} + +// Finish the execution of a Cilkified region. Executed by a worker in g. +void exit_cilkified_root(global_state *g, __cilkrts_stack_frame *sf) { + __cilkrts_worker *w = sf->worker; + + // Record this worker as the exiting worker. We keep track of this exiting + // worker so that code outside of Cilkified regions can use this worker's + // state, specifically, its reducer_map. We make sure to do this before + // setting done, so that other workers will properly observe the new + // exiting_worker. + g->exiting_worker = w->self; + + // Mark the computation as done. Also set start to false, so workers who + // exit the work-stealing loop will return to waiting for the start of the + // next Cilkified region. + atomic_store_explicit(&g->start, 0, memory_order_release); + atomic_store_explicit(&g->done, 1, memory_order_release); + + // Clear this worker's deque. Nobody can successfully steal from this deque + // at this point, because head == tail, but we still want any subsequent + // Cilkified region to start with an empty deque. + g->deques[w->self].bottom = (Closure *)NULL; + g->deques[w->self].top = (Closure *)NULL; + WHEN_CILK_DEBUG(g->root_closure->owner_ready_deque = NO_WORKER); + + // Clear the flags in sf. This routine runs before leave_frame in a Cilk + // function, but leave_frame is executed conditionally in Cilk functions + // based on whether sf->flags == 0. Clearing sf->flags ensures that the + // Cilkifying thread does not try to execute leave_frame. + CILK_ASSERT(w, __cilkrts_synced(sf)); + sf->flags = 0; + + // done; go back to runtime + longjmp_to_runtime(w); +} + static void global_state_terminate(global_state *g) { - cilk_fiber_pool_global_terminate(g); + cilk_fiber_pool_global_terminate(g); /* before malloc terminate */ cilk_internal_malloc_global_terminate(g); cilk_sched_stats_print(g); } static void global_state_deinit(global_state *g) { - cilkrts_alert(ALERT_BOOT, NULL, - "(global_state_deinit) Clean up global state"); + cilkrts_alert(BOOT, NULL, "(global_state_deinit) Clean up global state"); - cleanup_invoke_main(g->invoke_main); cilk_fiber_pool_global_destroy(g); cilk_internal_malloc_global_destroy(g); // internal malloc last cilk_mutex_destroy(&(g->print_lock)); + // TODO: Convert to cilk_* equivalents + pthread_mutex_destroy(&g->cilkified_lock); + pthread_cond_destroy(&g->cilkified_cond_var); + pthread_mutex_destroy(&g->start_lock); + pthread_cond_destroy(&g->start_cond_var); free(g->workers); g->workers = NULL; + g->nworkers = 0; free(g->deques); g->deques = NULL; free(g->threads); @@ -288,36 +466,57 @@ static void global_state_deinit(global_state *g) { } static void deques_deinit(global_state *g) { - cilkrts_alert(ALERT_BOOT, NULL, "(deques_deinit) Clean up deques"); + cilkrts_alert(BOOT, NULL, "(deques_deinit) Clean up deques"); for (unsigned int i = 0; i < g->options.nproc; i++) { CILK_ASSERT_G(g->deques[i].mutex_owner == NO_WORKER); cilk_mutex_destroy(&(g->deques[i].mutex)); } } +static void worker_terminate(__cilkrts_worker *w, void *data) { + cilk_fiber_pool_per_worker_terminate(w); + cilkred_map *rm = w->reducer_map; + w->reducer_map = NULL; + // Workers can have NULL reducer maps now. 
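+    // (For example, a worker that never needed a reducer map during the last
+    // Cilkified region may leave rm == NULL, hence the check below.)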
+ if (rm) { + cilkred_map_destroy_map(w, rm); + } + cilk_internal_malloc_per_worker_terminate(w); // internal malloc last +} + static void workers_terminate(global_state *g) { - for (unsigned int i = 0; i < g->options.nproc; i++) { - __cilkrts_worker *w = g->workers[i]; - cilk_fiber_pool_per_worker_terminate(w); - cilk_internal_malloc_per_worker_terminate(w); // internal malloc last + for_each_worker_rev(g, worker_terminate, NULL); +} + +static void sum_allocations(__cilkrts_worker *w, void *data) { + long *counts = (long *)data; + local_state *l = w->l; + for (int i = 0; i < NUM_BUCKETS; ++i) { + counts[i] += l->im_desc.buckets[i].allocated; } } +static void wrap_fiber_pool_destroy(__cilkrts_worker *w, void *data) { + CILK_ASSERT(w, w->l->fiber_to_free == NULL); + cilk_fiber_pool_per_worker_destroy(w); +} + static void workers_deinit(global_state *g) { - cilkrts_alert(ALERT_BOOT, NULL, "(workers_deinit) Clean up workers"); - for (unsigned int i = 0; i < g->options.nproc; i++) { - __cilkrts_worker *w = g->workers[i]; - g->workers[i] = NULL; - CILK_ASSERT(w, w->l->fiber_to_free == NULL); + cilkrts_alert(BOOT, NULL, "(workers_deinit) Clean up workers"); - cilkred_map *rm = w->reducer_map; - w->reducer_map = NULL; - // Workers can have NULL reducer maps now. - if (rm) { - cilkred_map_destroy_map(w, rm); - } + long allocations[NUM_BUCKETS] = {0, 0, 0, 0}; + + for_each_worker_rev(g, sum_allocations, allocations); - cilk_fiber_pool_per_worker_destroy(w); + if (DEBUG_ENABLED(MEMORY)) { + for (int i = 0; i < NUM_BUCKETS; ++i) + CILK_ASSERT_INDEX_ZERO(NULL, allocations, i, , "%ld"); + } + + unsigned i = g->options.nproc; + while (i-- > 0) { + __cilkrts_worker *w = g->workers[i]; + g->workers[i] = NULL; cilk_internal_malloc_per_worker_destroy(w); // internal malloc last free(w->l->shadow_stack); w->l->shadow_stack = NULL; @@ -325,13 +524,28 @@ static void workers_deinit(global_state *g) { w->l = NULL; free(w); } + /* TODO: Export initial reducer map */ } -CHEETAH_INTERNAL -void __cilkrts_cleanup(global_state *g) { +CHEETAH_INTERNAL void __cilkrts_shutdown(global_state *g) { + // If the workers are still running, stop them now. + if (g->workers_started) + __cilkrts_stop_workers(g); + + for (unsigned i = cilkrts_callbacks.last_exit; i > 0;) + cilkrts_callbacks.exit[--i](); + + // Deallocate the root closure and its fiber + cilk_fiber_deallocate_global(g, g->root_closure->fiber); + Closure_destroy_global(g, g->root_closure); + + // Cleanup the global state reducers_deinit(g); workers_terminate(g); + flush_alert_log(); + /* This needs to be before global_state_terminate for good stats. */ + for_each_worker(g, wrap_fiber_pool_destroy, NULL); // global_state_terminate collects and prints out stats, and thus // should occur *BEFORE* worker_deinit, because worker_deinit // deinitializes worker-related data structures which may @@ -340,8 +554,12 @@ void __cilkrts_cleanup(global_state *g) { // pools are not freed until workers_deinit. Thus the stats included on // internal-malloc that does not include all the free fibers. 
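+    // The per-worker fiber pools were destroyed above, via
+    // wrap_fiber_pool_destroy, so that their fibers are accounted for in the
+    // statistics collected below.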
global_state_terminate(g); - workers_deinit(g); deques_deinit(g); global_state_deinit(g); } + +// Global destructor for shutting down the default cilkrts +__attribute__((destructor)) void __default_cilkrts_shutdown() { + __cilkrts_shutdown(default_cilkrts); +} diff --git a/runtime/init.h b/runtime/init.h index ebfb8818..8bf65d7b 100644 --- a/runtime/init.h +++ b/runtime/init.h @@ -3,12 +3,13 @@ #include "cilk-internal.h" -int cilk_main(int argc, char *argv[]); -CHEETAH_INTERNAL global_state *__cilkrts_init(int argc, char *argv[]); -CHEETAH_INTERNAL void __cilkrts_cleanup(global_state *); -CHEETAH_INTERNAL_NORETURN void invoke_main(); -CHEETAH_INTERNAL void parse_environment(); -CHEETAH_INTERNAL long env_get_int(char const *var); -CHEETAH_INTERNAL unsigned cilkg_nproc; +void invoke_cilkified_root(global_state *g, __cilkrts_stack_frame *sf); +void wait_until_cilk_done(global_state *g); +__attribute__((noreturn)) +void exit_cilkified_root(global_state *g, __cilkrts_stack_frame *sf); + +// Used by Cilksan to set nworkers to 1 and force reduction +void __cilkrts_internal_set_nworkers(unsigned int nworkers); +void __cilkrts_internal_set_force_reduce(unsigned int force_reduce); #endif /* _CILK_INIT_H */ diff --git a/runtime/internal-malloc-impl.h b/runtime/internal-malloc-impl.h new file mode 100644 index 00000000..0c249666 --- /dev/null +++ b/runtime/internal-malloc-impl.h @@ -0,0 +1,45 @@ +#ifndef _INTERAL_MALLOC_IMPL_H +#define _INTERAL_MALLOC_IMPL_H + +#include "debug.h" +#include "rts-config.h" + +#include "internal-malloc.h" + +#define NUM_BUCKETS 7 +#define NUM_IM_CALLERS 4 + +/* struct for managing global memory pool; each memory block in mem_list starts + out with size INTERNAL_MALLOC_CHUNK. We will allocate small pieces off the + memory block and free the pieces into per-worker im_descriptor free list. */ +struct global_im_pool { + char *mem_begin; // beginning of the free memory block that we are using + char *mem_end; // end of the free memory block that we are using + char **mem_list; // list of memory blocks obtained from system + unsigned mem_list_index; // index to the current mem block in use + unsigned mem_list_size; // length of the mem_list + size_t num_global_malloc; + size_t allocated; // bytes allocated into the pool + size_t wasted; // bytes at the end of a chunk that could not be used +}; + +struct im_bucket { + void *free_list; // beginning of free list + unsigned free_list_size; // Current size of free list + unsigned free_list_limit; // Maximum allowed size of free list + // Allocation count and wasted space on a worker may be negative + // if it frees blocks allocated elsewhere. In a global bucket + // these fields should never be negative. 
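+    // (For example, a closure allocated by worker 0 but freed by worker 1
+    // decrements worker 1's 'allocated' count, possibly below zero, while
+    // worker 0's count stays positive; only the global sums must balance.)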
+ int allocated; // Current allocations, in use or free + int max_allocated; // high watermark of allocated + long wasted; // in bytes +}; + +/* One of these per worker, and one global */ +struct cilk_im_desc { + struct im_bucket buckets[NUM_BUCKETS]; + long used; // local alloc - local free, may be negative + long num_malloc[IM_NUM_TAGS]; +}; + +#endif /* _INTERAL_MALLOC_IMPL_H */ diff --git a/runtime/internal-malloc.c b/runtime/internal-malloc.c index 0fadfbd7..33dcfeab 100644 --- a/runtime/internal-malloc.c +++ b/runtime/internal-malloc.c @@ -8,13 +8,15 @@ #include "cilk-internal.h" #include "debug.h" #include "global.h" +#include "local.h" CHEETAH_INTERNAL int cheetah_page_shift = 0; -#define MEM_LIST_SIZE 8 +#define MEM_LIST_SIZE 8U #define INTERNAL_MALLOC_CHUNK_SIZE (32 * 1024) #define SIZE_THRESH bucket_sizes[NUM_BUCKETS - 1] +/* TODO: Use sizeof(fiber), sizeof(closure), etc. */ static const unsigned int bucket_sizes[NUM_BUCKETS] = {32, 64, 128, 256, 512, 1024, 2048}; static const unsigned int bucket_capacity[NUM_BUCKETS] = { @@ -41,6 +43,7 @@ static inline int is_page_aligned(size_t size) { } static inline unsigned int size_to_bucket(size_t size) { + /* TODO: If sizes are powers of 2 use ffs() */ for (unsigned int i = 0; i < NUM_BUCKETS; i++) { if (size <= bucket_sizes[i]) { return i; @@ -50,89 +53,197 @@ static inline unsigned int size_to_bucket(size_t size) { } static inline unsigned int bucket_to_size(int which_bucket) { + /* TODO: 1U << (which_bucket + 5) */ return bucket_sizes[which_bucket]; } -#if CILK_DEBUG || INTERNAL_MALLOC_STATS // used in these cases only -/* compute the length of a free list starting at pointer p */ -static unsigned int free_list_length(void *p) { - unsigned int count = 0; - while (p) { - count++; - // next pointer is stored at the first 8 bytes - p = ((struct free_block *)p)->next; - } - return count; +static void add_to_free_list(struct im_bucket *bucket, void *p) { + ((struct free_block *)p)->next = bucket->free_list; + bucket->free_list = p; + ++bucket->free_list_size; } -#endif -#if INTERNAL_MALLOC_STATS -static inline void init_im_bucket_stats(struct im_bucket_stats *s) { - s->num_free = 0; - s->allocated = 0; - s->max_allocated = 0; +static void *remove_from_free_list(struct im_bucket *bucket) { + void *mem = bucket->free_list; + if (mem) { + bucket->free_list = ((struct free_block *)mem)->next; + --bucket->free_list_size; + } + return mem; } -#else -#define init_im_bucket_stats(s) -#endif /* initialize the buckets in struct cilk_im_desc */ static void init_im_buckets(struct cilk_im_desc *im_desc) { for (int i = 0; i < NUM_BUCKETS; i++) { struct im_bucket *bucket = &(im_desc->buckets[i]); bucket->free_list = NULL; - bucket->count_until_free = bucket_capacity[i]; - init_im_bucket_stats(&bucket->stats); + bucket->free_list_size = 0; + bucket->free_list_limit = bucket_capacity[i]; + bucket->allocated = 0; + bucket->max_allocated = 0; + bucket->wasted = 0; } + im_desc->used = 0; + for (int j = 0; j < IM_NUM_TAGS; ++j) + im_desc->num_malloc[j] = 0; } //========================================================= // Private helper functions for debugging //========================================================= -#if CILK_DEBUG +static void dump_buckets(FILE *out, struct cilk_im_desc *d) { + fprintf(out, " %zd bytes used\n", d->used); + for (unsigned i = 0; i < NUM_BUCKETS; ++i) { + struct im_bucket *b = &d->buckets[i]; + if (!b->free_list && !b->free_list_size && !b->allocated) + continue; + fprintf(out, " [%u] %d allocated (%d max, %zd wasted), %u 
free\n", + bucket_to_size(i), b->allocated, b->max_allocated, b->wasted, + b->free_list_size); + } +} + +static size_t free_bytes(struct cilk_im_desc *desc) { + size_t free = 0; + for (unsigned i = 0; i < NUM_BUCKETS; ++i) + free += (size_t)desc->buckets[i].free_list_size * bucket_sizes[i]; + return free; +} + +static long wasted_bytes(struct cilk_im_desc *desc) { + long wasted = 0; + for (unsigned i = 0; i < NUM_BUCKETS; ++i) + wasted += desc->buckets[i].wasted; + return wasted; +} + +static size_t workers_used_and_free(global_state *g) { + size_t worker_free = 0; + long worker_used = 0, worker_wasted = 0; + for (unsigned int i = 0; i < g->nworkers; i++) { + __cilkrts_worker *w = g->workers[i]; + if (!w) + continue; /* starting up or shutting down */ + local_state *l = w->l; + worker_free += free_bytes(&l->im_desc); + worker_used += l->im_desc.used; + worker_wasted += wasted_bytes(&l->im_desc); + } + CILK_ASSERT_G(worker_used >= 0 && worker_wasted >= 0); + return worker_used + worker_free + worker_wasted; +} + +CHEETAH_INTERNAL +void dump_memory_state(FILE *out, global_state *g) { + if (out == NULL) + out = stderr; + size_t global_free = free_bytes(&g->im_desc); + ptrdiff_t available = + (char *)g->im_pool.mem_end - (char *)g->im_pool.mem_begin; + fprintf(out, + "Global memory:\n %zu allocated in %u blocks (%zu wasted)\n" + " %zd used + %tu available + %zu free = %zu\n", + g->im_pool.allocated, g->im_pool.mem_list_index + 1, + g->im_pool.wasted, g->im_desc.used, available, global_free, + g->im_desc.used + available + global_free); + dump_buckets(out, &g->im_desc); + for (unsigned int i = 0; i < g->nworkers; i++) { + __cilkrts_worker *w = g->workers[i]; + if (!w) + continue; + fprintf(out, "Worker %u:\n", i); + dump_buckets(out, &w->l->im_desc); + } +} + +void dump_memory_state_stderr(global_state *g) { dump_memory_state(stderr, g); } + CHEETAH_INTERNAL void internal_malloc_global_check(global_state *g) { + /* TODO: Test should be + global used = worker used + free + global used + global free = allocated. */ struct cilk_im_desc *d = &(g->im_desc); - size_t total_size = d->used; - size_t total_malloc = d->num_malloc; - for (unsigned int i = 0; i < g->options.nproc; i++) { - d = &(g->workers[i]->l->im_desc); - total_size += d->used; - total_malloc += d->num_malloc; + size_t total_malloc[IM_NUM_TAGS]; + for (int i = 0; i < IM_NUM_TAGS; ++i) + total_malloc[i] = d->num_malloc[i]; + + for (unsigned int i = 0; i < g->nworkers; i++) { + __cilkrts_worker *w = g->workers[i]; + if (!w) + continue; /* starting up or shutting down */ + local_state *l = w->l; + for (int i = 0; i < IM_NUM_TAGS; ++i) + total_malloc[i] += l->im_desc.num_malloc[i]; } - // these fields must add up to 0, as they keep track of sizes and number of - // malloc / frees going out of / into the global pool / per-worker pool. 
-    // Anything batch-freed into per-worker pool had to come from the global
-    // pool; similarly, anything batch-allocated out of the per-worker pool gets
-    // freed into the global one
+    size_t allocated = g->im_pool.allocated;
+    CILK_ASSERT_G(g->im_desc.used >= 0);
+    size_t global_used = g->im_desc.used;
+    size_t global_free = free_bytes(&g->im_desc);
+    size_t worker_total = workers_used_and_free(g);
+    size_t global_available =
+        (char *)g->im_pool.mem_end - (char *)g->im_pool.mem_begin;
+
+    if (global_used != worker_total ||
+        global_used + global_free + global_available != allocated)
+        dump_memory_state(stderr, g);
+
+    CILK_CHECK(g,
+               global_used + global_free + global_available == allocated &&
+                   global_used == worker_total,
+               "Possible memory leak: %zu+%zu+%zu global used+free+available, "
+               "%zu allocated, %zu in workers",
+               global_used, global_free, global_available, allocated,
+               worker_total);
+}

-    CILK_CHECK(g, (total_size == 0) && (total_malloc == 0),
-               "Possible memory leak detected");
+static void assert_global_pool(struct global_im_pool *pool) {
+    CILK_ASSERT_G(pool->mem_list_index < pool->mem_list_size);
+    if (pool->wasted > 0)
+        CILK_ASSERT_G(pool->wasted < pool->allocated);
 }
-#else
-#define internal_malloc_global_check(g)
-#endif // CILK_DEBUG

+static void assert_bucket(struct im_bucket *bucket) {
+    CILK_ASSERT_G(!!bucket->free_list == !!bucket->free_list_size);
+    CILK_ASSERT_G_LE(bucket->free_list_size, bucket->free_list_limit, "%u");
+    CILK_ASSERT_G_LE(bucket->allocated, bucket->max_allocated, "%d");
+}

 //=========================================================
 // Private helper functions for IM stats
 //=========================================================

-#if INTERNAL_MALLOC_STATS
-static void init_global_im_pool_stats(struct global_im_pool_stats *stats) {
-    stats->allocated = 0;
-    stats->wasted = 0;
-}
-
-static void print_im_buckets_stats(struct global_state *g) {
-
 #define HDR_DESC "%15s"
 #define WORKER_HDR_DESC "%10s %3u:" // two char short compared to HDR_DESC
 #define FIELD_DESC "%10zu"
+
+static void print_worker_buckets_free(__cilkrts_worker *w, void *data) {
+    FILE *fp = (FILE *)data;
+    local_state *l = w->l;
+    fprintf(fp, WORKER_HDR_DESC, "Worker", w->self);
+    for (unsigned int j = 0; j < NUM_BUCKETS; j++) {
+        fprintf(fp, FIELD_DESC,
+                (size_t)l->im_desc.buckets[j].free_list_size * bucket_sizes[j]);
+    }
+    fprintf(fp, "\n");
+}
+
+static void print_worker_buckets_hwm(__cilkrts_worker *w, void *data) {
+    FILE *fp = (FILE *)data;
+    local_state *l = w->l;
+    fprintf(fp, WORKER_HDR_DESC, "Worker", w->self);
+    for (unsigned int j = 0; j < NUM_BUCKETS; j++) {
+        fprintf(fp, FIELD_DESC,
+                (size_t)l->im_desc.buckets[j].max_allocated * bucket_sizes[j]);
+    }
+    fprintf(fp, "\n");
+}
+
+static void print_im_buckets_stats(struct global_state *g) {
     fprintf(stderr, "\nBYTES IN FREE LISTS:\n");
     fprintf(stderr, HDR_DESC, "Bucket size:");
     for (int j = 0; j < NUM_BUCKETS; j++) {
@@ -143,19 +254,11 @@ static void print_im_buckets_stats(struct global_state *g) {

     fprintf(stderr, HDR_DESC, "Global:");
     for (unsigned int j = 0; j < NUM_BUCKETS; j++) {
-        struct im_bucket_stats *s = &(g->im_desc.buckets[j].stats);
-        fprintf(stderr, FIELD_DESC, (size_t)s->num_free * bucket_sizes[j]);
+        fprintf(stderr, FIELD_DESC,
+                (size_t)g->im_desc.buckets[j].free_list_size * bucket_sizes[j]);
     }
     fprintf(stderr, "\n");
-    for (unsigned int i = 0; i < g->options.nproc; i++) {
-        __cilkrts_worker *w = g->workers[i];
-        fprintf(stderr, WORKER_HDR_DESC, "Worker", w->self);
-        for (unsigned int j = 0; j <
NUM_BUCKETS; j++) { - struct im_bucket_stats *s = &(w->l->im_desc.buckets[j].stats); - fprintf(stderr, FIELD_DESC, (size_t)s->num_free * bucket_sizes[j]); - } - fprintf(stderr, "\n"); - } + for_each_worker(g, &print_worker_buckets_free, stderr); fprintf(stderr, "\nHIGH WATERMARK FOR BYTES ALLOCATED:\n"); fprintf(stderr, HDR_DESC, "Bucket size:"); @@ -164,36 +267,23 @@ static void print_im_buckets_stats(struct global_state *g) { } fprintf(stderr, "\n-------------------------------------------" "---------------------------------------------\n"); + for_each_worker(g, &print_worker_buckets_hwm, stderr); - for (unsigned int i = 0; i < g->options.nproc; i++) { - __cilkrts_worker *w = g->workers[i]; - fprintf(stderr, WORKER_HDR_DESC, "Worker", w->self); - for (unsigned int j = 0; j < NUM_BUCKETS; j++) { - struct im_bucket_stats *s = &(w->l->im_desc.buckets[j].stats); - fprintf(stderr, FIELD_DESC, - (size_t)s->max_allocated * bucket_sizes[j]); - } - fprintf(stderr, "\n"); - } fprintf(stderr, "\n"); } -static void print_global_im_pool_stats(struct global_im_pool_stats *stats) { +static void print_internal_malloc_stats(struct global_state *g) { + unsigned page_size = 1U << cheetah_page_shift; + fprintf(stderr, "\nINTERNAL MALLOC STATS\n"); fprintf(stderr, "Total bytes allocated from system: %7zu KBytes (%zu pages)\n", - stats->allocated / 1024, - (stats->allocated + PAGE_SIZE - 1) / PAGE_SIZE); + g->im_pool.allocated / 1024, + (g->im_pool.allocated + page_size - 1) / page_size); fprintf(stderr, "Total bytes allocated but wasted: %7zu KBytes\n", - stats->wasted / 1024); -} - -static void print_internal_malloc_stats(struct global_state *g) { - fprintf(stderr, "\nINTERNAL MALLOC STATS\n"); - print_global_im_pool_stats(&(g->im_pool.stats)); + g->im_pool.wasted / 1024); print_im_buckets_stats(g); fprintf(stderr, "\n"); } -#endif // INTERNAL_MALLOC_STATS //========================================================= // Global memory allocator @@ -207,7 +297,7 @@ static char *malloc_from_system(__cilkrts_worker *w, size_t size) { } else { mem = malloc(size); } - CILK_CHECK(w->g, mem, "Internal malloc running out of memory!"); + CILK_CHECK(w->g, mem, "Internal malloc failed to allocate %zu bytes", size); return mem; } @@ -229,9 +319,7 @@ static void extend_global_pool(__cilkrts_worker *w) { struct global_im_pool *im_pool = &(w->g->im_pool); im_pool->mem_begin = malloc_from_system(w, INTERNAL_MALLOC_CHUNK_SIZE); im_pool->mem_end = im_pool->mem_begin + INTERNAL_MALLOC_CHUNK_SIZE; -#if INTERNAL_MALLOC_STATS - im_pool->stats.allocated += INTERNAL_MALLOC_CHUNK_SIZE; -#endif + im_pool->allocated += INTERNAL_MALLOC_CHUNK_SIZE; im_pool->mem_list_index++; if (im_pool->mem_list_index >= im_pool->mem_list_size) { @@ -240,7 +328,8 @@ static void extend_global_pool(__cilkrts_worker *w) { new_list_size * sizeof(*im_pool->mem_list)); im_pool->mem_list_size = new_list_size; CILK_CHECK(w->g, im_pool->mem_list, - "Interal malloc running out of memory!"); + "Failed to extend global memory list by %zu bytes", + MEM_LIST_SIZE * sizeof(*im_pool->mem_list)); } im_pool->mem_list[im_pool->mem_list_index] = im_pool->mem_begin; } @@ -252,31 +341,24 @@ static void extend_global_pool(__cilkrts_worker *w) { */ static void *global_im_alloc(__cilkrts_worker *w, size_t size, unsigned int which_bucket) { - - CILK_ASSERT(w, w->g); + global_state *g = w->g; + CILK_ASSERT(w, g); CILK_ASSERT(w, size <= SIZE_THRESH); CILK_ASSERT(w, which_bucket < NUM_BUCKETS); - struct im_bucket *bucket = &(w->g->im_desc.buckets[which_bucket]); - void *mem = 
bucket->free_list; + struct im_bucket *bucket = &(g->im_desc.buckets[which_bucket]); + struct cilk_im_desc *im_desc = &(g->im_desc); + im_desc->used += size; + /* ??? count calls to this function? */ - WHEN_CILK_DEBUG({ // stats only kept track during debugging - struct cilk_im_desc *im_desc = &(w->g->im_desc); - im_desc->used += size; - im_desc->num_malloc++; - }); - // look at the global free list for this bucket - if (mem) { - bucket->free_list = ((struct free_block *)mem)->next; - bucket->count_until_free++; - } else { - struct global_im_pool *im_pool = &(w->g->im_pool); + void *mem = remove_from_free_list(bucket); + if (!mem) { + struct global_im_pool *im_pool = &(g->im_pool); // allocate from the global pool if ((im_pool->mem_begin + size) > im_pool->mem_end) { -#if INTERNAL_MALLOC_STATS // consider the left over as waste for now - im_pool->stats.wasted += im_pool->mem_end - im_pool->mem_begin; -#endif + // TODO: Adding it to a random free list would be better. + im_pool->wasted += im_pool->mem_end - im_pool->mem_begin; extend_global_pool(w); } mem = im_pool->mem_begin; @@ -286,34 +368,9 @@ static void *global_im_alloc(__cilkrts_worker *w, size_t size, return mem; } -/** - * Free a piece of memory of 'size' back to global im bucket 'bucket'. - * The free_list is last-in-first-out. - * The size is already canonicalized at this point. - */ -static void global_im_free(__cilkrts_worker *w, void *p, size_t size, - unsigned int which_bucket) { - - CILK_ASSERT(w, w->g); - CILK_ASSERT(w, size <= SIZE_THRESH); - CILK_ASSERT(w, which_bucket < NUM_BUCKETS); - USE_UNUSED(size); - - WHEN_CILK_DEBUG({ // stats only kept track during debugging - struct cilk_im_desc *im_desc = &(w->g->im_desc); - im_desc->used -= size; - im_desc->num_malloc--; - }); - struct im_bucket *bucket = &(w->g->im_desc.buckets[which_bucket]); - void *next = bucket->free_list; - ((struct free_block *)p)->next = next; - bucket->free_list = p; - bucket->count_until_free--; -} - static void global_im_pool_destroy(struct global_im_pool *im_pool) { - for (int i = 0; i < im_pool->mem_list_size; i++) { + for (unsigned i = 0; i < im_pool->mem_list_size; i++) { void *mem = im_pool->mem_list[i]; free_to_system(mem, INTERNAL_MALLOC_CHUNK_SIZE); im_pool->mem_list[i] = NULL; @@ -336,27 +393,32 @@ void cilk_internal_malloc_global_init(global_state *g) { g->im_pool.mem_begin = g->im_pool.mem_end = NULL; g->im_pool.mem_list_index = -1; g->im_pool.mem_list_size = MEM_LIST_SIZE; - g->im_pool.mem_list = malloc(MEM_LIST_SIZE * sizeof(*g->im_pool.mem_list)); - CILK_CHECK(g, g->im_pool.mem_list, "Cannot allocate mem_list"); + g->im_pool.mem_list = calloc(MEM_LIST_SIZE, sizeof(*g->im_pool.mem_list)); + CILK_CHECK(g, g->im_pool.mem_list, + "Cannot allocate %u * %zu bytes for mem_list", MEM_LIST_SIZE, + sizeof(*g->im_pool.mem_list)); + g->im_pool.allocated = 0; + g->im_pool.wasted = 0; init_im_buckets(&g->im_desc); - WHEN_IM_STATS(init_global_im_pool_stats(&(g->im_pool.stats))); - WHEN_CILK_DEBUG(g->im_desc.used = 0); - WHEN_CILK_DEBUG(g->im_desc.num_malloc = 0); + + g->im_desc.used = 0; + for (int i = 0; i < IM_NUM_TAGS; ++i) + g->im_desc.num_malloc[i] = 0; } void cilk_internal_malloc_global_terminate(global_state *g) { -#if INTERNAL_MALLOC_STATS - for (unsigned int i = 0; i < NUM_BUCKETS; i++) { - struct im_bucket *b = &(g->im_desc.buckets[i]); - b->stats.num_free = free_list_length(b->free_list); - } - print_internal_malloc_stats(g); -#endif + if (DEBUG_ENABLED(MEMORY)) + internal_malloc_global_check(g); + if (ALERT_ENABLED(MEMORY)) + 
print_internal_malloc_stats(g); } void cilk_internal_malloc_global_destroy(global_state *g) { global_im_pool_destroy(&(g->im_pool)); // free global mem blocks cilk_mutex_destroy(&(g->im_lock)); + for (int i = 0; i < IM_NUM_TAGS; ++i) { + CILK_ASSERT_G(g->im_desc.num_malloc[i] == 0); + } } //========================================================= @@ -368,22 +430,21 @@ void cilk_internal_malloc_global_destroy(global_state *g) { * into per-worker im bucket 'bucket'. */ static void im_allocate_batch(__cilkrts_worker *w, size_t size, - unsigned int bucket) { - - unsigned int batch_size = bucket_capacity[bucket] / 2; - cilk_mutex_lock(&(w->g->im_lock)); + unsigned int bucket_index) { + global_state *g = w->g; + local_state *l = w->l; + struct im_bucket *bucket = &l->im_desc.buckets[bucket_index]; + unsigned int batch_size = bucket_capacity[bucket_index] / 2; + cilk_mutex_lock(&(g->im_lock)); for (unsigned int i = 0; i < batch_size; i++) { - void *p = global_im_alloc(w, size, bucket); - cilk_internal_free(w, p, size); + void *p = global_im_alloc(w, size, bucket_index); + add_to_free_list(bucket, p); } - cilk_mutex_unlock(&(w->g->im_lock)); -#if INTERNAL_MALLOC_STATS - struct im_bucket_stats *s = &(w->l->im_desc.buckets[bucket].stats); - s->allocated += batch_size; - if (s->allocated > s->max_allocated) { - s->max_allocated = s->allocated; + cilk_mutex_unlock(&(g->im_lock)); + bucket->allocated += batch_size; + if (bucket->allocated > bucket->max_allocated) { + bucket->max_allocated = bucket->allocated; } -#endif } /** @@ -391,19 +452,22 @@ static void im_allocate_batch(__cilkrts_worker *w, size_t size, * back to global im bucket 'bucket'. */ static void im_free_batch(__cilkrts_worker *w, size_t size, - unsigned int bucket) { - - unsigned int batch_size = bucket_capacity[bucket] / 2; - cilk_mutex_lock(&(w->g->im_lock)); - for (unsigned int i = 0; i < batch_size; i++) { - void *p = cilk_internal_malloc(w, size); - global_im_free(w, p, size, bucket); + unsigned int which_bucket) { + global_state *g = w->g; + local_state *l = w->l; + unsigned int batch_size = bucket_capacity[which_bucket] / 2; + struct im_bucket *bucket = &(l->im_desc.buckets[which_bucket]); + cilk_mutex_lock(&(g->im_lock)); + for (unsigned int i = 0; i < batch_size; ++i) { + void *mem = remove_from_free_list(bucket); + if (!mem) + break; + add_to_free_list(&g->im_desc.buckets[which_bucket], mem); + g->im_desc.used -= size; + --bucket->allocated; } cilk_mutex_unlock(&(w->g->im_lock)); -#if INTERNAL_MALLOC_STATS - struct im_bucket_stats *s = &(w->l->im_desc.buckets[bucket].stats); - s->allocated -= batch_size; -#endif + /* Account for bytes allocated change? 
*/ } /* @@ -411,58 +475,80 @@ static void im_free_batch(__cilkrts_worker *w, size_t size, * last-in-first-out */ CHEETAH_INTERNAL -void *cilk_internal_malloc(__cilkrts_worker *w, size_t size) { - - WHEN_CILK_DEBUG(w->l->im_desc.used += size); - WHEN_CILK_DEBUG(w->l->im_desc.num_malloc += 1); - - if (size >= SIZE_THRESH) { +void *cilk_internal_malloc(__cilkrts_worker *w, size_t size, enum im_tag tag) { + local_state *l = w->l; + unsigned int which_bucket = size_to_bucket(size); + if (which_bucket >= NUM_BUCKETS) { return malloc_from_system(w, size); } + if (ALERT_ENABLED(MEMORY)) + fprintf(stderr, "[W%d] alloc %zu tag %d\n", w->self, size, (int)tag); + + l->im_desc.used += size; + l->im_desc.num_malloc[tag] += 1; - unsigned int which_bucket = size_to_bucket(size); - CILK_ASSERT(w, which_bucket >= 0 && which_bucket < NUM_BUCKETS); unsigned int csize = bucket_to_size(which_bucket); // canonicalize the size - struct im_bucket *bucket = &(w->l->im_desc.buckets[which_bucket]); - void *mem = bucket->free_list; + struct im_bucket *bucket = &(l->im_desc.buckets[which_bucket]); + bucket->wasted += csize - size; + void *mem = remove_from_free_list(bucket); if (!mem) { // when out of memory, allocate a batch from global pool im_allocate_batch(w, csize, which_bucket); - mem = bucket->free_list; + mem = remove_from_free_list(bucket); + CILK_ASSERT(w, mem); } - - /* if there is a block in the free list */ - CILK_ASSERT(w, mem); - bucket->free_list = ((struct free_block *)mem)->next; - bucket->count_until_free++; - + if (ALERT_ENABLED(MEMORY)) + dump_memory_state(NULL, w->g); +#if 0 /* race condition if workers are running */ + if (DEBUG_ENABLED(MEMORY_SLOW)) + internal_malloc_global_check(w->g); +#endif return mem; } /* * Free simply returns to the free list; last-in-first-out */ -void cilk_internal_free(__cilkrts_worker *w, void *p, size_t size) { - - WHEN_CILK_DEBUG(w->l->im_desc.used -= size); - WHEN_CILK_DEBUG(w->l->im_desc.num_malloc -= 1); - +void cilk_internal_free(__cilkrts_worker *w, void *p, size_t size, + enum im_tag tag) { if (size > SIZE_THRESH) { free_to_system(p, size); return; } + if (ALERT_ENABLED(MEMORY)) + fprintf(stderr, "[W%d] free %zu tag %d\n", w->self, size, (int)tag); + + local_state *l = w->l; + l->im_desc.used -= size; + l->im_desc.num_malloc[tag] -= 1; unsigned int which_bucket = size_to_bucket(size); CILK_ASSERT(w, which_bucket >= 0 && which_bucket < NUM_BUCKETS); unsigned int csize = bucket_to_size(which_bucket); // canonicalize the size - struct im_bucket *bucket = &(w->l->im_desc.buckets[which_bucket]); + struct im_bucket *bucket = &(l->im_desc.buckets[which_bucket]); + bucket->wasted -= csize - size; - while (bucket->count_until_free <= 0) { + add_to_free_list(bucket, p); + + while (bucket->free_list_size > bucket->free_list_limit) { im_free_batch(w, csize, which_bucket); } - ((struct free_block *)p)->next = bucket->free_list; - bucket->free_list = p; - bucket->count_until_free--; + if (ALERT_ENABLED(MEMORY)) + dump_memory_state(NULL, w->g); +#if 0 /* not safe with multiple workers */ + if (DEBUG_ENABLED(MEMORY_SLOW)) + internal_malloc_global_check(w->g); +#endif +} + +/* This function is called after workers have terminated. + It has no locking. 
*/ +void cilk_internal_free_global(global_state *g, void *p, size_t size, + enum im_tag tag) { + unsigned int which_bucket = size_to_bucket(size); + add_to_free_list(&g->im_desc.buckets[which_bucket], p); + g->im_desc.num_malloc[tag]--; + g->im_desc.used -= bucket_to_size(which_bucket); } void cilk_internal_malloc_per_worker_init(__cilkrts_worker *w) { @@ -470,20 +556,49 @@ void cilk_internal_malloc_per_worker_init(__cilkrts_worker *w) { } void cilk_internal_malloc_per_worker_terminate(__cilkrts_worker *w) { -#if INTERNAL_MALLOC_STATS + global_state *g = w->g; /* Global state is locked by caller. */ + local_state *l = w->l; + assert_global_pool(&g->im_pool); + if (DEBUG_ENABLED(MEMORY_SLOW)) + internal_malloc_global_check(g); for (unsigned int i = 0; i < NUM_BUCKETS; i++) { - struct im_bucket *b = &(w->l->im_desc.buckets[i]); - b->stats.num_free = free_list_length(b->free_list); + assert_bucket(&l->im_desc.buckets[i]); + while (l->im_desc.buckets[i].free_list) + im_free_batch(w, bucket_to_size(i), i); } -#endif + for (int i = 0; i < IM_NUM_TAGS; ++i) { + g->im_desc.num_malloc[i] += l->im_desc.num_malloc[i]; + l->im_desc.num_malloc[i] = 0; + } + if (ALERT_ENABLED(MEMORY)) + dump_memory_state(NULL, w->g); + /* This check is safe because all worker threads have exited. */ + if (DEBUG_ENABLED(MEMORY_SLOW)) + internal_malloc_global_check(g); } void cilk_internal_malloc_per_worker_destroy(__cilkrts_worker *w) { -#if CILK_DEBUG + /* The main closure and fiber have not yet been destroyed. They are + allocated with system malloc instead of internal malloc. */ + local_state *l = w->l; for (unsigned int i = 0; i < NUM_BUCKETS; i++) { - struct im_bucket *bucket = &(w->l->im_desc.buckets[i]); - unsigned int k = free_list_length(bucket->free_list); - CILK_ASSERT(w, (bucket->count_until_free + k) == bucket_capacity[i]); + CILK_ASSERT_INDEX_ZERO(w, l->im_desc.buckets, i, .free_list_size, "%u"); + CILK_ASSERT_INDEX_ZERO(w, l->im_desc.buckets, i, .free_list, "%p"); + /* allocated may be nonzero due to memory migration */ + } +} + +const char *name_for_im_tag(enum im_tag tag) { + switch (tag) { + case IM_UNCLASSIFIED: + return "unclassified"; + case IM_CLOSURE: + return "closure"; + case IM_FIBER: + return "fiber"; + case IM_REDUCER_MAP: + return "reducer map"; + default: + return "unknown"; } -#endif } diff --git a/runtime/internal-malloc.h b/runtime/internal-malloc.h index b2bf8db8..2bde0900 100644 --- a/runtime/internal-malloc.h +++ b/runtime/internal-malloc.h @@ -4,57 +4,19 @@ #include #include -#include "debug.h" #include "rts-config.h" CHEETAH_INTERNAL extern int cheetah_page_shift; -#define NUM_BUCKETS 7 - -#define INTERNAL_MALLOC_STATS CILK_STATS - -struct global_im_pool_stats { - size_t allocated; // bytes allocated into the pool - size_t wasted; // bytes at the end of a chunk that could not be used -}; - -struct im_bucket_stats { - unsigned int num_free; // number of free blocks left; computed at terminate - unsigned int allocated; // number of batch allocated and not freed - unsigned int max_allocated; // high watermark of batch_allocated +enum im_tag { + IM_UNCLASSIFIED, + IM_CLOSURE, + IM_FIBER, + IM_REDUCER_MAP, + IM_NUM_TAGS }; -#if INTERNAL_MALLOC_STATS -#define WHEN_IM_STATS(ex) ex -#else -#define WHEN_IM_STATS(ex) -#endif -/* struct for managing global memory pool; each memory block in mem_list starts - out with size INTERMAL_MALLOC_CHUNK. We will allocate small pieces off the - memory block and free the pieces into per-worker im_descriptor free list. 
*/ -struct global_im_pool { - char *mem_begin; // beginning of the free memory block that we are using - char *mem_end; // end of the free memory block that we are using - char **mem_list; // list of memory blocks obtained from system - int mem_list_index; // index to the current mem block in use - int mem_list_size; // length of the mem_list - WHEN_IM_STATS(struct global_im_pool_stats stats); // im pool stats -}; - -struct im_bucket { - void *free_list; // beginning of free list - unsigned int list_size; // length of free list - unsigned int - count_until_free; // number of allocations to make on the free list - // before calling batch_free (back to the global) - WHEN_IM_STATS(struct im_bucket_stats stats); -}; - -struct cilk_im_desc { - struct im_bucket buckets[NUM_BUCKETS]; - size_t used; - unsigned long num_malloc; -}; +CHEETAH_INTERNAL const char *name_for_im_tag(enum im_tag); /* Custom implementation of aligned_alloc. */ static inline void *cilk_aligned_alloc(size_t alignment, size_t size) { @@ -69,6 +31,7 @@ static inline void *cilk_aligned_alloc(size_t alignment, size_t size) { // public functions (external to source file, internal to library) CHEETAH_INTERNAL void cilk_internal_malloc_global_init(struct global_state *g); +CHEETAH_INTERNAL void internal_malloc_global_check(global_state *g); CHEETAH_INTERNAL void cilk_internal_malloc_global_terminate(struct global_state *g); CHEETAH_INTERNAL void @@ -80,8 +43,11 @@ CHEETAH_INTERNAL void cilk_internal_malloc_per_worker_terminate(__cilkrts_worker *w); __attribute__((alloc_size(2), assume_aligned(32), malloc)) CHEETAH_INTERNAL void * -cilk_internal_malloc(__cilkrts_worker *w, size_t size); +cilk_internal_malloc(__cilkrts_worker *w, size_t size, enum im_tag tag); CHEETAH_INTERNAL void cilk_internal_free(__cilkrts_worker *w, void *p, - size_t size); + size_t size, enum im_tag tag); +/* Release memory to the global pool after workers have stopped. */ +CHEETAH_INTERNAL void cilk_internal_free_global(struct global_state *, void *p, + size_t size, enum im_tag tag); #endif // _INTERAL_MALLOC_H diff --git a/runtime/invoke-main.c b/runtime/invoke-main.c deleted file mode 100644 index 9fc3f537..00000000 --- a/runtime/invoke-main.c +++ /dev/null @@ -1,190 +0,0 @@ -#include -#include - -#include "cilk-internal.h" -#include "cilk2c.h" -#include "fiber.h" -#include "global.h" -#include "init.h" -#include "scheduler.h" - -extern unsigned long ZERO; - -CHEETAH_INTERNAL Closure *create_invoke_main(global_state *const g) { - - Closure *t; - __cilkrts_stack_frame *sf; - struct cilk_fiber *fiber; - - t = Closure_create_main(); - Closure_make_ready(t); - - cilkrts_alert(ALERT_BOOT, NULL, "(create_invoke_main) invoke_main = %p", t); - - sf = malloc(sizeof(*sf)); - fiber = cilk_main_fiber_allocate(); - - // it's important to set the following fields for the root closure, - // because we use the info to jump to the right stack position and start - // executing user code. For any other frames, these fields get setup - // in user code before a spawn and when it gets stolen the first time. 
- void *new_rsp = (void *)sysdep_reset_stack_for_resume(fiber, sf); - CILK_ASSERT_G(SP(sf) == new_rsp); - FP(sf) = new_rsp; - PC(sf) = (void *)invoke_main; - - sf->flags = 0; - sf->magic = g->frame_magic; - __cilkrts_set_stolen(sf); - // FIXME - sf->flags |= CILK_FRAME_DETACHED; - - t->frame = sf; - sf->worker = (struct __cilkrts_worker *)0xbfbfbfbfbf; - t->fiber = fiber; - // WHEN_CILK_DEBUG(sf->magic = CILK_STACKFRAME_MAGIC); - - cilkrts_alert(ALERT_BOOT, NULL, - "(create_invoke_main) invoke_main->fiber = %p", fiber); - - return t; -} - -CHEETAH_INTERNAL void cleanup_invoke_main(Closure *invoke_main) { - cilk_main_fiber_deallocate(invoke_main->fiber); - free(invoke_main->frame); - Closure_destroy_main(invoke_main); -} - -CHEETAH_INTERNAL void spawn_cilk_main(volatile atomic_int *res, int argc, - char *args[]) { - __cilkrts_stack_frame *sf = alloca(sizeof(__cilkrts_stack_frame)); - __cilkrts_enter_frame_fast(sf); - sf->flags |= CILK_FRAME_LAST; - __cilkrts_detach(sf); - /* Make this an atomic so the store is completed before done is set true. */ - atomic_store_explicit(res, cilk_main(argc, args), memory_order_relaxed); - __cilkrts_pop_frame(sf); - __cilkrts_leave_frame(sf); -} - -/* - * ANGE: strictly speaking, we could just call cilk_main instead of spawn, - * but spawning has a couple advantages: - * - it allow us to do tlmm_set_closure_stack_mapping in a natural way - for the invoke_main closure (otherwise would need to setup it specially). - * - the sync point after spawn of cilk_main provides a natural point to - * resume if user ever calls Cilk_exit and abort the entire computation. - */ -CHEETAH_INTERNAL_NORETURN void invoke_main() { - - __cilkrts_worker *w = __cilkrts_get_tls_worker(); - __cilkrts_stack_frame *sf = w->current_stack_frame; - - for (unsigned i = 0; i < cilkrts_callbacks.last_init; ++i) - cilkrts_callbacks.init[i](); - - /* Any attempt to register more initializers should fail. */ - cilkrts_callbacks.after_init = true; - - char *rsp; - char *nsp; - int argc = w->g->cilk_main_argc; - char **args = w->g->cilk_main_args; - - ASM_GET_SP(rsp); - cilkrts_alert(ALERT_BOOT, w, "invoke_main rsp = %p", rsp); - - /* TODO(jfc): This could be optimized out by link time optimization. */ - alloca(cilkrts_zero); - - __cilkrts_save_fp_ctrl_state(sf); - if (__builtin_setjmp(sf->ctx) == 0) { - /* JFC: This code originally stored to a temporary variable - that was later stored to cilk_main_return. llvm's optimizer - was overly clever and lost the value. */ - spawn_cilk_main(&w->g->cilk_main_return, argc, args); - } else { - // ANGE: Important to reset using sf->worker; - // otherwise w gets cached in a register - w = sf->worker; - cilkrts_alert(ALERT_BOOT, w, - "invoke_main corrected worker after spawn"); - } - - ASM_GET_SP(nsp); - cilkrts_alert(ALERT_BOOT, w, "invoke_main new rsp = %p", nsp); - - CILK_ASSERT_G(w == __cilkrts_get_tls_worker()); - - if (__cilkrts_unsynced(sf)) { - __cilkrts_save_fp_ctrl_state(sf); - if (__builtin_setjmp(sf->ctx) == 0) { - __cilkrts_sync(sf); - } else { - // ANGE: Important to reset using sf->worker; - // otherwise w gets cached in a register - w = sf->worker; - cilkrts_alert(ALERT_BOOT, w, - "invoke_main corrected worker after sync"); - } - } - - CILK_ASSERT_G(w == __cilkrts_get_tls_worker()); - // WHEN_CILK_DEBUG(sf->magic = ~CILK_STACKFRAME_MAGIC); - - for (unsigned i = cilkrts_callbacks.last_exit; i > 0;) - cilkrts_callbacks.exit[--i](); - - /* Allow registering new init callbacks again. 
 */
-    cilkrts_callbacks.after_init = false;
-
-    atomic_store_explicit(&w->g->done, 1, memory_order_release);
-
-    // done; go back to runtime
-    longjmp_to_runtime(w);
-}
-
-static void main_thread_init(global_state *g) {
-    cilkrts_alert(ALERT_BOOT, NULL,
-                  "(main_thread_init) Setting up main thread's closure");
-
-    g->invoke_main = create_invoke_main(g);
-    // Make sure all previous stores precede this one.
-    atomic_store_explicit(&g->start, 1, memory_order_release);
-}
-
-static void threads_join(global_state *g) {
-    for (unsigned int i = 0; i < g->options.nproc; i++) {
-        int status = pthread_join(g->threads[i], NULL);
-        if (status != 0)
-            cilkrts_bug(NULL, "Cilk runtime error: thread join (%u) failed: %d",
-                        i, status);
-    }
-    cilkrts_alert(ALERT_BOOT, NULL, "(threads_join) All workers joined!");
-}
-
-static void __cilkrts_run(global_state *g) {
-    main_thread_init(g);
-    threads_join(g);
-}
-
-static void __cilkrts_exit(global_state *g) { __cilkrts_cleanup(g); }
-
-#undef main
-int main(int argc, char *argv[]) {
-    int ret;
-
-    global_state *g = __cilkrts_init(argc, argv);
-    cilkrts_alert(ALERT_START, NULL,
-                  "Cheetah: invoking user main with %d workers",
-                  g->options.nproc);
-
-    __cilkrts_run(g);
-    /* The store to cilk_main_return precedes the release store to done.
-       An acquire load from done precedes the load below. */
-    ret = atomic_load_explicit(&g->cilk_main_return, memory_order_relaxed);
-    __cilkrts_exit(g);
-
-    return ret;
-}
diff --git a/runtime/jmpbuf.h b/runtime/jmpbuf.h
index 73b56827..91123e98 100644
--- a/runtime/jmpbuf.h
+++ b/runtime/jmpbuf.h
@@ -31,9 +31,9 @@ typedef void *jmpbuf[JMPBUF_SIZE];
 
 /* These macros are only for debugging. */
 #if defined __i386__
-#define ASM_GET_SP(osp) asm volatile("movl %%esp, %0" : "=r"(osp))
+#define ASM_GET_SP(osp) __asm__ volatile("movl %%esp, %0" : "=r"(osp))
 #elif defined __x86_64__
-#define ASM_GET_SP(osp) asm volatile("movq %%rsp, %0" : "=r"(osp))
+#define ASM_GET_SP(osp) __asm__ volatile("movq %%rsp, %0" : "=r"(osp))
 #else
 #define ASM_GET_SP(osp) (osp) = 0
 #endif
diff --git a/runtime/local.h b/runtime/local.h
new file mode 100644
index 00000000..1e6cb40f
--- /dev/null
+++ b/runtime/local.h
@@ -0,0 +1,21 @@
+#ifndef _CILK_LOCAL_H
+#define _CILK_LOCAL_H
+
+#include <stdbool.h>
+
+struct local_state {
+    struct __cilkrts_stack_frame **shadow_stack;
+
+    unsigned short state; /* __cilkrts_worker_state */
+    bool lock_wait;
+    bool provably_good_steal;
+    unsigned int rand_next;
+
+    jmpbuf rts_ctx;
+    struct cilk_fiber_pool fiber_pool;
+    struct cilk_im_desc im_desc;
+    struct cilk_fiber *fiber_to_free;
+    struct sched_stats stats;
+};
+
+#endif /* _CILK_LOCAL_H */
diff --git a/runtime/pedigree_globals.c b/runtime/pedigree_globals.c
new file mode 100644
index 00000000..5c17960d
--- /dev/null
+++ b/runtime/pedigree_globals.c
@@ -0,0 +1,44 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define ENABLE_CILKRTS_PEDIGREE
+#include <cilk/cilk_api.h>
+
+__cilkrts_pedigree cilkrts_root_pedigree_node;
+uint64_t DPRNG_PRIME = (uint64_t)(-59);
+uint64_t* dprng_m_array;
+uint64_t dprng_m_X = 0;
+
+uint64_t __cilkrts_dprng_swap_halves(uint64_t x) {
+    return (x >> (4 * sizeof(uint64_t))) | (x << (4 * sizeof(uint64_t)));
+}
+
+uint64_t __cilkrts_dprng_mix(uint64_t x) {
+    for (int i = 0; i < 4; i++) {
+        x = x * (2*x+1);
+        x = __cilkrts_dprng_swap_halves(x);
+    }
+    return x;
+}
+
+uint64_t __cilkrts_dprng_mix_mod_p(uint64_t x) {
+    x = __cilkrts_dprng_mix(x);
+    return x - (DPRNG_PRIME & -(x >= DPRNG_PRIME));
+}
+
+uint64_t __cilkrts_dprng_sum_mod_p(uint64_t a, uint64_t b) {
+    uint64_t z = a+b;
+    if ((z < a) || (z >= DPRNG_PRIME)) {
+        z -= DPRNG_PRIME;
+    }
+    return z;
+}
+
+void __cilkrts_init_dprng(void) {
+    dprng_m_array = (uint64_t*) malloc(sizeof(uint64_t) * 4096);
+    for (int i = 0; i < 4096; i++) {
+        dprng_m_array[i] = __cilkrts_dprng_mix_mod_p(0x8c679c168e6bf733ul + i);
+    }
+    dprng_m_X = __cilkrts_dprng_mix_mod_p(0x8c679c168e6bf733ul + 4096);
+}
+
diff --git a/runtime/personality.c b/runtime/personality.c
index dc0cbf61..fcca88a6 100644
--- a/runtime/personality.c
+++ b/runtime/personality.c
@@ -51,9 +51,9 @@ _Unwind_Reason_Code __cilk_personality_internal(
         return std_lib_personality(version, actions, exception_class,
                                    ue_header, context);
     } else if (actions & _UA_CLEANUP_PHASE) {
-        cilkrts_alert(ALERT_EXCEPT, sf->worker,
-                      "cilk_personality called %p CFA %p\n", sf,
-                      (char *)get_cfa(context));
+        cilkrts_alert(EXCEPT, sf->worker,
+                      "cilk_personality called %p CFA %p\n", (void *)sf,
+                      (void *)get_cfa(context));
 
         if (sf->flags & CILK_FRAME_UNSYNCHED) {
             // save floating point state
@@ -96,9 +96,9 @@ _Unwind_Reason_Code __cilk_personality_internal(
     t->reraise_cfa = NULL;
     if (t->user_exn.exn != NULL && t->user_exn.exn != (char *)ue_header) {
-        cilkrts_alert(ALERT_EXCEPT, sf->worker,
+        cilkrts_alert(EXCEPT, sf->worker,
                       "cilk_personality calling RaiseException %p\n",
-                      sf->worker->self, sf);
+                      (void *)sf);
 
         // Remember the CFA from which we raised the new exception.
         t->reraise_cfa = (char *)get_cfa(context);
diff --git a/runtime/readydeque.c b/runtime/readydeque.c
index 68f889f7..c8f49c42 100644
--- a/runtime/readydeque.c
+++ b/runtime/readydeque.c
@@ -2,6 +2,7 @@
 #include "closure.h"
 #include "debug.h"
 #include "global.h"
+#include "local.h"
 
 /*********************************************************
  * Management of ReadyDeques
@@ -96,7 +97,12 @@ Closure *deque_peek_top(__cilkrts_worker *const w, worker_id pn) {
     /* ANGE: return the top but does not unlink it from the rest */
     cl = w->g->deques[pn].top;
     if (cl) {
-        CILK_ASSERT(w, cl->owner_ready_deque == pn);
+        // If w is stealing, then it may peek the top of the deque of the worker
+        // who is in the midst of exiting a Cilkified region.  In that case, cl
+        // will be the root closure, and cl->owner_ready_deque is not
+        // necessarily pn.  The steal will subsequently fail the do_dekker_on check.
+        CILK_ASSERT(w, cl->owner_ready_deque == pn ||
+                           (w->self != pn && cl == w->g->root_closure));
     } else {
         CILK_ASSERT(w, w->g->deques[pn].bottom == (Closure *)NULL);
     }
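The DPRNG arithmetic in pedigree_globals.c above is easier to verify in isolation. The following standalone sketch mirrors those routines; the names PRIME, swap_halves, mix, and sum_mod_p are illustrative and are not the runtime's exported symbols. The key observation is that subtracting DPRNG_PRIME (= 2^64 - 59) from a wrapped 64-bit sum is the same as adding 59 modulo 2^64, which is why sum_mod_p yields the correct residue in both the overflow and the no-overflow cases:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* 2^64 - 59, the same prime the runtime stores as (uint64_t)(-59). */
    static const uint64_t PRIME = (uint64_t)(-59);

    static uint64_t swap_halves(uint64_t x) {
        /* 4 * sizeof(uint64_t) == 32: rotate the value by half its width. */
        return (x >> 32) | (x << 32);
    }

    static uint64_t mix(uint64_t x) {
        for (int i = 0; i < 4; i++) {
            x = x * (2 * x + 1); /* invertible map mod 2^64 */
            x = swap_halves(x);
        }
        return x;
    }

    static uint64_t sum_mod_p(uint64_t a, uint64_t b) {
        uint64_t z = a + b;
        /* z < a detects wraparound; z -= PRIME is z += 59 mod 2^64. */
        if ((z < a) || (z >= PRIME))
            z -= PRIME;
        return z;
    }

    int main(void) {
        assert(swap_halves(0x00000000FFFFFFFFull) == 0xFFFFFFFF00000000ull);
        assert(sum_mod_p(PRIME - 1, 1) == 0); /* (p-1 + 1) mod p == 0 */
        printf("mix(42) = %llx\n", (unsigned long long)mix(42));
        return 0;
    }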
@@ -176,24 +182,3 @@ void deque_add_bottom(__cilkrts_worker *const w, Closure *cl, worker_id pn) {
         w->g->deques[pn].top = cl;
     }
 }
-
-/* ANGE: remove closure for frame f from bottom of pn's deque and _really_
- * free them (i.e. not internal-free).  As far as I can tell,
- * this is called only in invoke_main_slow in invoke-main.c.
- */
-void Cilk_remove_and_free_closure_and_frame(__cilkrts_worker *const w,
-                                            __cilkrts_stack_frame *f,
-                                            worker_id pn) {
-    Closure *t;
-
-    deque_lock(w, pn);
-    t = deque_xtract_bottom(w, pn);
-
-    CILK_ASSERT(w, t->frame == f);
-    USE_UNUSED(t);
-    deque_unlock(w, pn);
-
-    /* ANGE: there is no splitter logging in the invoke_main frame */
-    // Cilk_free(f);
-    // Closure_destroy_malloc(w, t);
-}
diff --git a/runtime/readydeque.h b/runtime/readydeque.h
index f4c2a771..608c4589 100644
--- a/runtime/readydeque.h
+++ b/runtime/readydeque.h
@@ -54,13 +54,4 @@ CHEETAH_INTERNAL void deque_add_bottom(__cilkrts_worker *const w, Closure *cl,
 
 CHEETAH_INTERNAL void deque_assert_is_bottom(__cilkrts_worker *const w,
                                              Closure *t);
-
-/* ANGE: remove closure for frame f from bottom of pn's deque and _really_
- * free them (i.e. not internal-free).  As far as I can tell,
- * this is called only in invoke_main_slow in invoke-main.c.
- */
-CHEETAH_INTERNAL
-void Cilk_remove_and_free_closure_and_frame(__cilkrts_worker *const w,
-                                            __cilkrts_stack_frame *f,
-                                            worker_id pn);
 #endif
diff --git a/runtime/reducer_impl.c b/runtime/reducer_impl.c
index a86232dc..67d69934 100644
--- a/runtime/reducer_impl.c
+++ b/runtime/reducer_impl.c
@@ -1,18 +1,24 @@
 // LONG_BIT is defined in limits.h when _GNU_SOURCE is defined.
+#ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#endif
 #include "reducer_impl.h"
 
 #include "cilk/hyperobject_base.h"
 #include "global.h"
 #include "init.h"
+#include "internal-malloc.h"
 #include "mutex.h"
 #include "scheduler.h"
 
 #include <limits.h>
+#include <dlfcn.h>
 #include
 #include
 #include
 #include
 
+#define USE_INTERNAL_MALLOC 1
+
 #define REDUCER_LIMIT 1024U
 #define GLOBAL_REDUCER_LIMIT 100U
 
@@ -35,9 +41,6 @@ typedef struct reducer_id_manager {
     __cilkrts_hyperobject_base **global;
 } reducer_id_manager;
 
-/* TODO: Consider how this interacts with multiple Cilks.
-   It could be thread local. */
-static struct reducer_id_manager *id_manager;
 
 static void reducer_id_manager_assert_ownership(reducer_id_manager *m,
                                                 __cilkrts_worker *const w) {
@@ -64,10 +67,12 @@ static void reducer_id_manager_unlock(reducer_id_manager *m,
 
 static reducer_id_manager *init_reducer_id_manager(hyper_id_t cap) {
     size_t align = sizeof(reducer_id_manager) > 32 ? 64 : 32;
-    reducer_id_manager *m =
-        cilk_aligned_alloc(align, sizeof(reducer_id_manager));
+    // To appease tools such as Valgrind and ThreadSanitizer, ensure that the
+    // allocated size is a multiple of the alignment.
+    size_t alloc_size = (sizeof(reducer_id_manager) + align - 1) & ~(align - 1);
+    reducer_id_manager *m = cilk_aligned_alloc(align, alloc_size);
     memset(m, 0, sizeof *m);
-    cilkrts_alert(ALERT_BOOT, NULL, "(reducers_init) Initializing reducers");
+    cilkrts_alert(BOOT, NULL, "(reducers_init) Initializing reducers");
     cap = (cap + LONG_BIT - 1) / LONG_BIT * LONG_BIT; /* round up */
     CILK_ASSERT_G(cap > 0 && cap < 9999999);
     pthread_mutex_init(&m->mutex, NULL);
@@ -116,8 +121,7 @@ static hyper_id_t reducer_id_get(reducer_id_manager *m, __cilkrts_worker *w) {
             }
         }
     }
-    cilkrts_alert(ALERT_REDUCE_ID, w, "allocate reducer ID %lu",
-                  (unsigned long)id);
+    cilkrts_alert(REDUCE_ID, w, "allocate reducer ID %lu", (unsigned long)id);
     m->next = id + 1 >= m->spa_cap ? 0 : id + 1;
     if (id >= m->hwm)
         m->hwm = id + 1;
@@ -130,18 +134,11 @@ static hyper_id_t reducer_id_get(reducer_id_manager *m, __cilkrts_worker *w) {
 }
 
 static void reducer_id_free(__cilkrts_worker *const ws, hyper_id_t id) {
-    global_state *g = ws ? ws->g : NULL;
-    reducer_id_manager *m = NULL;
-    if (g) {
-        m = g->id_manager;
-        CILK_ASSERT(ws, !id_manager);
-    } else {
-        m = id_manager;
-        CILK_ASSERT(ws, m);
-    }
+    global_state *g = ws ? ws->g : default_cilkrts;
+    reducer_id_manager *m = g->id_manager;
     reducer_id_manager_lock(m, ws);
-    cilkrts_alert(ALERT_REDUCE_ID, ws, "free reducer ID %lu of %lu",
-                  (unsigned long)id, m->spa_cap);
+    cilkrts_alert(REDUCE_ID, ws, "free reducer ID %lu of %lu",
+                  (unsigned long)id, (unsigned long)m->spa_cap);
     CILK_ASSERT(ws, id < m->spa_cap);
     CILK_ASSERT(ws, m->used[id / LONG_BIT] & (1UL << id % LONG_BIT));
     m->used[id / LONG_BIT] &= ~(1UL << id % LONG_BIT);
@@ -160,22 +157,14 @@ void reducers_init(global_state *g) {
        is single threaded. */
     if (g->id_manager) {
         return;
-    } else if (id_manager) {
-        g->id_manager = id_manager;
-        id_manager = NULL;
     } else {
         g->id_manager = init_reducer_id_manager(REDUCER_LIMIT);
     }
 }
 
 void reducers_deinit(global_state *g) {
-    cilkrts_alert(ALERT_BOOT, NULL, "(reducers_deinit) Cleaning up reducers");
-    CILK_ASSERT_G(!id_manager);
-    if (false) { /* TODO: If the reducer set is empty, discard. */
-        free_reducer_id_manager(g->id_manager);
-    } else {
-        id_manager = g->id_manager;
-    }
+    cilkrts_alert(BOOT, NULL, "(reducers_deinit) Cleaning up reducers");
+    free_reducer_id_manager(g->id_manager);
     g->id_manager = NULL;
 }
 
@@ -213,8 +202,7 @@ static cilkred_map *install_new_reducer_map(__cilkrts_worker *w) {
     h = cilkred_map_make_map(w, m->spa_cap);
     w->reducer_map = h;
 
-    cilkrts_alert(ALERT_REDUCE, w,
-                  "(install_new_reducer_map) installed reducer_map %p", h);
+    cilkrts_alert(REDUCE, w, "installed reducer map %p", (void *)h);
 
     return h;
 }
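Two small pieces of arithmetic above deserve a closer look: the new alloc_size computation rounds an allocation up to a multiple of a power-of-two alignment, and reducer_id_get/reducer_id_free manage IDs as a bitmap with one bit per ID and LONG_BIT bits per word. A toy version of both follows; all names here are illustrative, and the LONG_BIT fallback is only for building the sketch without _GNU_SOURCE:

    #include <limits.h>
    #include <stdio.h>

    #ifndef LONG_BIT
    #define LONG_BIT (CHAR_BIT * (int)sizeof(long))
    #endif

    /* Round sz up to a multiple of align (align must be a power of two),
       mirroring the alloc_size fix in init_reducer_id_manager above. */
    static size_t round_up(size_t sz, size_t align) {
        return (sz + align - 1) & ~(align - 1);
    }

    /* Toy bitmap ID allocator in the style of reducer_id_get/reducer_id_free. */
    #define CAP 128
    static unsigned long used[CAP / LONG_BIT];

    static long id_get(void) {
        for (long id = 0; id < CAP; id++) {
            unsigned long mask = 1UL << (id % LONG_BIT);
            if (!(used[id / LONG_BIT] & mask)) {
                used[id / LONG_BIT] |= mask; /* mark ID as taken */
                return id;
            }
        }
        return -1; /* pool exhausted */
    }

    static void id_free(long id) {
        used[id / LONG_BIT] &= ~(1UL << (id % LONG_BIT));
    }

    int main(void) {
        printf("round_up(40, 64) = %zu\n", round_up(40, 64)); /* 64 */
        long a = id_get(), b = id_get();
        id_free(a);
        printf("a=%ld b=%ld reused=%ld\n", a, b, id_get()); /* reused == a */
        return 0;
    }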
@@ -224,21 +212,25 @@ static cilkred_map *install_new_reducer_map(__cilkrts_worker *w) {
 
 void __cilkrts_hyper_destroy(__cilkrts_hyperobject_base *key) {
     __cilkrts_worker *w = __cilkrts_get_tls_worker();
+    // If we don't have a worker, use instead the last exiting worker from the
+    // default CilkRTS.
+    if (!w)
+        w = default_cilkrts->workers[default_cilkrts->exiting_worker];
     hyper_id_t id = key->__id_num;
+    cilkrts_alert(REDUCE_ID, w, "Destroy reducer %x at %p", (unsigned)id, key);
 
     if (!__builtin_expect(id & HYPER_ID_VALID, HYPER_ID_VALID)) {
-        cilkrts_bug(w, "unregistering unregistered hyperobject");
+        cilkrts_bug(w, "unregistering unregistered hyperobject %p", key);
         return;
     }
     id &= ~HYPER_ID_VALID;
     key->__id_num = id;
 
     if (w) {
-        const char *UNSYNCED_REDUCER_MSG =
-            "Destroying a reducer while it is visible to unsynced child tasks, "
-            "or\n"
-            "calling CILK_C_UNREGISTER_REDUCER() on an unregistered reducer.\n"
-            "Did you forget a _Cilk_sync or CILK_C_REGISTER_REDUCER()?";
+#define UNSYNCED_REDUCER_MSG                                                   \
+    "Destroying a reducer while it is visible to unsynced child tasks, or\n"   \
+    "calling CILK_C_UNREGISTER_REDUCER() on an unregistered reducer.\n"        \
+    "Did you forget a _Cilk_sync or CILK_C_REGISTER_REDUCER()?"
 
         cilkred_map *h = w->reducer_map;
         if (NULL == h)
@@ -259,12 +251,10 @@ void __cilkrts_hyper_create(__cilkrts_hyperobject_base *key) {
     reducer_id_manager *m = NULL;
 
     if (__builtin_expect(!w, 0)) {
-        m = id_manager;
-        if (__builtin_expect(!m, 0)) {
-            cilkrts_alert(ALERT_BOOT, NULL,
-                          "(reducers_init) Initializing reducers");
-            id_manager = m = init_reducer_id_manager(REDUCER_LIMIT);
-        }
+        // Use the ID manager of the last exiting worker from the default
+        // CilkRTS.
+        m = default_cilkrts->id_manager;
+        w = default_cilkrts->workers[default_cilkrts->exiting_worker];
     } else {
        m = w->g->id_manager;
     }
@@ -272,6 +262,8 @@
     hyper_id_t id = reducer_id_get(m, w);
     key->__id_num = id | HYPER_ID_VALID;
 
+    cilkrts_alert(REDUCE_ID, w, "Create reducer %x at %p", (unsigned)id, key);
+
     if (__builtin_expect(!w, 0)) {
         if (id >= GLOBAL_REDUCER_LIMIT) {
             cilkrts_bug(w, "Global reducer pool exhausted");
@@ -312,7 +304,8 @@ void *__cilkrts_hyper_lookup(__cilkrts_hyperobject_base *key) {
     hyper_id_t id = key->__id_num;
 
     if (!__builtin_expect(id & HYPER_ID_VALID, HYPER_ID_VALID)) {
-        cilkrts_bug(w, "User error: reference to unregistered hyperobject");
+        cilkrts_bug(w, "User error: reference to unregistered hyperobject %p",
+                    key);
     }
     id &= ~HYPER_ID_VALID;
 
@@ -323,8 +316,8 @@
     /* TODO: If this is the first reference to a reducer created at global
       scope, install the leftmost view. */
 
-    if(w->g->options.force_reduce) {
-        CILK_ASSERT(w, w->g->options.nproc == 1);
+    if (w->g->options.force_reduce) {
+        CILK_ASSERT(w, w->g->nworkers == 1);
         promote_own_deque(w);
     }
 
@@ -341,10 +334,10 @@
     if (vinfo == NULL) {
         CILK_ASSERT(w, id < h->spa_cap);
         vinfo = &h->vinfo[id];
+        CILK_ASSERT(w, vinfo->key == NULL && vinfo->val == NULL);
 
         void *val = key->__c_monoid.allocate_fn(key, key->__view_size);
         key->__c_monoid.identity_fn(key, val);
-        CILK_ASSERT(w, vinfo->key == NULL && vinfo->val == NULL);
 
         // allocate space for the val and initialize it to identity
         vinfo->key = key;
@@ -354,16 +347,63 @@
     return vinfo->val;
 }
 
-void *__cilkrts_hyper_alloc(void *ignore, size_t bytes) {
-    return cilk_aligned_alloc(16, bytes); /* ??? what is the best alignment? */
+void *__cilkrts_hyper_alloc(__cilkrts_hyperobject_base *key, size_t bytes) {
+    if (USE_INTERNAL_MALLOC) {
+        __cilkrts_worker *w = __cilkrts_get_tls_worker();
+        if (!w)
+            // Use instead the worker from the default CilkRTS that last
+            // exited a Cilkified region.
+            w = default_cilkrts->workers[default_cilkrts->exiting_worker];
+        return cilk_internal_malloc(w, bytes, IM_REDUCER_MAP);
+    } else
+        return cilk_aligned_alloc(16, bytes);
 }
 
-void __cilkrts_hyper_dealloc(void *ignore, void *view) { free(view); }
+void __cilkrts_hyper_dealloc(__cilkrts_hyperobject_base *key, void *view) {
+    if (USE_INTERNAL_MALLOC) {
+        __cilkrts_worker *w = __cilkrts_get_tls_worker();
+        if (!w)
+            // Use instead the worker from the default CilkRTS that last
+            // exited a Cilkified region.
+            w = default_cilkrts->workers[default_cilkrts->exiting_worker];
+        cilk_internal_free(w, view, key->__view_size, IM_REDUCER_MAP);
+    } else
+        free(view);
+}
 
 // =================================================================
 // Helper function for the scheduler
 // =================================================================
 
+#if DL_INTERPOSE
+#define START_DL_INTERPOSABLE(func, type)                                      \
+    if (__builtin_expect(dl_##func == NULL, false)) {                          \
+        dl_##func = (type)dlsym(RTLD_DEFAULT, #func);                          \
+        if (__builtin_expect(dl_##func == NULL, false)) {                      \
+            char *error = dlerror();                                           \
+            if (error != NULL) {                                               \
+                fputs(error, stderr);                                          \
+                fflush(stderr);                                                \
+                abort();                                                       \
+            }                                                                  \
+        }                                                                      \
+    }
+
+typedef cilkred_map *(*merge_two_rmaps_t)(__cilkrts_worker *const,
+                                          cilkred_map *, cilkred_map *);
+static merge_two_rmaps_t dl___cilkrts_internal_merge_two_rmaps = NULL;
+
+cilkred_map *merge_two_rmaps(__cilkrts_worker *const ws, cilkred_map *left,
+                             cilkred_map *right) {
+    START_DL_INTERPOSABLE(__cilkrts_internal_merge_two_rmaps,
+                          merge_two_rmaps_t);
+
+    return dl___cilkrts_internal_merge_two_rmaps(ws, left, right);
+}
+#endif // DL_INTERPOSE
+
 cilkred_map *__cilkrts_internal_merge_two_rmaps(__cilkrts_worker *const ws,
                                                 cilkred_map *left,
                                                 cilkred_map *right) {
diff --git a/runtime/reducer_impl.h b/runtime/reducer_impl.h
index 5a1ea6ab..9bc310fb 100644
--- a/runtime/reducer_impl.h
+++ b/runtime/reducer_impl.h
@@ -4,15 +4,32 @@
 #include "cilk-internal.h"
 #include "cilkred_map.h"
 
+// On MacOSX, the runtime needs to explicitly load
+// __cilkrts_internal_merge_two_rmaps in order to allow Cilksan to dynamically
+// interpose it.
+#if defined(__APPLE__) && defined(__MACH__)
+#define DL_INTERPOSE 1
+#else
+#define DL_INTERPOSE 0
+#endif
+
 CHEETAH_INTERNAL void reducers_init(global_state *);
 CHEETAH_INTERNAL void reducers_import(global_state *, __cilkrts_worker *);
 CHEETAH_INTERNAL void reducers_deinit(global_state *);
 
 // used by the scheduler
+#if DL_INTERPOSE
+CHEETAH_INTERNAL cilkred_map *merge_two_rmaps(__cilkrts_worker *const,
+                                              cilkred_map *left,
+                                              cilkred_map *right);
+#else
+#define merge_two_rmaps __cilkrts_internal_merge_two_rmaps
+#endif // DL_INTERPOSE
+
 // We give this method global visibility, so that tools, notably Cilksan, can
 // dynamically interpose the method.
 /* CHEETAH_INTERNAL */
-cilkred_map *__cilkrts_internal_merge_two_rmaps(__cilkrts_worker *,
+cilkred_map *__cilkrts_internal_merge_two_rmaps(__cilkrts_worker *const,
                                                 cilkred_map *left,
                                                 cilkred_map *right);
diff --git a/runtime/rts-config.h b/runtime/rts-config.h
index 0220715e..30a95f5a 100644
--- a/runtime/rts-config.h
+++ b/runtime/rts-config.h
@@ -2,20 +2,27 @@
 #define _CONFIG_H
 
 /* Functions defined in the library and visible outside the library. */
+#ifndef CHEETAH_API
 #if defined __BSD__ || defined __linux__ /* really, if using ELF */
 #define CHEETAH_API __attribute((visibility("protected")))
 #else
 #define CHEETAH_API
 #endif
+#endif
 
 /* Functions defined in the library and not visible outside the library. */
+#ifndef CHEETAH_INTERNAL
 #define CHEETAH_INTERNAL __attribute((visibility("hidden")))
+#endif
+#ifndef CHEETAH_INTERNAL_NORETURN
 #define CHEETAH_INTERNAL_NORETURN __attribute((noreturn, visibility("hidden")))
-
+#endif
 
 #define __CILKRTS_VERSION 0x0
 #define __CILKRTS_ABI_VERSION 3
 
+#ifndef CILK_DEBUG
 #define CILK_DEBUG 1
+#endif
 #define CILK_STATS 0
 
 #define CILK_CACHE_LINE 64
@@ -43,7 +50,7 @@
 #define DEFAULT_STACK_SIZE 0x100000 // 1 MBytes
 #define DEFAULT_FIBER_POOL_CAP 128  // initial per-worker fiber pool capacity
 #define DEFAULT_REDUCER_LIMIT 1024
-#define DEFAULT_FORCE_REDUCE 0 // do not self steal to force reduce
+#define DEFAULT_FORCE_REDUCE 0      // do not self steal to force reduce
 #define MAX_CALLBACKS 32 // Maximum number of init or exit callbacks
 
 #endif // _CONFIG_H
diff --git a/runtime/sched_stats.c b/runtime/sched_stats.c
index 079f7334..7195a118 100644
--- a/runtime/sched_stats.c
+++ b/runtime/sched_stats.c
@@ -2,6 +2,8 @@
 
 #include "cilk-internal.h"
 #include "debug.h"
+#include "internal-malloc-impl.h"
+#include "local.h"
 #include "sched_stats.h"
 
 #if SCHED_STATS
@@ -89,6 +91,17 @@ void cilk_drop_timing(__cilkrts_worker *w, enum timing_type t) {
     }
 }
 
+static void sched_stats_print_worker(__cilkrts_worker *w, void *data) {
+    FILE *fp = (FILE *)data;
+    fprintf(fp, WORKER_HDR_DESC, "Worker", w->self);
+    for (int t = 0; t < NUMBER_OF_STATS; t++) {
+        double tmp = cycles_to_micro_sec(w->l->stats.time[t]);
+        w->g->stats.time[t] += tmp;
+        fprintf(fp, FIELD_DESC, micro_sec_to_sec(tmp));
+    }
+    fprintf(fp, "\n");
+}
+
 void cilk_sched_stats_print(struct global_state *g) {
 #define HDR_DESC "%15s"
 #define WORKER_HDR_DESC "%10s %3u:"
@@ -101,16 +114,8 @@ void cilk_sched_stats_print(struct global_state *g) {
     }
     fprintf(stderr, "\n");
 
-    for (int i = 0; i < g->options.nproc; i++) {
-        __cilkrts_worker *w = g->workers[i];
-        fprintf(stderr, WORKER_HDR_DESC, "Worker", w->self);
-        for (int t = 0; t < NUMBER_OF_STATS; t++) {
-            double tmp = cycles_to_micro_sec(w->l->stats.time[t]);
-            g->stats.time[t] += (double)tmp;
-            fprintf(stderr, FIELD_DESC, micro_sec_to_sec(tmp));
-        }
-        fprintf(stderr, "\n");
-    }
+    for_each_worker(g, &sched_stats_print_worker, stderr);
+
     fprintf(stderr, HDR_DESC, "Total:");
     for (int t = 0; t < NUMBER_OF_STATS; t++) {
         fprintf(stderr, FIELD_DESC, micro_sec_to_sec(g->stats.time[t]));
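cilk_sched_stats_print now walks the workers through for_each_worker, whose definition lies outside this diff. A plausible shape for such a helper, mocked up with simplified worker and global-state types, is sketched below; everything here is hypothetical except the calling convention visible in the hunk above (a function pointer plus an opaque data argument, used to pass the output FILE*):

    #include <stdio.h>

    /* Toy stand-ins for __cilkrts_worker and global_state; field names follow
       the diff (w->self, g->nworkers, g->workers), the rest is simplified. */
    typedef struct worker { unsigned int self; } worker;
    typedef struct global { unsigned int nworkers; worker **workers; } global;

    /* Presumed shape of for_each_worker: apply fn to every worker in order,
       threading the caller's data pointer through unchanged. */
    static void for_each_worker(global *g, void (*fn)(worker *, void *),
                                void *data) {
        for (unsigned int i = 0; i < g->nworkers; i++)
            fn(g->workers[i], data);
    }

    static void print_worker(worker *w, void *data) {
        fprintf((FILE *)data, "Worker %3u\n", w->self);
    }

    int main(void) {
        worker w0 = {0}, w1 = {1};
        worker *ws[] = {&w0, &w1};
        global g = {2, ws};
        for_each_worker(&g, print_worker, stderr);
        return 0;
    }

The design choice mirrors the diff's direction: once iteration is hidden behind a callback, call sites no longer need to know whether the worker count lives in g->options.nproc or g->nworkers.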
diff --git a/runtime/scheduler.c b/runtime/scheduler.c
index 7b1106ca..1e7b8dd8 100644
--- a/runtime/scheduler.c
+++ b/runtime/scheduler.c
@@ -12,6 +12,7 @@
 #include "fiber.h"
 #include "global.h"
 #include "jmpbuf.h"
+#include "local.h"
 #include "readydeque.h"
 #include "scheduler.h"
 
@@ -72,7 +73,9 @@ static void decrement_exception_pointer(__cilkrts_worker *const w,
                                         __cilkrts_worker *const victim_w,
                                         Closure *cl) {
     Closure_assert_ownership(w, cl);
-    CILK_ASSERT(w, cl->status == CLOSURE_RUNNING);
+    // It's possible that this steal attempt peeked the root closure from the
+    // top of a deque while a new Cilkified region was starting.
+    CILK_ASSERT(w, cl->status == CLOSURE_RUNNING || cl == w->g->root_closure);
     __cilkrts_stack_frame **exc =
         atomic_load_explicit(&victim_w->exc, memory_order_relaxed);
     if (exc != EXCEPTION_INFINITY) {
@@ -82,9 +85,7 @@ static void decrement_exception_pointer(__cilkrts_worker *const w,
 
 static void reset_exception_pointer(__cilkrts_worker *const w, Closure *cl) {
     Closure_assert_ownership(w, cl);
-    CILK_ASSERT(
-        w, (cl->frame == NULL) || (cl->frame->worker == w) ||
-               (cl == w->g->invoke_main && ((intptr_t)cl->frame->worker & 1)));
+    CILK_ASSERT(w, (cl->frame == NULL) || (cl->frame->worker == w));
     atomic_store_explicit(&w->exc,
                           atomic_load_explicit(&w->head, memory_order_relaxed),
                           memory_order_release);
@@ -92,7 +93,7 @@ static void reset_exception_pointer(__cilkrts_worker *const w, Closure *cl) {
 
 /* Unused for now but may be helpful later
 static void signal_immediate_exception_to_all(__cilkrts_worker *const w) {
-    int i, active_size = w->g->options.nproc;
+    int i, active_size = w->g->nworkers;
     __cilkrts_worker *curr_w;
 
     for(i=0; i<active_size; i++) {
 
     t->frame->worker = w;
     Closure_set_status(w, t, CLOSURE_RUNNING);
@@ -133,7 +134,7 @@ static void setup_for_sync(__cilkrts_worker *w, Closure *t) {
     CILK_ASSERT(w, w->l->fiber_to_free == NULL);
     CILK_ASSERT(w, t->fiber != t->fiber_child);
 
-    if(t->simulated_stolen == false) {
+    if (t->simulated_stolen == false) {
         // ANGE: note that in case a) this fiber won't get freed for awhile,
         // since we will longjmp back to the original function's fiber and
         // never go back to the runtime; we will only free it either once
@@ -145,11 +146,12 @@ static void setup_for_sync(__cilkrts_worker *w, Closure *t) {
     }
     CILK_ASSERT(w, t->fiber);
 
-    // __cilkrts_alert(ALERT_STEAL | ALERT_FIBER, w,
-    //         "(setup_for_sync) set t %p and t->fiber %p", t, t->fiber);
+    // __cilkrts_alert(STEAL | ALERT_FIBER, w,
+    //         "(setup_for_sync) set t %p and t->fiber %p", (void *)t,
+    //         (void *)t->fiber);
     __cilkrts_set_synced(t->frame);
 
-    CILK_ASSERT(w, w->current_stack_frame == t->frame);
+    CILK_ASSERT_POINTER_EQUAL(w, w->current_stack_frame, t->frame);
 
     SP(t->frame) = (void *)t->orig_rsp;
     t->orig_rsp = NULL; // unset once we have sync-ed
@@ -186,12 +188,12 @@ static Closure *setup_call_parent_resumption(__cilkrts_worker *const w,
     deque_assert_ownership(w, w->self);
     Closure_assert_ownership(w, t);
 
-    CILK_ASSERT(w, w == __cilkrts_get_tls_worker());
+    CILK_ASSERT_POINTER_EQUAL(w, w, __cilkrts_get_tls_worker());
     CILK_ASSERT(w, __cilkrts_stolen(t->frame) != 0);
     CILK_ASSERT(w, t->frame != NULL);
     CILK_ASSERT(w, ((intptr_t)t->frame->worker) & 1);
-    CILK_ASSERT(w, w->head == w->tail);
-    CILK_ASSERT(w, w->current_stack_frame == t->frame);
+    CILK_ASSERT_POINTER_EQUAL(w, w->head, w->tail);
+    CILK_ASSERT_POINTER_EQUAL(w, w->current_stack_frame, t->frame);
 
     Closure_change_status(w, t, CLOSURE_SUSPENDED, CLOSURE_RUNNING);
     t->frame->worker = w;
@@ -200,12 +202,11 @@ static Closure *setup_call_parent_resumption(__cilkrts_worker *const w,
     return t;
 }
 
-CHEETAH_INTERNAL
 void Cilk_set_return(__cilkrts_worker *const w) {
 
     Closure *t;
 
-    cilkrts_alert(ALERT_RETURN, w, "(Cilk_set_return)");
+    cilkrts_alert(RETURN, w, "(Cilk_set_return)");
 
     deque_lock_self(w);
     t = deque_peek_bottom(w, w->self);
@@ -217,37 +218,32 @@ void Cilk_set_return(__cilkrts_worker *const w) {
     // all rmaps from child or right sibling must have been reduced
     CILK_ASSERT(w, t->child_rmap == (cilkred_map *)NULL &&
                        t->right_rmap == (cilkred_map *)NULL);
+    CILK_ASSERT(w, t->call_parent);
+    CILK_ASSERT(w, t->spawn_parent == NULL);
+    CILK_ASSERT(w, (t->frame->flags & CILK_FRAME_DETACHED) == 0);
+    CILK_ASSERT(w, t->simulated_stolen == false);
 
-    if (t->call_parent != NULL) {
-        CILK_ASSERT(w, t->spawn_parent == NULL);
-        CILK_ASSERT(w, (t->frame->flags & CILK_FRAME_DETACHED) == 0);
-        CILK_ASSERT(w, t->simulated_stolen == false);
+    Closure *call_parent = t->call_parent;
+    Closure *t1 = deque_xtract_bottom(w, w->self);
 
-        Closure *call_parent = t->call_parent;
-        Closure *t1 = deque_xtract_bottom(w, w->self);
+    USE_UNUSED(t1);
+    CILK_ASSERT(w, t == t1);
+    CILK_ASSERT(w, __cilkrts_stolen(t->frame));
 
-        USE_UNUSED(t1);
-        CILK_ASSERT(w, t == t1);
-        CILK_ASSERT(w, __cilkrts_stolen(t->frame));
+    t->frame = NULL;
+    Closure_unlock(w, t);
 
-        t->frame = NULL;
-        Closure_unlock(w, t);
+    Closure_lock(w, call_parent);
+    CILK_ASSERT(w, call_parent->fiber == t->fiber);
+    t->fiber = NULL;
 
-        Closure_lock(w, call_parent);
-        CILK_ASSERT(w, call_parent->fiber == t->fiber);
-        t->fiber = NULL;
+    Closure_remove_callee(w, call_parent);
+    setup_call_parent_resumption(w, call_parent);
+    Closure_unlock(w, call_parent);
 
-        Closure_remove_callee(w, call_parent);
-        setup_call_parent_resumption(w, call_parent);
-        Closure_unlock(w, call_parent);
+    Closure_destroy(w, t);
+    deque_add_bottom(w, call_parent, w->self);
 
-        Closure_destroy(w, t);
-        deque_add_bottom(w, call_parent, w->self);
-
-    } else {
-        CILK_ASSERT(w, t == w->g->invoke_main);
-        Closure_unlock(w, t);
-    }
     deque_unlock_self(w);
 }
 
@@ -258,8 +254,8 @@ void Cilk_set_return(__cilkrts_worker *const w) {
 
 static Closure *unconditional_steal(__cilkrts_worker *const w,
                                     Closure *parent) {
-    cilkrts_alert(ALERT_STEAL, w,
-                  "(unconditional_steal) promoted closure %p", parent);
+    cilkrts_alert(STEAL, w, "(unconditional_steal) promoted closure %p",
+                  (void *)parent);
 
     Closure_assert_ownership(w, parent);
     CILK_ASSERT(w, parent->simulated_stolen);
@@ -278,17 +274,16 @@ static Closure *unconditional_steal(__cilkrts_worker *const w,
     return parent;
 }
 
-
 static Closure *provably_good_steal_maybe(__cilkrts_worker *const w,
                                           Closure *parent) {
 
     Closure_assert_ownership(w, parent);
-    // cilkrts_alert(ALERT_STEAL, w, "(provably_good_steal_maybe) cl %p",
-    //               parent);
+    // cilkrts_alert(STEAL, w, "(provably_good_steal_maybe) cl %p",
+    //               (void *)parent);
     CILK_ASSERT(w, !w->l->provably_good_steal);
 
     if (!Closure_has_children(parent) && parent->status == CLOSURE_SUSPENDED) {
-        // cilkrts_alert(ALERT_STEAL | ALERT_SYNC, w,
+        // cilkrts_alert(STEAL | ALERT_SYNC, w,
         //         "(provably_good_steal_maybe) completing a sync");
 
         CILK_ASSERT(w, parent->frame != NULL);
@@ -301,8 +296,9 @@ static Closure *provably_good_steal_maybe(__cilkrts_worker *const w,
         CILK_ASSERT(w, parent->owner_ready_deque == NO_WORKER);
         Closure_make_ready(parent);
 
-        cilkrts_alert(ALERT_STEAL | ALERT_SYNC, w,
-                      "(provably_good_steal_maybe) returned %p", parent);
+        cilkrts_alert(STEAL | ALERT_SYNC, w,
+                      "(provably_good_steal_maybe) returned %p",
+                      (void *)parent);
 
         return parent;
     }
@@ -354,8 +350,8 @@ Closure *Closure_return(__cilkrts_worker *const w, Closure *child) {
     CILK_ASSERT(w, child->call_parent == NULL);
     CILK_ASSERT(w, parent != NULL);
 
-    cilkrts_alert(ALERT_RETURN, w, "(Closure_return) child %p, parent %p",
-                  child, parent);
+    cilkrts_alert(RETURN, w, "(Closure_return) child %p, parent %p",
+                  (void *)child, (void *)parent);
 
     /* The frame should have passed a sync successfully meaning it
        has not accumulated any maps from its children and the
@@ -458,10 +454,10 @@ Closure *Closure_return(__cilkrts_worker *const w, Closure *child) {
     Closure_unlock(w, child);
     Closure_unlock(w, parent);
     if (left) {
-        active = __cilkrts_internal_merge_two_rmaps(w, left, active);
+        active = merge_two_rmaps(w, left, active);
    }
     if (right) {
-        active = __cilkrts_internal_merge_two_rmaps(w, active, right);
+        active = merge_two_rmaps(w, active, right);
     }
     w->reducer_map = active;
     Closure_lock(w, parent);
@@ -505,7 +501,7 @@ Closure *Closure_return(__cilkrts_worker *const w, Closure *child) {
 
     --parent->join_counter;
 
-    if(parent->simulated_stolen) {
+    if (parent->simulated_stolen) {
         // parent stolen via simulated steal on worker's own deque
         res = unconditional_steal(w, parent); // must succeed
         CILK_ASSERT(w, parent->fiber && (parent->fiber_child == NULL));
@@ -536,9 +532,9 @@ Closure *Closure_return(__cilkrts_worker *const w, Closure *child) {
     cilkred_map *active = parent->user_rmap;
     atomic_store_explicit(&parent->child_rmap, NULL, memory_order_relaxed);
     parent->user_rmap = NULL;
-    w->reducer_map = __cilkrts_internal_merge_two_rmaps(w, child, active);
+    w->reducer_map = merge_two_rmaps(w, child, active);
 
-    if(parent->simulated_stolen) {
+    if (parent->simulated_stolen) {
         atomic_store_explicit(&parent->child_rmap, w->reducer_map,
                               memory_order_relaxed);
         // force the continuation to create new views
@@ -560,7 +556,7 @@ Closure *Closure_return(__cilkrts_worker *const w, Closure *child) {
  * CLOSURE_RETURNING
  */
static Closure *return_value(__cilkrts_worker *const w, Closure *t) {
-    cilkrts_alert(ALERT_RETURN, w, "(return_value) closure %p", t);
+    cilkrts_alert(RETURN, w, "(return_value) closure %p", (void *)t);
 
     Closure *res = NULL;
     CILK_ASSERT(w, t->status == CLOSURE_RETURNING);
@@ -574,7 +570,7 @@ static Closure *return_value(__cilkrts_worker *const w, Closure *t) {
         // Not supported at the moment
     }*/
 
-    cilkrts_alert(ALERT_RETURN, w, "(return_value) returning closure %p", t);
+    cilkrts_alert(RETURN, w, "(return_value) returning closure %p", (void *)t);
 
     return res;
 }
@@ -602,7 +598,7 @@ void Cilk_exception_handler(char *exn) {
 
     CILK_ASSERT(w, t);
     Closure_lock(w, t);
-    cilkrts_alert(ALERT_EXCEPT, w, "(Cilk_exception_handler) closure %p!", t);
+    cilkrts_alert(EXCEPT, w, "(Cilk_exception_handler) closure %p!", (void *)t);
 
     /* ANGE: resetting the E pointer since we are handling the exception */
     reset_exception_pointer(w, t);
@@ -617,8 +613,7 @@ void Cilk_exception_handler(char *exn) {
     __cilkrts_stack_frame **tail =
         atomic_load_explicit(&w->tail, memory_order_relaxed);
     if (head > tail) {
-        cilkrts_alert(ALERT_EXCEPT, w,
-                      "(Cilk_exception_handler) this is a steal!");
+        cilkrts_alert(EXCEPT, w, "(Cilk_exception_handler) this is a steal!");
         if (NULL != exn)
             t->user_exn.exn = exn;
 
@@ -812,8 +807,8 @@ static Closure *promote_child(__cilkrts_worker *const w,
      * stacklet is stolen, and its call parent is promoted into full and
     * suspended
     */
-    CILK_ASSERT(w,
-                cl == w->g->invoke_main || cl->spawn_parent || cl->call_parent);
+    CILK_ASSERT(w, cl == w->g->root_closure || cl->spawn_parent ||
+                       cl->call_parent);
 
     Closure *spawn_parent = NULL;
     /* JFC: Should this load be relaxed or acquire? */
@@ -971,9 +966,9 @@ static Closure *extract_top_spawning_closure(__cilkrts_worker *const w,
      * and steal the parent
     */
     child = promote_child(w, victim_w, cl, &res);
-    cilkrts_alert(ALERT_STEAL, w,
-                  "(Closure_steal) promote gave cl/res/child = %p/%p/%p",
-                  cl, res, child);
+    cilkrts_alert(STEAL, w,
+                  "(Closure_steal) promote gave cl/res/child = %p/%p/%p",
+                  (void *)cl, (void *)res, (void *)child);
 
     /* detach the parent */
     if (res == (Closure *)NULL) {
@@ -1008,6 +1003,18 @@ static Closure *Closure_steal(__cilkrts_worker *const w, int victim) {
     Closure *cl;
     Closure *res = (Closure *)NULL;
     __cilkrts_worker *victim_w;
+    victim_w = w->g->workers[victim];
+
+    // Fast test for an unsuccessful steal attempt using only read operations.
+    // This fast test seems to improve parallel performance.
+    {
+        __cilkrts_stack_frame **head =
+            atomic_load_explicit(&victim_w->head, memory_order_relaxed);
+        __cilkrts_stack_frame **tail =
+            atomic_load_explicit(&victim_w->tail, memory_order_relaxed);
+        if (head >= tail)
+            return NULL;
+    }
 
     //----- EVENT_STEAL_ATTEMPT
     if (deque_trylock(w, victim) == 0) {
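The fast test added above rejects steal attempts against apparently empty deques using two relaxed atomic loads and no locks. A compilable sketch of the same idea follows; the victim and frame types are stand-ins, not the runtime's structures. The reason relaxed ordering is safe here is that a stale read can only cause a missed steal attempt, never an unsound one, because the real steal path re-checks head and tail under the deque lock and the Dekker protocol:

    #include <stdatomic.h>
    #include <stdbool.h>

    typedef struct frame frame; /* opaque; only pointer comparisons matter */

    struct victim {
        _Atomic(frame **) head; /* next slot a thief would take */
        _Atomic(frame **) tail; /* next free slot for the owner */
    };

    static bool steal_fast_reject(struct victim *v) {
        frame **head = atomic_load_explicit(&v->head, memory_order_relaxed);
        frame **tail = atomic_load_explicit(&v->tail, memory_order_relaxed);
        /* head >= tail: the deque looks empty, so give up without locking. */
        return head >= tail;
    }

    int main(void) {
        struct victim v;
        frame *slots[4];
        atomic_store(&v.head, &slots[0]);
        atomic_store(&v.tail, &slots[0]);
        return steal_fast_reject(&v) ? 0 : 1; /* empty deque -> reject */
    }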
@@ -1022,18 +1029,17 @@ static Closure *Closure_steal(__cilkrts_worker *const w, int victim) {
         return NULL;
     }
 
-    // cilkrts_alert(ALERT_STEAL, "[%d]: trying steal from W%d; cl=%p",
-    //               victim, cl);
-    victim_w = w->g->workers[victim];
+    // cilkrts_alert(STEAL, "[%d]: trying steal from W%d; cl=%p",
+    //               (void *)victim, (void *)cl);
 
     switch (cl->status) {
     case CLOSURE_RUNNING:
 
         /* send the exception to the worker */
         if (do_dekker_on(w, victim_w, cl)) {
-            cilkrts_alert(ALERT_STEAL, w,
-                          "(Closure_steal) can steal from W%d; cl=%p",
-                          victim, cl);
+            cilkrts_alert(STEAL, w,
+                          "(Closure_steal) can steal from W%d; cl=%p",
+                          victim, (void *)cl);
             res = extract_top_spawning_closure(w, victim_w, cl);
 
             // at this point, more steals can happen from the victim.
@@ -1049,11 +1055,12 @@ static Closure *Closure_steal(__cilkrts_worker *const w, int victim) {
             // ANGE: finish the promotion process in finish_promote
             finish_promote(w, victim_w, res, has_frames_to_promote);
 
-            cilkrts_alert(ALERT_STEAL, w,
-                          "(Closure_steal) success; res %p has "
-                          "fiber %p; child %p has fiber %p",
-                          res, res->fiber, res->right_most_child,
-                          res->right_most_child->fiber);
+            cilkrts_alert(STEAL, w,
+                          "(Closure_steal) success; res %p has "
+                          "fiber %p; child %p has fiber %p",
+                          (void *)res, (void *)res->fiber,
+                          (void *)res->right_most_child,
+                          (void *)res->right_most_child->fiber);
             CILK_ASSERT(w, res->frame->worker == victim_w);
             Closure_unlock(w, res);
         } else {
@@ -1070,8 +1077,12 @@ static Closure *Closure_steal(__cilkrts_worker *const w, int victim) {
         break;
 
     default:
-        cilkrts_bug(victim_w, "Bug: %s closure in ready deque",
-                    Closure_status_to_str(cl->status));
+        // It's possible that this steal attempt peeked the root closure
+        // from the top of a deque while a new Cilkified region was
+        // starting.
+        if (cl != w->g->root_closure)
+            cilkrts_bug(victim_w, "Bug: %s closure in ready deque",
+                        Closure_status_to_str(cl->status));
        }
     } else {
         deque_unlock(w, victim);
@@ -1094,25 +1105,26 @@ static Closure *Closure_steal(__cilkrts_worker *const w, int victim) {
  ***/
 void promote_own_deque(__cilkrts_worker *w) {
 
-    if(deque_trylock(w, w->self) == 0) {
-        cilkrts_bug(w,
-            "Bug: failed to acquire deque lock when promoting own deque");
+    if (deque_trylock(w, w->self) == 0) {
+        cilkrts_bug(
+            w, "Bug: failed to acquire deque lock when promoting own deque");
         return;
     }
 
     bool done = false;
-    while(!done) {
+    while (!done) {
         Closure *cl = deque_peek_top(w, w->self);
         CILK_ASSERT(w, cl);
-        CILK_ASSERT(w, cl->status== CLOSURE_RUNNING);
+        CILK_ASSERT(w, cl->status == CLOSURE_RUNNING);
 
-        if(Closure_trylock(w, cl) == 0) {
+        if (Closure_trylock(w, cl) == 0) {
             deque_unlock(w, w->self);
-            cilkrts_bug(w,
+            cilkrts_bug(
+                w,
                 "Bug: failed to acquire closure lock when promoting own deque");
             return;
         }
 
-        if(do_dekker_on(w, w, cl)) {
+        if (do_dekker_on(w, w, cl)) {
             // unfortunately this function releases both locks
             Closure *res = extract_top_spawning_closure(w, w, cl);
             CILK_ASSERT(w, res);
@@ -1133,7 +1145,7 @@ void promote_own_deque(__cilkrts_worker *w) {
         } else {
             Closure_unlock(w, cl);
             deque_unlock(w, w->self);
-            done = true; // we can break out; no more frames to promote
+            done = true; // we can break out; no more frames to promote
         }
     }
 }
@@ -1160,24 +1172,21 @@ void longjmp_to_user_code(__cilkrts_worker *w, Closure *t) {
         // only set in the personality function.)
         if ((sf->flags & CILK_FRAME_EXCEPTING) == 0) {
             CILK_ASSERT(w, t->orig_rsp == NULL);
-            CILK_ASSERT(w, ((char *)FP(sf) > fiber->m_stack) &&
-                               ((char *)FP(sf) < fiber->m_stack_base));
-            CILK_ASSERT(w, ((char *)SP(sf) > fiber->m_stack) &&
-                               ((char *)SP(sf) < fiber->m_stack_base));
+            CILK_ASSERT(w, (sf->flags & CILK_FRAME_LAST) ||
+                               in_fiber(fiber, (char *)FP(sf)));
+            CILK_ASSERT(w, in_fiber(fiber, (char *)SP(sf)));
         }
         sf->flags &= ~CILK_FRAME_EXCEPTING;
 
         w->l->provably_good_steal = false;
     } else { // this is stolen work; the fiber is a new fiber
-        // This is the first time we run the root frame, invoke_main.
-        // init_fiber_run is going to set up the fiber for running user code
-        // and "longjmp" into invoke_main (at the very beginning of the
-        // function) after the user fiber is set up.
-        volatile bool *initialized = &w->g->invoke_main_initialized;
-        if (t == w->g->invoke_main && *initialized == 0) {
+        // This is the first time we run the root closure in this Cilkified
+        // region.  The closure has been completely set up at this point by
+        // invoke_cilkified_root().  We just need to jump to the user code.
+        volatile bool *initialized = &w->g->root_closure_initialized;
+        if (t == w->g->root_closure && *initialized == 0) {
             *initialized = true;
-            init_fiber_run(w, fiber, sf);
-        } else if(!t->simulated_stolen) {
+        } else if (!t->simulated_stolen) {
             void *new_rsp = sysdep_reset_stack_for_resume(fiber, sf);
             USE_UNUSED(new_rsp);
             CILK_ASSERT(w, SP(sf) == new_rsp);
@@ -1189,7 +1198,7 @@ void longjmp_to_user_code(__cilkrts_worker *w, Closure *t) {
 }
 
 __attribute__((noreturn)) void longjmp_to_runtime(__cilkrts_worker *w) {
-    cilkrts_alert(ALERT_SCHED | ALERT_FIBER, w, "(longjmp_to_runtime)");
+    cilkrts_alert(SCHED | ALERT_FIBER, w, "(longjmp_to_runtime)");
 
     CILK_STOP_TIMING(w, INTERVAL_WORK);
     CILK_START_TIMING(w, INTERVAL_SCHED);
@@ -1205,7 +1214,7 @@ __attribute__((noreturn)) void longjmp_to_runtime(__cilkrts_worker *w) {
    SYNC_NOT_READY to suspend the frame.
 */
 int Cilk_sync(__cilkrts_worker *const w, __cilkrts_stack_frame *frame) {
-    // cilkrts_alert(ALERT_SYNC, w, "(Cilk_sync) frame %p", frame);
+    // cilkrts_alert(SYNC, w, "(Cilk_sync) frame %p", (void *)frame);
 
     Closure *t;
     int res = SYNC_READY;
@@ -1246,16 +1255,19 @@ int Cilk_sync(__cilkrts_worker *const w, __cilkrts_stack_frame *frame) {
     }
 
     if (Closure_has_children(t)) {
-        cilkrts_alert(ALERT_SYNC, w,
-                      "(Cilk_sync) Closure %p has outstanding children", t);
+        cilkrts_alert(SYNC, w,
+                      "(Cilk_sync) Closure %p has outstanding children",
+                      (void *)t);
 
         // if we are syncing from the personality function (i.e. if an
         // exception in the continuation was thrown), we still need this
        // fiber for unwinding.
         if (t->user_exn.exn == NULL) {
             w->l->fiber_to_free = t->fiber;
+        } else {
+            t->saved_throwing_fiber = t->fiber;
         }
-        t->fiber = NULL; /* JFC: is this a leak? */
+        t->fiber = NULL;
 
         // place holder for reducer map; the views in tlmm (if any) are
         // updated by the last strand in Closure t before sync; need to
         // reduce these when successful provably good steal occurs
@@ -1265,8 +1277,8 @@ int Cilk_sync(__cilkrts_worker *const w, __cilkrts_stack_frame *frame) {
         t->user_rmap = reducers; /* set this after state change to suspended */
         res = SYNC_NOT_READY;
     } else {
-        cilkrts_alert(ALERT_SYNC, w,
-                      "(Cilk_sync) closure %p synced successfully", t);
+        cilkrts_alert(SYNC, w, "(Cilk_sync) closure %p synced successfully",
+                      (void *)t);
         setup_for_sync(w, t);
     }
 
@@ -1289,10 +1301,10 @@ int Cilk_sync(__cilkrts_worker *const w, __cilkrts_stack_frame *frame) {
         if (child_rmap) {
             atomic_store_explicit(&t->child_rmap, NULL, memory_order_relaxed);
             /* reducer_map may be accessed without lock */
-            w->reducer_map = __cilkrts_internal_merge_two_rmaps(w, child_rmap,
-                                                                w->reducer_map);
+            w->reducer_map = merge_two_rmaps(w, child_rmap, w->reducer_map);
         }
-        if(t->simulated_stolen) t->simulated_stolen = false;
+        if (t->simulated_stolen)
+            t->simulated_stolen = false;
     }
 
     return res;
@@ -1303,7 +1315,7 @@ static Closure *do_what_it_says(__cilkrts_worker *w, Closure *t) {
     Closure *res = NULL;
     __cilkrts_stack_frame *f;
 
-    cilkrts_alert(ALERT_SCHED, w, "(do_what_it_says) closure %p", t);
+    cilkrts_alert(SCHED, w, "(do_what_it_says) closure %p", (void *)t);
 
     Closure_lock(w, t);
     switch (t->status) {
@@ -1311,12 +1323,12 @@ static Closure *do_what_it_says(__cilkrts_worker *w, Closure *t) {
         // ANGE: anything we need to free must have been freed at this point
         CILK_ASSERT(w, w->l->fiber_to_free == NULL);
 
-        cilkrts_alert(ALERT_SCHED, w, "(do_what_it_says) CLOSURE_READY");
+        cilkrts_alert(SCHED, w, "(do_what_it_says) CLOSURE_READY");
         /* just execute it */
         setup_for_execution(w, t);
         f = t->frame;
         // t->fiber->resume_sf = f; // I THINK this works
-        cilkrts_alert(ALERT_SCHED, w, "(do_what_it_says) resume_sf = %p", f);
+        cilkrts_alert(SCHED, w, "(do_what_it_says) resume_sf = %p", (void *)f);
         CILK_ASSERT(w, f);
         USE_UNUSED(f);
         Closure_unlock(w, t);
@@ -1328,18 +1340,21 @@ static Closure *do_what_it_says(__cilkrts_worker *w, Closure *t) {
         deque_unlock_self(w);
 
         /* now execute it */
-        cilkrts_alert(ALERT_SCHED, w, "(do_what_it_says) Jump into user code");
+        cilkrts_alert(SCHED, w, "(do_what_it_says) Jump into user code");
 
         // CILK_ASSERT(w, w->l->runtime_fiber != t->fiber);
         // cilk_fiber_suspend_self_and_resume_other(w->l->runtime_fiber,
         //                                          t->fiber);
-        // cilkrts_alert(ALERT_SCHED, w, "(do_what_it_says) Back from user
+        // cilkrts_alert(SCHED, w, "(do_what_it_says) Back from user
         // code");
 
+        // longjmp invalidates non-volatile variables
+        __cilkrts_worker *volatile w_save = w;
         if (__builtin_setjmp(w->l->rts_ctx) == 0) {
             worker_change_state(w, WORKER_RUN);
             longjmp_to_user_code(w, t);
         } else {
-            CILK_ASSERT(w, w == __cilkrts_get_tls_worker());
+            w = w_save;
+            CILK_ASSERT_POINTER_EQUAL(w, w, __cilkrts_get_tls_worker());
             worker_change_state(w, WORKER_SCHED);
             // CILK_ASSERT(w, t->fiber == w->l->fiber_to_free);
             if (w->l->fiber_to_free) {
@@ -1351,7 +1366,7 @@ static Closure *do_what_it_says(__cilkrts_worker *w, Closure *t) {
         break; // ?
 
     case CLOSURE_RETURNING:
-        cilkrts_alert(ALERT_SCHED, w, "(do_what_it_says) CLOSURE_RETURNING");
+        cilkrts_alert(SCHED, w, "(do_what_it_says) CLOSURE_RETURNING");
         // the return protocol assumes t is not locked, and everybody
         // will respect the fact that t is returning
         Closure_unlock(w, t);
@@ -1360,9 +1375,8 @@ static Closure *do_what_it_says(__cilkrts_worker *w, Closure *t) {
         break; // ?
 
     default:
-        cilkrts_alert(ALERT_SCHED, w, "(do_what_it_says) got status %d",
-                      t->status);
-        cilkrts_bug(w, "BUG in do_what_it_says()");
+        cilkrts_bug(w, "do_what_it_says() invalid closure status %s",
+                    Closure_status_to_str(t->status));
         break;
     }
 
@@ -1396,7 +1411,7 @@ void worker_scheduler(__cilkrts_worker *w, Closure *t) {
     while (!t && !atomic_load_explicit(&w->g->done, memory_order_acquire)) {
         CILK_START_TIMING(w, INTERVAL_SCHED);
         CILK_START_TIMING(w, INTERVAL_IDLE);
-        unsigned int victim = rts_rand(w) % w->g->options.nproc;
+        unsigned int victim = rts_rand(w) % w->g->nworkers;
         if (victim != w->self) {
             t = Closure_steal(w, victim);
         }
@@ -1437,7 +1452,10 @@ void worker_scheduler(__cilkrts_worker *w, Closure *t) {
             }
         }
         CILK_START_TIMING(w, INTERVAL_SCHED);
-        if (!atomic_load_explicit(&w->g->done, memory_order_acquire)) {
+        // If one Cilkified region stops and another one starts, then a worker
+        // can reach this point with t == NULL and w->g->done == false.  Check
+        // that t is not NULL before calling do_what_it_says.
+        if (t) {
             // if provably-good steal happens, do_what_it_says will return
             // the next closure to execute
             t = do_what_it_says(w, t);
diff --git a/runtime/types.h b/runtime/types.h
index 2b61162d..d1d2cfba 100644
--- a/runtime/types.h
+++ b/runtime/types.h
@@ -4,6 +4,7 @@
 #include <stdint.h>
 
 typedef uint32_t worker_id;
+#define WORKER_ID_FMT PRIu32
 typedef struct __cilkrts_worker __cilkrts_worker;
 typedef struct __cilkrts_stack_frame __cilkrts_stack_frame;
 typedef struct global_state global_state;
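The w_save dance in do_what_it_says exists because C guarantees nothing about non-volatile automatic variables whose values change between a setjmp and the corresponding longjmp; saving the pointer through a volatile-qualified lvalue and restoring it after the longjmp returns keeps it out of a register the longjmp may clobber. A minimal demonstration of the pattern (all names here are illustrative):

    #include <setjmp.h>
    #include <stdio.h>

    static jmp_buf ctx;

    static void leave(void) { longjmp(ctx, 1); }

    int main(void) {
        int value = 1;
        /* volatile-qualified pointer: its value is guaranteed to survive
           the longjmp, unlike an ordinary automatic variable. */
        int *volatile saved = &value;
        if (setjmp(ctx) == 0) {
            leave(); /* jumps back to setjmp, returning nonzero */
        } else {
            printf("restored: %d\n", *saved); /* prints 1 */
        }
        return 0;
    }

Relatedly, the WORKER_ID_FMT macro added to types.h is meant to be spliced into format strings in the usual PRIu32 style, e.g. fprintf(stderr, "worker %" WORKER_ID_FMT "\n", id), assuming <inttypes.h> is visible at the use site.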