From 85d4173c7494c28e39229bf13e82d5b89d8a6468 Mon Sep 17 00:00:00 2001 From: Balint Joo Date: Mon, 3 Jun 2024 15:52:57 +0000 Subject: [PATCH] Addressed further comments from @masterleinad --- benchmarks/async_alloc/async_alloc.cpp | 31 +++++++++++++------------- core/src/Cuda/Kokkos_CudaSpace.cpp | 16 ++++++------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/benchmarks/async_alloc/async_alloc.cpp b/benchmarks/async_alloc/async_alloc.cpp index 075702891d9..bb3f287fce5 100644 --- a/benchmarks/async_alloc/async_alloc.cpp +++ b/benchmarks/async_alloc/async_alloc.cpp @@ -14,7 +14,7 @@ std::vector> inner_loop_times; // std::pair test(bool up) { int iters = 50; - size_t minimum = 8 / sizeof(float); // 64K + size_t minimum = 8 / sizeof(float); size_t gb = 1024 * 1024 * 1024 / sizeof(float); // number of floats per GiB size_t maximum = gb; // on 32 bit, we make 1GiB the max @@ -30,8 +30,10 @@ std::pair test(bool up) { } Kokkos::Timer first_alloc_timer; - { // Prime the pump - first long alloc -- Time it. - Kokkos::View dummy("unlabeled", 64); + { // Prime the pump - first long alloc -- Time it + // 64 bytes is an arbitrary number here. . + Kokkos::View dummy( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "unlabeled"), 64); } double first_alloc_time = first_alloc_timer.seconds(); @@ -43,7 +45,8 @@ std::pair test(bool up) { for (size_t num : sizes) { inner_loop_timer.reset(); for (int i = 0; i < iters; i++) { - Kokkos::View a("unlabeled", num); + Kokkos::View a( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "unlabeled"), num); } double inner_loop_time = inner_loop_timer.seconds(); @@ -67,7 +70,7 @@ int main(int argc, char *argv[]) { // Check the env var for reporting char *env_string = getenv("KOKKOS_CUDA_MEMPOOL_SIZE"); - std::cout << "Async Malloc Benchmark: KOKKOS_CUDA_MEMPOOL_SIZE is "; + std::cout << "# Async Malloc Benchmark: KOKKOS_CUDA_MEMPOOL_SIZE is "; if (env_string == nullptr) std::cout << "not set,"; @@ -75,28 +78,26 @@ int main(int argc, char *argv[]) { std::cout << " " << env_string << ","; if (up) - std::cout << " memory cycling upwards \n"; + std::cout << "# memory cycling upwards \n"; else - std::cout << " memory_cycling downwards \n"; + std::cout << "# memory_cycling downwards \n"; std::cout << std::flush; Kokkos::initialize(argc, argv); - inner_loop_times.reserve(34); - // Love structured bindings? const auto [first_alloc_time, alloc_loop_time] = test(up); - std::cout << "First Alloc: " << 64 << " bytes, " << first_alloc_time + if (!up) std::reverse(inner_loop_times.begin(), inner_loop_times.end()); + + std::cout << "# First Alloc: " << 64 << " bytes, " << first_alloc_time << " sec\n"; - std::cout << "Test Alloc Loop Total: " << alloc_loop_time << " sec\n"; - std::cout << "Alloc Loop Timings:\n"; - std::cout << "===================\n"; + std::cout << "# Test Alloc Loop Total: " << alloc_loop_time << " sec\n"; + std::cout << "# Alloc Loop Timings:\n"; + std::cout << "# ===================\n"; std::cout << "# size (B) \t time (sec) \n"; std::cout << "# -----------------------\n"; - std::sort(inner_loop_times.begin(), inner_loop_times.end(), - [=](const auto &a, const auto &b) { return a.first < b.first; }); for (auto pair : inner_loop_times) { std::cout << pair.first << ", " << pair.second << "\n"; } diff --git a/core/src/Cuda/Kokkos_CudaSpace.cpp b/core/src/Cuda/Kokkos_CudaSpace.cpp index 3f46866143f..603d06af00b 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -34,7 +34,6 @@ #include #include -// #include #include #include @@ -233,15 +232,15 @@ bool initializeMempool(const int device_id, const cudaStream_t stream, // Handle exception in case the string is unconvertible try { - requested_size = static_cast(std::stod(mempool_size_string)); + requested_size = std::stod(mempool_size_string); } catch (...) { std::cerr << "Unable to convert " << mempool_size_string << " to a number\n"; return false; } - // Check for non-positive size memory requests - if (requested_size <= 0) { + // Check for negative size memory requests (zero is allowed) + if (requested_size < 0) { std::cerr << "Negative amount of memory requested in allocation\n"; return false; } @@ -250,17 +249,18 @@ bool initializeMempool(const int device_id, const cudaStream_t stream, requested_size *= factor; // Check we are not asking for memory that is greater than what size_t can - // hold. Since requested can be the larger I convert the maximum of size_t - // to a double and compare those + // hold. Since the requested size can be the larger I convert the maximum of + // size_t to a double and compare those double max_size_t = static_cast(std::numeric_limits::max()); if (requested_size > max_size_t) { std::cerr << "Requested amount of memory " << requested_size << " exceeds " - << "maximum alloatable size " << max_size_t << "\n"; + << "maximum allocatable size " << max_size_t << "\n"; return false; } // At this point requested_size should be appropriate - // neither too big nor negative. + // neither too big nor negative - even with the ceiling it should + // not be bigger than max_size_t. Safe to cast size_t n_bytes = static_cast(std::ceil(requested_size)); // We set up the default memory pool