Merge pull request #271 from chillenzer/topic-flatterScatter

FlatterScatter
alpaka-group · Nov 26, 2024 · 127b29f · 127b29f
2 parents 76c4f46 + 7846a36
commit 127b29f
Show file tree

Hide file tree

Showing 40 changed files with 5,658 additions and 19,571 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -1,10 +1,7 @@
-name: pre-commit
-on:
-  pull_request:
-  push:
-    branches: [main, test-me-*]
+name: Continuous Integration
+on: [push, pull_request]
 jobs:
-  main:
+  pre-commit:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
@@ -14,3 +11,33 @@ jobs:
       - uses: pre-commit/[email protected]
       - uses: pre-commit-ci/[email protected]
         if: always()
+  cpu-tests:
+    # This action only runs on various CPU backends.
+    # As such, this is not a fully-fledged production-like test.
+    # Hopefully, it will still save us from a few stupid mistakes.
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - run: sudo apt update && sudo apt install libboost-all-dev
+      - run: mkdir build_dir
+      - working-directory: build_dir
+        run: |
+          git submodule init && git submodule update
+      - working-directory: build_dir
+        run: |
+          cmake .. \
+          -DCMAKE_CXX_FLAGS="-std=c++20 -g" \
+          -Dalpaka_CXX_STANDARD=20 \
+          -DmallocMC_CATCH2_PROVIDER=intern \
+          -Dalpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE:BOOL=ON \
+          -Dalpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE:BOOL=ON \
+          -Dalpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE:BOOL=ON \
+          -Dalpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE:BOOL=ON
+      - working-directory: build_dir
+        run: make -j tests examples
+      - working-directory: build_dir
+        run: ./tests
+      - working-directory: build_dir
+        run: ./mallocMC_Example01
+      - working-directory: build_dir
+        run: ./mallocMC_Example03
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "thirdParty/catch2"]
+	path = thirdParty/catch2
+	url = https://github.com/catchorg/catch2
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -13,27 +13,21 @@ set(mallocMC_ALPAKA_PROVIDER "intern" CACHE STRING "Select which alpaka is used"
 set_property(CACHE mallocMC_ALPAKA_PROVIDER PROPERTY STRINGS "intern;extern")
 mark_as_advanced(mallocMC_ALPAKA_PROVIDER)
 if(${mallocMC_ALPAKA_PROVIDER} STREQUAL "intern")
-    set(alpaka_BUILD_EXAMPLES OFF)
-    set(BUILD_TESTING OFF)
-    add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/alpaka ${CMAKE_BINARY_DIR}/alpaka)
+  set(alpaka_BUILD_EXAMPLES OFF)
+  set(BUILD_TESTING OFF)
+  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/alpaka ${CMAKE_BINARY_DIR}/alpaka)
 else()
-    find_package(alpaka HINTS $ENV{ALPAKA_ROOT})
+  find_package(alpaka HINTS $ENV{ALPAKA_ROOT})
 endif()
 
 if(NOT TARGET alpaka::alpaka)
-    message(FATAL "Required mallocMC dependency alpaka could not be found!")
+  message(FATAL_ERROR "Required mallocMC dependency alpaka could not be found!") # FATAL_ERROR aborts configuration; a bare FATAL is only printed as message text
 endif()
 
 # Catch2
 set(mallocMC_CATCH2_PROVIDER "intern" CACHE STRING "Select which Catch2 is used")
 set_property(CACHE mallocMC_CATCH2_PROVIDER PROPERTY STRINGS "intern;extern")
 mark_as_advanced(mallocMC_CATCH2_PROVIDER)
-if(${mallocMC_CATCH2_PROVIDER} STREQUAL "intern")
-    add_library(Catch2::Catch2 INTERFACE IMPORTED)
-    target_include_directories(Catch2::Catch2 INTERFACE ${CMAKE_CURRENT_LIST_DIR}/thirdParty/catch2/include)
-else()
-    find_package(Catch2 CONFIG REQUIRED)
-endif()
 
 # for installation, just copy include folder to install folder
 install(
@@ -44,11 +38,11 @@ install(
 # warnings
 add_library(warnings INTERFACE)
 if(CMAKE_COMPILER_IS_GNUCXX)
-    target_compile_options(warnings INTERFACE -Wall -Wshadow -Wno-unknown-pragmas -Wextra -Wno-unused-parameter -Wno-unused-local-typedefs)
+  target_compile_options(warnings INTERFACE -Wall -Wshadow -Wno-unknown-pragmas -Wextra -Wno-unused-parameter -Wno-unused-local-typedefs)
 elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
-    target_compile_options(warnings INTERFACE -Wall -Wshadow)
+  target_compile_options(warnings INTERFACE -Wall -Wshadow)
 elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")
-    target_compile_options(warnings INTERFACE -Minform=inform)
+  target_compile_options(warnings INTERFACE -Minform=inform)
 endif()
 
 # Executables
@@ -64,13 +58,32 @@ alpaka_add_executable(mallocMC_Example03 EXCLUDE_FROM_ALL examples/mallocMC_exam
 target_include_directories(mallocMC_Example03 PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
 target_link_libraries(mallocMC_Example03 PUBLIC alpaka::alpaka warnings)
 
-alpaka_add_executable(VerifyHeap EXCLUDE_FROM_ALL tests/verify_heap.cpp tests/verify_heap_config.hpp)
-target_include_directories(VerifyHeap PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
-target_link_libraries(VerifyHeap PUBLIC alpaka::alpaka warnings)
+add_custom_target(examples DEPENDS mallocMC_Example01 mallocMC_Example03)
 
-alpaka_add_executable(tests EXCLUDE_FROM_ALL tests/main.cpp tests/dimensions.cpp tests/policies.cpp)
-target_include_directories(tests PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
-target_link_libraries(tests PUBLIC alpaka::alpaka Catch2::Catch2 warnings)
+if(${mallocMC_CATCH2_PROVIDER} STREQUAL "intern")
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/thirdParty/catch2 ${CMAKE_BINARY_DIR}/catch2)
+  include(Catch)
+else()
+  # get Catch2 v3 and build it from source with the same C++ standard as the tests
+  Include(FetchContent)
+  FetchContent_Declare(Catch2 GIT_REPOSITORY https://github.com/catchorg/Catch2.git GIT_TAG v3.7.1)
+  FetchContent_MakeAvailable(Catch2)
+  target_compile_features(Catch2 PUBLIC cxx_std_20)
+  include(Catch)
 
+  # hide Catch2 cmake variables by default in cmake gui
+  get_cmake_property(variables VARIABLES)
+  foreach (var ${variables})
+    if (var MATCHES "^CATCH_")
+      mark_as_advanced(${var})
+    endif()
+  endforeach()
+endif()
 
-add_custom_target(examples DEPENDS mallocMC_Example01 mallocMC_Example03 VerifyHeap)
+file(GLOB_RECURSE testSources "${CMAKE_CURRENT_SOURCE_DIR}/tests/*/*.cpp")
+alpaka_add_executable(tests EXCLUDE_FROM_ALL ${testSources})
+catch_discover_tests(tests)
+source_group(TREE "${CMAKE_CURRENT_LIST_DIR}/tests" FILES ${testSources})
+target_compile_features(tests PRIVATE cxx_std_20)
+target_include_directories(tests PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
+target_link_libraries(tests PRIVATE alpaka::alpaka Catch2::Catch2WithMain)
diff --git a/LICENSE b/LICENSE
@@ -1,37 +1,40 @@
-mallocMC: Memory Allocation for Many Core Architectures
+/*
+  mallocMC: Memory Allocation for Many Core Architectures
 
-  based on the work of ScatterAlloc:
-  Massively Parallel Dynamic Memory Allocation for the GPU
+      based on the work of ScatterAlloc:
+      Massively Parallel Dynamic Memory Allocation for the GPU
 
-http://www.icg.tugraz.at/project/mvp
-https://www.hzdr.de/crp
+  http://www.icg.tugraz.at/project/mvp
+  https://www.hzdr.de/crp
 
-Copyright (C) 2012 Institute for Computer Graphics and Vision,
-                 Graz University of Technology
-Copyright (C) 2014-2015 Institute of Radiation Physics,
-                      Helmholtz-Zentrum Dresden - Rossendorf
+  Copyright (C) 2012 Institute for Computer Graphics and Vision,
+                     Graz University of Technology
+  Copyright (C) 2014-2024 Institute of Radiation Physics,
+                          Helmholtz-Zentrum Dresden - Rossendorf
 
-Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
-          Bernhard Kainz - kainz ( at ) icg.tugraz.at
-          Michael Kenzel - kenzel ( at ) icg.tugraz.at
-          Rene Widera - r.widera ( at ) hzdr.de
-          Axel Huebl - a.huebl ( at ) hzdr.de
-          Carlchristian Eckert - c.eckert ( at ) hzdr.de
+  Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
+              Bernhard Kainz - kainz ( at ) icg.tugraz.at
+              Michael Kenzel - kenzel ( at ) icg.tugraz.at
+              Rene Widera - r.widera ( at ) hzdr.de
+              Axel Huebl - a.huebl ( at ) hzdr.de
+              Carlchristian Eckert - c.eckert ( at ) hzdr.de
+              Julian Lenz - j.lenz ( at ) hzdr.de
 
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
 
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
 
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
diff --git a/README.md b/README.md
@@ -5,39 +5,41 @@ mallocMC: *Memory Allocator for Many Core Architectures*
 
 This project provides a framework for **fast memory managers** on **many core
 accelerators**. It is based on [alpaka](https://github.com/alpaka-group/alpaka)
-to run on many different accelerators and implements the *ScatterAlloc* algorithm.
-
+to run on many different accelerators and comes with multiple allocation
+algorithms out-of-the-box. Custom ones can be added easily due to the
+policy-based design.
 
 Usage
 -------
 
 Follow the step-by-step instructions in [Usage.md](Usage.md) to replace your
 `new`/`malloc` calls with a *blazingly fast* mallocMC heap! :rocket:
 
-
 Install
 -------
 
 mallocMC is header-only, but requires a few other C++ libraries to be
 available. Our installation notes can be found in [INSTALL.md](INSTALL.md).
 
-
 Contributing
 ------------
 
-Rules for contributions are found in [CONTRIBUTING.md](CONTRIBUTING.md).
+Rules for contributions are found in [CONTRIBUTING.md](./CONTRIBUTING.md).
 
-On the ScatterAlloc Algorithm
+On the Algorithms
 -----------------------------
 
-This library implements the *ScatterAlloc* algorithm, originally
+This library was originally inspired by the *ScatterAlloc* algorithm,
 [forked](https://en.wikipedia.org/wiki/Fork_%28software_development%29)
 from the **ScatterAlloc** project, developed by the
 [Managed Volume Processing](http://www.icg.tugraz.at/project/mvp)
 group at [Institute for Computer Graphics and Vision](http://www.icg.tugraz.at),
-TU Graz (kudos!).
+TU Graz (kudos!). The currently shipped algorithms are using similar ideas but
+differ from the original one significantly.
+
+From the original project page (which, to the best of our knowledge, no longer
+exists):
 
-From http://www.icg.tugraz.at/project/mvp/downloads :
 ```quote
 ScatterAlloc is a dynamic memory allocator for the GPU. It is
 designed concerning the requirements of massively parallel
@@ -51,21 +53,18 @@ execution time is almost independent of the thread count.
 ScatterAlloc is open source and easy to use in your CUDA projects.
 ```
 
-Original Homepage: http://www.icg.tugraz.at/project/mvp
-
-Our Homepage: https://www.hzdr.de/crp
-
-
-Branches
---------
-
-| *branch*    | *state* | *description*           |
-| ----------- | ------- | ----------------------- |
-| **master**  | [![Build Status Master](https://travis-ci.org/alpaka-group/mallocMC.png?branch=master)](https://travis-ci.org/alpaka-group/mallocMC "master") | our latest stable release |
-| **dev**     | [![Build Status Development](https://travis-ci.org/alpaka-group/mallocMC.png?branch=dev)](https://travis-ci.org/alpaka-group/mallocMC "dev") | our development branch - start and merge new branches here |
-| **tugraz**  | n/a | *ScatterAlloc* "upstream" branch: not backwards compatible mirror for algorithmic changes |
+Our Homepage: <https://www.hzdr.de/crp>
 
+Versions and Releases
+---------------------
 
+Official releases can be found in the
+[Github releases](https://github.com/alpaka-group/mallocMC/releases).
+We try to stick to [semantic versioning](https://semver.org/) but we'll bump
+the major version number for major features.
+Development happens on the `dev` branch.
+Changes there have passed the CI and a code review but we make no guarantees
+about API or feature stability in this branch.
 
 Literature
 ----------
@@ -81,7 +80,6 @@ Just an incomplete link collection for now:
 - Junior Thesis [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.34461.svg)](http://dx.doi.org/10.5281/zenodo.34461) by
   Carlchristian Eckert (2014)
 
-
 License
 -------
 

diff --git a/Usage.md b/Usage.md
@@ -13,21 +13,23 @@ There is one header file that will include *all* necessary files:
 Step 2a: choose policies
 -----------------------
 
-Each instance of a policy based allocator is composed through 5 **policies**. Each policy is expressed as a **policy class**.
+Each instance of a policy based allocator is composed through 5 **policies**.
+Each policy is expressed as a **policy class**.
 
 Currently, there are the following policy classes available:
 
 |Policy                 | Policy Classes (implementations) | description |
 |-------                |----------------------------------| ----------- |
-|**CreationPolicy**     | Scatter`<conf1,conf2>`         | A scattered allocation to tradeoff fragmentation for allocation time, as proposed in [ScatterAlloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). `conf1` configures the heap layout, `conf2` determines the hashing parameters|
-|                       | OldMalloc                        | device-side malloc/new and free/delete syscalls as implemented on NVidia CUDA graphics cards with compute capability sm_20 and higher |
-|**DistributionPolicy** | XMallocSIMD`<conf>`             | SIMD optimization for warp-wide allocation on NVIDIA CUDA accelerators, as proposed by [XMalloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=5577907). `conf` is used to determine the pagesize. If used in combination with *Scatter*, the pagesizes must match |
+|**CreationPolicy**     | Scatter`<conf1,conf2>`           | A scattered allocation to tradeoff fragmentation for allocation time, as proposed in [ScatterAlloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). `conf1` configures the heap layout, `conf2` determines the hashing parameters|
+|                       | FlatterScatter`<conf1,conf2>`    | Another scattered allocation algorithm similar in spirit to `Scatter` but with a flatter hierarchy and stronger concurrency invariants. `conf1` and `conf2` act as before. |
+|                       | OldMalloc                        | Device-side malloc/new and free/delete syscalls as implemented on the given device. |
+|**DistributionPolicy** | XMallocSIMD`<conf>`              | SIMD optimization for warp-wide allocation on NVIDIA CUDA accelerators, as proposed by [XMalloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=5577907). `conf` is used to determine the pagesize. If used in combination with *Scatter*, the pagesizes must match |
 |                       | Noop                             | no workload distribution at all |
 |**OOMPolicy**          | ReturnNull                       | pointers will be *nullptr*, if the request could not be fulfilled |
 |                       | ~~BadAllocException~~            | will throw a `std::bad_alloc` exception. The accelerator has to support exceptions |
-|**ReservePoolPolicy**  | SimpleCudaMalloc                 | allocate a fixed heap with `CudaMalloc` |
+|**ReservePoolPolicy**  | AlpakaBuf                        | Allocate a fixed-size buffer in an `alpaka`-provided container. |
 |                       | CudaSetLimits                    | call to `CudaSetLimits` to increase the available Heap (e.g. when using *OldMalloc*) |
-|**AlignmentPolicy**    | Shrink`<conf>`                  | shrinks the pool so that the starting pointer is well aligned, applies padding to requested memory chunks. `conf` is used to determine the alignment|
+|**AlignmentPolicy**    | Shrink`<conf>`                   | shrinks the pool so that the starting pointer is well aligned, applies padding to requested memory chunks. `conf` is used to determine the alignment|
 |                       | Noop                             | no alignment at all |
 
 The user has to choose one of each policy that will form a useful allocator
@@ -51,6 +53,7 @@ struct ShrinkConfig : mallocMC::AlignmentPolicies::Shrink<>::Properties {
 
 Step 2c: combine policies
 -------------------------
+
 After configuring the chosen policies, they can be used as template
 parameters to create the desired allocator type:
 
@@ -86,7 +89,6 @@ Notice, how the policy classes `Scatter` and `XMallocSIMD` are instantiated with
 template arguments to use the default configuration. `Shrink` however uses the
 configuration struct defined above.
 
-
 Step 3: instantiate allocator
 -----------------------------
 
@@ -100,8 +102,14 @@ The allocator object offers the following methods
 
 | Name | description |
 |---------------------- |-------------------------|
+| getAllocatorHandle()   | Acquire a handle from the allocator that can be used in kernels to allocate memory on device. |
 | getAvailableSlots(size_t)   | Determines number of allocatable slots of a certain size. This only works, if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits<ScatterAllocator>::providesAvailableSlots`) |
 
+One should note that on a running system with multiple threads manipulating
+memory the information provided by `getAvailableSlots` is stale the moment it's
+acquired and so relying on this information to be accurate is not recommended.
+It is supposed to be used in initialisation/finalisation phases without dynamic
+memory allocations or in tests.
 
 Step 4: use dynamic memory allocation in a kernel
 -------------------------------------------------
@@ -114,9 +122,11 @@ The handle offers the following methods:
 |---------------------- |-------------------------|
 | malloc(size_t) | Allocates memory on the accelerator  |
 | free(size_t)     | Frees memory on the accelerator    |
-| getAvailableSlots()   | Determines number of allocatable slots of a certain size. This only works, if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits<ScatterAllocator>::providesAvailableSlots`) |
+| getAvailableSlots()   | Determines number of allocatable slots of a certain size. This only works, if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits<ScatterAllocator>::providesAvailableSlots`).|
 
+The remarks on `getAvailableSlots` above apply here as well.
 A simplistic example would look like this:
+
 ```c++
 #include <mallocMC/mallocMC.hpp>