Skip to content

Commit

Permalink
Merge pull request #271 from chillenzer/topic-flatterScatter
Browse files Browse the repository at this point in the history
FlatterScatter
  • Loading branch information
psychocoderHPC authored Nov 26, 2024
2 parents 76c4f46 + 7846a36 commit 127b29f
Show file tree
Hide file tree
Showing 40 changed files with 5,658 additions and 19,571 deletions.
39 changes: 33 additions & 6 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
name: pre-commit
on:
pull_request:
push:
branches: [main, test-me-*]
name: Continuous Integration
on: [push, pull_request]
jobs:
main:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
Expand All @@ -14,3 +11,33 @@ jobs:
- uses: pre-commit/[email protected]
- uses: pre-commit-ci/[email protected]
if: always()
cpu-tests:
# This action only runs on various CPU backends.
# As such, this is not a fully-fledged production-like test.
# Hopefully, it will still save us from a few stupid mistakes.
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- run: sudo apt update && sudo apt install libboost-all-dev
- run: mkdir build_dir
- working-directory: build_dir
run: |
git submodule init && git submodule update
- working-directory: build_dir
run: |
cmake .. \
-DCMAKE_CXX_FLAGS="-std=c++20 -g" \
-Dalpaka_CXX_STANDARD=20 \
-DmallocMC_CATCH2_PROVIDER=intern \
-Dalpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE:BOOL=ON \
-Dalpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE:BOOL=ON \
-Dalpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE:BOOL=ON \
-Dalpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE:BOOL=ON
- working-directory: build_dir
run: make -j tests examples
- working-directory: build_dir
run: ./tests
- working-directory: build_dir
run: ./mallocMC_Example01
- working-directory: build_dir
run: ./mallocMC_Example03
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "thirdParty/catch2"]
path = thirdParty/catch2
url = https://github.com/catchorg/catch2
55 changes: 34 additions & 21 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,27 +13,21 @@ set(mallocMC_ALPAKA_PROVIDER "intern" CACHE STRING "Select which alpaka is used"
set_property(CACHE mallocMC_ALPAKA_PROVIDER PROPERTY STRINGS "intern;extern")
mark_as_advanced(mallocMC_ALPAKA_PROVIDER)
if(${mallocMC_ALPAKA_PROVIDER} STREQUAL "intern")
set(alpaka_BUILD_EXAMPLES OFF)
set(BUILD_TESTING OFF)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/alpaka ${CMAKE_BINARY_DIR}/alpaka)
set(alpaka_BUILD_EXAMPLES OFF)
set(BUILD_TESTING OFF)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/alpaka ${CMAKE_BINARY_DIR}/alpaka)
else()
find_package(alpaka HINTS $ENV{ALPAKA_ROOT})
find_package(alpaka HINTS $ENV{ALPAKA_ROOT})
endif()

if(NOT TARGET alpaka::alpaka)
message(FATAL "Required mallocMC dependency alpaka could not be found!")
message(FATAL "Required mallocMC dependency alpaka could not be found!")
endif()

# Catch2
set(mallocMC_CATCH2_PROVIDER "intern" CACHE STRING "Select which Catch2 is used")
set_property(CACHE mallocMC_CATCH2_PROVIDER PROPERTY STRINGS "intern;extern")
mark_as_advanced(mallocMC_CATCH2_PROVIDER)
if(${mallocMC_CATCH2_PROVIDER} STREQUAL "intern")
add_library(Catch2::Catch2 INTERFACE IMPORTED)
target_include_directories(Catch2::Catch2 INTERFACE ${CMAKE_CURRENT_LIST_DIR}/thirdParty/catch2/include)
else()
find_package(Catch2 CONFIG REQUIRED)
endif()

# for installation, just copy include folder to install folder
install(
Expand All @@ -44,11 +38,11 @@ install(
# warnings
add_library(warnings INTERFACE)
if(CMAKE_COMPILER_IS_GNUCXX)
target_compile_options(warnings INTERFACE -Wall -Wshadow -Wno-unknown-pragmas -Wextra -Wno-unused-parameter -Wno-unused-local-typedefs)
target_compile_options(warnings INTERFACE -Wall -Wshadow -Wno-unknown-pragmas -Wextra -Wno-unused-parameter -Wno-unused-local-typedefs)
elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
target_compile_options(warnings INTERFACE -Wall -Wshadow)
target_compile_options(warnings INTERFACE -Wall -Wshadow)
elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")
target_compile_options(warnings INTERFACE -Minform=inform)
target_compile_options(warnings INTERFACE -Minform=inform)
endif()

# Executables
Expand All @@ -64,13 +58,32 @@ alpaka_add_executable(mallocMC_Example03 EXCLUDE_FROM_ALL examples/mallocMC_exam
target_include_directories(mallocMC_Example03 PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
target_link_libraries(mallocMC_Example03 PUBLIC alpaka::alpaka warnings)

alpaka_add_executable(VerifyHeap EXCLUDE_FROM_ALL tests/verify_heap.cpp tests/verify_heap_config.hpp)
target_include_directories(VerifyHeap PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
target_link_libraries(VerifyHeap PUBLIC alpaka::alpaka warnings)
add_custom_target(examples DEPENDS mallocMC_Example01 mallocMC_Example03)

alpaka_add_executable(tests EXCLUDE_FROM_ALL tests/main.cpp tests/dimensions.cpp tests/policies.cpp)
target_include_directories(tests PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
target_link_libraries(tests PUBLIC alpaka::alpaka Catch2::Catch2 warnings)
if(${mallocMC_CATCH2_PROVIDER} STREQUAL "intern")
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/thirdParty/catch2 ${CMAKE_BINARY_DIR}/catch2)
include(Catch)
else()
# get Catch2 v3 and build it from source with the same C++ standard as the tests
Include(FetchContent)
FetchContent_Declare(Catch2 GIT_REPOSITORY https://github.com/catchorg/Catch2.git GIT_TAG v3.7.1)
FetchContent_MakeAvailable(Catch2)
target_compile_features(Catch2 PUBLIC cxx_std_20)
include(Catch)

# hide Catch2 cmake variables by default in cmake gui
get_cmake_property(variables VARIABLES)
foreach (var ${variables})
if (var MATCHES "^CATCH_")
mark_as_advanced(${var})
endif()
endforeach()
endif()

add_custom_target(examples DEPENDS mallocMC_Example01 mallocMC_Example03 VerifyHeap)
file(GLOB_RECURSE testSources "${CMAKE_CURRENT_SOURCE_DIR}/tests/*/*.cpp")
alpaka_add_executable(tests EXCLUDE_FROM_ALL ${testSources})
catch_discover_tests(tests)
source_group(TREE "${CMAKE_CURRENT_LIST_DIR}/tests" FILES ${testSources})
target_compile_features(tests PRIVATE cxx_std_20)
target_include_directories(tests PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
target_link_libraries(tests PRIVATE alpaka::alpaka Catch2::Catch2WithMain)
63 changes: 33 additions & 30 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -1,37 +1,40 @@
mallocMC: Memory Allocation for Many Core Architectures
/*
mallocMC: Memory Allocation for Many Core Architectures

based on the work of ScatterAlloc:
Massively Parallel Dynamic Memory Allocation for the GPU
based on the work of ScatterAlloc:
Massively Parallel Dynamic Memory Allocation for the GPU

http://www.icg.tugraz.at/project/mvp
https://www.hzdr.de/crp
http://www.icg.tugraz.at/project/mvp
https://www.hzdr.de/crp

Copyright (C) 2012 Institute for Computer Graphics and Vision,
Graz University of Technology
Copyright (C) 2014-2015 Institute of Radiation Physics,
Helmholtz-Zentrum Dresden - Rossendorf
Copyright (C) 2012 Institute for Computer Graphics and Vision,
Graz University of Technology
Copyright (C) 2014-2024 Institute of Radiation Physics,
Helmholtz-Zentrum Dresden - Rossendorf

Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at
Bernhard Kainz - kainz ( at ) icg.tugraz.at
Michael Kenzel - kenzel ( at ) icg.tugraz.at
Rene Widera - r.widera ( at ) hzdr.de
Axel Huebl - a.huebl ( at ) hzdr.de
Carlchristian Eckert - c.eckert ( at ) hzdr.de
Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at
Bernhard Kainz - kainz ( at ) icg.tugraz.at
Michael Kenzel - kenzel ( at ) icg.tugraz.at
Rene Widera - r.widera ( at ) hzdr.de
Axel Huebl - a.huebl ( at ) hzdr.de
Carlchristian Eckert - c.eckert ( at ) hzdr.de
Julian Lenz - j.lenz ( at ) hzdr.de

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
44 changes: 21 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,41 @@ mallocMC: *Memory Allocator for Many Core Architectures*

This project provides a framework for **fast memory managers** on **many core
accelerators**. It is based on [alpaka](https://github.com/alpaka-group/alpaka)
to run on many different accelerators and implements the *ScatterAlloc* algorithm.

to run on many different accelerators and comes with multiple allocation
algorithms out-of-the-box. Custom ones can be added easily due to the
policy-based design.

Usage
-------

Follow the step-by-step instructions in [Usage.md](Usage.md) to replace your
`new`/`malloc` calls with a *blazingly fast* mallocMC heap! :rocket:


Install
-------

mallocMC is header-only, but requires a few other C++ libraries to be
available. Our installation notes can be found in [INSTALL.md](INSTALL.md).


Contributing
------------

Rules for contributions are found in [CONTRIBUTING.md](CONTRIBUTING.md).
Rules for contributions are found in [CONTRIBUTING.md](./CONTRIBUTING.md).

On the ScatterAlloc Algorithm
On the Algorithms
-----------------------------

This library implements the *ScatterAlloc* algorithm, originally
This library was originally inspired by the *ScatterAlloc* algorithm,
[forked](https://en.wikipedia.org/wiki/Fork_%28software_development%29)
from the **ScatterAlloc** project, developed by the
[Managed Volume Processing](http://www.icg.tugraz.at/project/mvp)
group at [Institute for Computer Graphics and Vision](http://www.icg.tugraz.at),
TU Graz (kudos!).
TU Graz (kudos!). The currently shipped algorithms are using similar ideas but
differ from the original one significantly.

From the original project page (which, to the best of our knowledge, no longer
exists):

From http://www.icg.tugraz.at/project/mvp/downloads :
```quote
ScatterAlloc is a dynamic memory allocator for the GPU. It is
designed concerning the requirements of massively parallel
Expand All @@ -51,21 +53,18 @@ execution time is almost independent of the thread count.
ScatterAlloc is open source and easy to use in your CUDA projects.
```

Original Homepage: http://www.icg.tugraz.at/project/mvp

Our Homepage: https://www.hzdr.de/crp


Branches
--------

| *branch* | *state* | *description* |
| ----------- | ------- | ----------------------- |
| **master** | [![Build Status Master](https://travis-ci.org/alpaka-group/mallocMC.png?branch=master)](https://travis-ci.org/alpaka-group/mallocMC "master") | our latest stable release |
| **dev** | [![Build Status Development](https://travis-ci.org/alpaka-group/mallocMC.png?branch=dev)](https://travis-ci.org/alpaka-group/mallocMC "dev") | our development branch - start and merge new branches here |
| **tugraz** | n/a | *ScatterAlloc* "upstream" branch: not backwards compatible mirror for algorithmic changes |
Our Homepage: <https://www.hzdr.de/crp>

Versions and Releases
---------------------

Official releases can be found in the
[Github releases](https://github.com/alpaka-group/mallocMC/releases).
We try to stick to [semantic versioning](https://semver.org/) but we'll bump
the major version number for major features.
Development happens on the `dev` branch.
Changes there have passed the CI and a code review but we make no guarantees
about API or feature stability in this branch.

Literature
----------
Expand All @@ -81,7 +80,6 @@ Just an incomplete link collection for now:
- Junior Thesis [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.34461.svg)](http://dx.doi.org/10.5281/zenodo.34461) by
Carlchristian Eckert (2014)


License
-------

Expand Down
26 changes: 18 additions & 8 deletions Usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,23 @@ There is one header file that will include *all* necessary files:
Step 2a: choose policies
-----------------------

Each instance of a policy based allocator is composed through 5 **policies**. Each policy is expressed as a **policy class**.
Each instance of a policy-based allocator is composed of 5 **policies**.
Each policy is expressed as a **policy class**.

Currently, there are the following policy classes available:

|Policy | Policy Classes (implementations) | description |
|------- |----------------------------------| ----------- |
|**CreationPolicy** | Scatter`<conf1,conf2>` | A scattered allocation to tradeoff fragmentation for allocation time, as proposed in [ScatterAlloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). `conf1` configures the heap layout, `conf2` determines the hashing parameters|
| | OldMalloc | device-side malloc/new and free/delete syscalls as implemented on NVidia CUDA graphics cards with compute capability sm_20 and higher |
|**DistributionPolicy** | XMallocSIMD`<conf>` | SIMD optimization for warp-wide allocation on NVIDIA CUDA accelerators, as proposed by [XMalloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=5577907). `conf` is used to determine the pagesize. If used in combination with *Scatter*, the pagesizes must match |
|**CreationPolicy** | Scatter`<conf1,conf2>` | A scattered allocation to tradeoff fragmentation for allocation time, as proposed in [ScatterAlloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). `conf1` configures the heap layout, `conf2` determines the hashing parameters|
| | FlatterScatter`<conf1,conf2>` | Another scattered allocation algorithm similar in spirit to `Scatter` but with a flatter hierarchy and stronger concurrency invariants. `conf1` and `conf2` act as before.
| | OldMalloc | Device-side malloc/new and free/delete syscalls as implemented on the given device.
|**DistributionPolicy** | XMallocSIMD`<conf>` | SIMD optimization for warp-wide allocation on NVIDIA CUDA accelerators, as proposed by [XMalloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=5577907). `conf` is used to determine the pagesize. If used in combination with *Scatter*, the pagesizes must match |
| | Noop | no workload distribution at all |
|**OOMPolicy** | ReturnNull | pointers will be *nullptr*, if the request could not be fulfilled |
| | ~~BadAllocException~~ | will throw a `std::bad_alloc` exception. The accelerator has to support exceptions |
|**ReservePoolPolicy** | SimpleCudaMalloc | allocate a fixed heap with `CudaMalloc` |
|**ReservePoolPolicy** | AlpakaBuf | Allocate a fixed-size buffer in an `alpaka`-provided container. |
| | CudaSetLimits | call to `CudaSetLimits` to increase the available Heap (e.g. when using *OldMalloc*) |
|**AlignmentPolicy** | Shrink`<conf>` | shrinks the pool so that the starting pointer is well aligned, applies padding to requested memory chunks. `conf` is used to determine the alignment|
|**AlignmentPolicy** | Shrink`<conf>` | shrinks the pool so that the starting pointer is well aligned, applies padding to requested memory chunks. `conf` is used to determine the alignment|
| | Noop | no alignment at all |

The user has to choose one of each policy that will form a useful allocator
Expand All @@ -51,6 +53,7 @@ struct ShrinkConfig : mallocMC::AlignmentPolicies::Shrink<>::Properties {
Step 2c: combine policies
-------------------------
After configuring the chosen policies, they can be used as template
parameters to create the desired allocator type:
Expand Down Expand Up @@ -86,7 +89,6 @@ Notice, how the policy classes `Scatter` and `XMallocSIMD` are instantiated with
template arguments to use the default configuration. `Shrink` however uses the
configuration struct defined above.


Step 3: instantiate allocator
-----------------------------

Expand All @@ -100,8 +102,14 @@ The allocator object offers the following methods
| Name | description |
|---------------------- |-------------------------|
| getAllocatorHandle() | Acquire a handle from the allocator that can be used in kernels to allocate memory on device.
| getAvailableSlots(size_t) | Determines number of allocatable slots of a certain size. This only works, if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits<ScatterAllocator>::providesAvailableSlots`) |
One should note that on a running system with multiple threads manipulating
memory the information provided by `getAvailableSlots` is stale the moment it's
acquired and so relying on this information to be accurate is not recommended.
It is supposed to be used in initialisation/finalisation phases without dynamic
memory allocations or in tests.
Step 4: use dynamic memory allocation in a kernel
-------------------------------------------------
Expand All @@ -114,9 +122,11 @@ The handle offers the following methods:
|---------------------- |-------------------------|
| malloc(size_t) | Allocates memory on the accelerator |
| free(size_t) | Frees memory on the accelerator |
| getAvailableSlots() | Determines number of allocatable slots of a certain size. This only works, if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits<ScatterAllocator>::providesAvailableSlots`) |
| getAvailableSlots() | Determines number of allocatable slots of a certain size. This only works, if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits<ScatterAllocator>::providesAvailableSlots`).|
The comments on `getAvailableSlots` from above hold all the same.
A simplistic example would look like this:
```c++
#include <mallocMC/mallocMC.hpp>
Expand Down
Loading

0 comments on commit 127b29f

Please sign in to comment.