From b48b7504318c27aea234b6e824668d0c31c8916f Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Fri, 8 Nov 2024 11:22:23 +0100 Subject: [PATCH 01/16] Update allocator.hpp --- src/include/mallocMC/allocator.hpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/include/mallocMC/allocator.hpp b/src/include/mallocMC/allocator.hpp index 9fe1f41b..7205f1d4 100644 --- a/src/include/mallocMC/allocator.hpp +++ b/src/include/mallocMC/allocator.hpp @@ -32,7 +32,6 @@ #include "mallocMC_allocator_handle.hpp" #include "mallocMC_constraints.hpp" #include "mallocMC_traits.hpp" -#include "mallocMC_utils.hpp" #include @@ -135,13 +134,7 @@ namespace mallocMC * @param size number of bytes */ template - ALPAKA_FN_HOST void - /* `volatile size_t size` is required to break clang optimizations which - * results into runtime errors. Observed in PIConGPU if size is known at - * compile time. The volatile workaround has no negative effects on the - * register usage in CUDA. - */ - alloc(AlpakaDevice& dev, AlpakaQueue& queue, size_t volatile size) + ALPAKA_FN_HOST void alloc(AlpakaDevice& dev, AlpakaQueue& queue, size_t size) { void* pool = reservePolicy.setMemPool(dev, size); std::tie(pool, size) = AlignmentPolicy::alignPool(pool, size); From 4058862dcc9ac77241fa1c4537b7baaaeef90581 Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Fri, 8 Nov 2024 11:26:43 +0100 Subject: [PATCH 02/16] Update device_allocator.hpp --- src/include/mallocMC/allocator.hpp | 2 +- src/include/mallocMC/device_allocator.hpp | 36 +++++++++++++++-------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/include/mallocMC/allocator.hpp b/src/include/mallocMC/allocator.hpp index 7205f1d4..74e0f5c6 100644 --- a/src/include/mallocMC/allocator.hpp +++ b/src/include/mallocMC/allocator.hpp @@ -112,11 +112,11 @@ namespace mallocMC using uint32 = std::uint32_t; public: - using CreationPolicy = T_CreationPolicy; using DistributionPolicy = T_DistributionPolicy; using OOMPolicy = T_OOMPolicy; using ReservePoolPolicy = T_ReservePoolPolicy; using AlignmentPolicy = T_AlignmentPolicy; + using CreationPolicy = T_CreationPolicy::template AlignmentAwarePolicy; using HeapInfoVector = std::vector; using DevAllocator = DeviceAllocator; using AllocatorHandle = AllocatorHandleImpl; diff --git a/src/include/mallocMC/device_allocator.hpp b/src/include/mallocMC/device_allocator.hpp index 52e4e736..f9822a3d 100644 --- a/src/include/mallocMC/device_allocator.hpp +++ b/src/include/mallocMC/device_allocator.hpp @@ -2,10 +2,11 @@ mallocMC: Memory Allocator for Many Core Architectures. https://www.hzdr.de/crp - Copyright 2014 - 2015 Institute of Radiation Physics, + Copyright 2014 - 2024 Institute of Radiation Physics, Helmholtz-Zentrum Dresden - Rossendorf Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de + Julian J. 
Lenz - j.lenz ( at ) hzdr.de Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,9 +29,7 @@ #pragma once -#include "mallocMC_constraints.hpp" #include "mallocMC_traits.hpp" -#include "mallocMC_utils.hpp" #include @@ -58,7 +57,7 @@ namespace mallocMC typename T_DistributionPolicy, typename T_OOMPolicy, typename T_AlignmentPolicy> - class DeviceAllocator : public T_CreationPolicy + class DeviceAllocator : public T_CreationPolicy::template AlignmentAwarePolicy { using uint32 = std::uint32_t; @@ -68,24 +67,31 @@ namespace mallocMC using OOMPolicy = T_OOMPolicy; using AlignmentPolicy = T_AlignmentPolicy; - void* pool; - template ALPAKA_FN_ACC auto malloc(AlpakaAcc const& acc, size_t bytes) -> void* { + if(bytes == 0U) + { + return nullptr; + } bytes = AlignmentPolicy::applyPadding(bytes); DistributionPolicy distributionPolicy(acc); - uint32 const req_size = distributionPolicy.collect(acc, bytes); - void* memBlock = CreationPolicy::template create(acc, req_size); + const uint32 req_size = distributionPolicy.collect(acc, bytes); + void* memBlock = CreationPolicy::template AlignmentAwarePolicy::create(acc, req_size); if(CreationPolicy::isOOM(memBlock, req_size)) + { memBlock = OOMPolicy::handleOOM(memBlock); + } return distributionPolicy.distribute(acc, memBlock); } template - ALPAKA_FN_ACC void free(AlpakaAcc const& acc, void* p) + ALPAKA_FN_ACC void free(const AlpakaAcc& acc, void* pointer) { - CreationPolicy::destroy(acc, p); + if(pointer != nullptr) + { + CreationPolicy::template AlignmentAwarePolicy::destroy(acc, pointer); + } } /** Provide the number of available free slots. @@ -101,9 +107,15 @@ namespace mallocMC { slotSize = AlignmentPolicy::applyPadding(slotSize); if constexpr(Traits::providesAvailableSlots) - return CreationPolicy::template getAvailableSlotsAccelerator(acc, slotSize); + { + return CreationPolicy::template AlignmentAwarePolicy::getAvailableSlotsAccelerator( + acc, + slotSize); + } else - return 0u; + { + return 0U; + } } }; From d752602f194b42e7bb1fd5655506f840fa7c7070 Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Fri, 8 Nov 2024 11:29:33 +0100 Subject: [PATCH 03/16] Update old creation policies --- .../mallocMC/creationPolicies/OldMalloc.hpp | 7 +- .../mallocMC/creationPolicies/Scatter.hpp | 162 ++++++++---------- 2 files changed, 77 insertions(+), 92 deletions(-) diff --git a/src/include/mallocMC/creationPolicies/OldMalloc.hpp b/src/include/mallocMC/creationPolicies/OldMalloc.hpp index c75534a1..22d643c9 100644 --- a/src/include/mallocMC/creationPolicies/OldMalloc.hpp +++ b/src/include/mallocMC/creationPolicies/OldMalloc.hpp @@ -49,10 +49,13 @@ namespace mallocMC using uint32 = std::uint32_t; public: + template + using AlignmentAwarePolicy = OldMalloc; + static constexpr auto providesAvailableSlots = false; - template - ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32 bytes) const -> void* + template + ALPAKA_FN_ACC auto create(const AlpakaAcc& acc, uint32 bytes) const -> void* { return ::malloc(static_cast(bytes)); } diff --git a/src/include/mallocMC/creationPolicies/Scatter.hpp b/src/include/mallocMC/creationPolicies/Scatter.hpp index 0de95765..84cbcd2d 100644 --- a/src/include/mallocMC/creationPolicies/Scatter.hpp +++ b/src/include/mallocMC/creationPolicies/Scatter.hpp @@ -34,11 +34,10 @@ #pragma once #include "../mallocMC_utils.hpp" -#include "Scatter.hpp" #include - -#include +#include +#include #include #include #include /* uint32_t */ @@ -47,6 
+46,7 @@ #include #include #include +#include namespace mallocMC { @@ -106,10 +106,16 @@ namespace mallocMC */ template< class T_Config = ScatterConf::DefaultScatterConfig, - class T_Hashing = ScatterConf::DefaultScatterHashingParams> - class Scatter + class T_Hashing = ScatterConf::DefaultScatterHashingParams, + class T_AlignmentPolicy = void> + class ScatterImpl { public: + // TODO(lenz): This is a bit of a round trip due to a change of interface. A larger refactoring should + // remove this again. + template + using AlignmentAwarePolicy = ScatterImpl; + using HeapProperties = T_Config; using HashingProperties = T_Hashing; @@ -294,7 +300,8 @@ namespace mallocMC * @param spots number of bits that can be used * @return next free spot in the bitfield */ - static ALPAKA_FN_ACC inline auto nextspot(uint32 bitfield, uint32 spot, uint32 spots) -> uint32 + static ALPAKA_FN_ACC inline auto nextspot(auto const& acc, uint32 bitfield, uint32 spot, uint32 spots) + -> uint32 { uint32 const low_part = (spot + 1) == sizeof(uint32) * CHAR_BIT ? 0u : (bitfield >> (spot + 1)); uint32 const high_part = (bitfield << (spots - (spot + 1))); @@ -302,7 +309,7 @@ namespace mallocMC // wrap around the bitfields from the current spot to the left bitfield = (high_part | low_part) & selection_mask; // compute the step from the current spot in the bitfield - uint32 const step = ffs(~bitfield); + const uint32 step = alpaka::ffs(acc, static_cast>(~bitfield)); // and return the new spot return (spot + step) % spots; } @@ -344,9 +351,9 @@ namespace mallocMC // note: popc(old) == spots should be sufficient, // but if someone corrupts the memory we end up in an // endless loop in here... - if(popc(old) >= spots) + if(alpaka::popcount(acc, old) >= static_cast(spots)) return -1; - spot = nextspot(old, spot, spots); + spot = nextspot(acc, old, spot, spots); } } @@ -376,10 +383,10 @@ namespace mallocMC if(fullsegments != 32) return alpaka::math::min( acc, - 31, + 31U, alpaka::math::max( acc, - 0, + 0U, (int) pagesize - (int) fullsegments * segmentsize - (int) sizeof(uint32)) / chunksize); else @@ -410,8 +417,8 @@ namespace mallocMC uint32 spot = randInit() % segments; uint32 const mask = _ptes[page].bitmask; if((mask & (1u << spot)) != 0) - spot = nextspot(mask, spot, segments); - uint32 const tries = segments - popc(mask); + spot = nextspot(acc, mask, spot, segments); + const uint32 tries = segments - alpaka::popcount(acc, mask); uint32* onpagemasks = onPageMasksPosition(page, segments); for(uint32 i = 0; i < tries; ++i) { @@ -419,7 +426,7 @@ namespace mallocMC if(hspot != -1) return _page[page].data + (32 * spot + hspot) * chunksize; alpaka::atomicOp(acc, (uint32*) &_ptes[page].bitmask, 1u << spot); - spot = nextspot(mask, spot, segments); + spot = nextspot(acc, mask, spot, segments); } return 0; } @@ -542,19 +549,19 @@ namespace mallocMC * @return pointer to a free chunk on a page, 0 if we were unable to * obtain a free chunk */ - template - ALPAKA_FN_ACC auto allocChunked(AlpakaAcc const& acc, uint32 bytes) -> void* + template + ALPAKA_FN_ACC auto allocChunked(const AlpakaAcc& acc, uint32 bytes) -> void* { // use the minimal allocation size to increase the hit rate for small allocations. 
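                // The lines below derive a per-warp starting page by hashing the (padded) allocation size
                // together with the SM id and the warp id, so that concurrent warps start their search on
                // different pages of this access block and contention on the page metadata is reduced.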
- uint32 const paddedMinChunkSize = AlignmentPolicy::applyPadding(minChunkSize); - uint32 const minAllocation = alpaka::math::max(acc, bytes, paddedMinChunkSize); - uint32 const numpages = _numpages; - uint32 const pagesperblock = numpages / _accessblocks; - uint32 const reloff = warpSize * minAllocation / pagesize; - uint32 const start_page_in_block = (minAllocation * hashingK + hashingDistMP * smid() - + (hashingDistWP + hashingDistWPRel * reloff) * warpid()) - % pagesperblock; - uint32 const maxchunksize = alpaka::math::min( + const uint32 paddedMinChunkSize = T_AlignmentPolicy::applyPadding(minChunkSize); + const uint32 minAllocation = alpaka::math::max(acc, bytes, paddedMinChunkSize); + const uint32 numpages = _numpages; + const uint32 pagesperblock = numpages / _accessblocks; + const uint32 reloff = warpSize * minAllocation / pagesize; + const uint32 start_page_in_block = (minAllocation * hashingK + hashingDistMP * smid(acc) + + (hashingDistWP + hashingDistWPRel * reloff) * warpid(acc)) + % pagesperblock; + const uint32 maxchunksize = alpaka::math::min( acc, +pagesize, /* this clumping means that allocations of paddedMinChunkSize could have a waste exceeding the @@ -687,7 +694,7 @@ namespace mallocMC /** Take care that the meta data changes where we did not use atomics are propagated to all * other threads. */ - threadfenceDevice(acc); + alpaka::mem_fence(acc, alpaka::memory_scope::Device{}); /* Remove chunk information. * It is important that this call happened after page init is called because scatter malloc * is updating the chunksize without notify the action by increasing the page count @@ -742,8 +749,9 @@ namespace mallocMC // mark it as free uint32 const nMasks = fullsegments + (additional_chunks > 0 ? 1 : 0); uint32* onpagemasks = onPageMasksPosition(page, nMasks); - uint32 old - = alpaka::atomicOp(acc, &onpagemasks[segment], ~(1u << withinsegment)); + /* currently unchecked: + * uint32 old = */ + alpaka::atomicOp(acc, &onpagemasks[segment], ~(1u << withinsegment)); // always do this, since it might fail due to a // race-condition with addChunkHierarchy @@ -766,9 +774,9 @@ namespace mallocMC { uint32 const region = page / regionsize; alpaka::atomicOp(acc, (uint32*) (_regions + region), 0u); - uint32 const pagesperblock = _numpages / _accessblocks; - uint32 const block = page / pagesperblock; - if(warpid() + laneid() == 0) + const uint32 pagesperblock = _numpages / _accessblocks; + const uint32 block = page / pagesperblock; + if(warpid(acc) + laneid() == 0) alpaka::atomicOp(acc, (uint32*) &_firstfreeblock, block); } } @@ -817,7 +825,7 @@ namespace mallocMC uint32 endpage, uint32 bytes) -> void* { - uint32 const pagestoalloc = divup(bytes, pagesize); + const uint32 pagestoalloc = ceilingDivision(bytes, pagesize); uint32 freecount = 0; bool left_free = false; for(uint32 search_page = startpage + 1; search_page > endpage;) @@ -893,11 +901,11 @@ namespace mallocMC // only one thread per warp can acquire the mutex void* res = 0; // based on the alpaka backend the lanemask type can be 64bit - auto const mask = activemask(); - uint32_t const num = popc(mask); + const auto mask = alpaka::warp::activemask(acc); + const uint32_t num = alpaka::popcount(acc, mask); // based on the alpaka backend the lanemask type can be 64bit - auto const lanemask = lanemask_lt(); - uint32_t const local_id = popc(lanemask & mask); + const auto lanemask = lanemask_lt(acc); + const uint32_t local_id = alpaka::popcount(acc, lanemask & mask); for(unsigned int active = 0; active < num; ++active) if(active == 
local_id) res = allocPageBasedSingle(acc, bytes); @@ -913,11 +921,11 @@ namespace mallocMC template ALPAKA_FN_ACC void deallocPageBased(AlpakaAcc const& acc, void* mem, uint32 page, uint32 bytes) { - uint32 const pages = divup(bytes, pagesize); + const uint32 pages = ceilingDivision(bytes, pagesize); for(uint32 p = page; p < page + pages; ++p) _page[p].init(); - threadfenceDevice(acc); + alpaka::mem_fence(acc, alpaka::memory_scope::Device{}); for(uint32 p = page; p < page + pages; ++p) alpaka::atomicOp(acc, (uint32*) &_ptes[p].chunksize, bytes, 0u); @@ -931,8 +939,8 @@ namespace mallocMC * @param bytes number of bytes to allocate * @return pointer to the allocated memory */ - template - ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32 bytes) -> void* + template + ALPAKA_FN_ACC auto create(const AlpakaAcc& acc, uint32 bytes) -> void* { if(bytes == 0) return 0; @@ -944,7 +952,7 @@ namespace mallocMC */ if(bytes <= pagesize) // chunck based - return allocChunked(acc, bytes); + return allocChunked(acc, bytes); else // allocate a range of pages return allocPageBased(acc, bytes); @@ -1008,23 +1016,6 @@ namespace mallocMC uint32 numpages = numregions * regionsize; // pointer is copied (copy is called page) Page* page = (Page*) memory; - // sec check for alignment - // copy is checked - // PointerEquivalent alignmentstatus = ((PointerEquivalent)page) - // & (16 -1); if(alignmentstatus != 0) - //{ - // if(linid == 0){ - // printf("c Before:\n"); - // printf("c dataAlignment: %d\n",16); - // printf("c Alignmentstatus: %d\n",alignmentstatus); - // printf("c size_t memsize %llu byte\n", memsize); - // printf("c void *memory %p\n", page); - // } - // //copy is adjusted, potentially pointer to higher address - // now. page =(Page*)(((PointerEquivalent)page) + 16 - - // alignmentstatus); if(linid == 0) printf("c Heap Warning: - // memory to use not 16 byte aligned...\n"); - //} // We have to calculate these values here, before using them for other things. // First calculate how many blocks of the given size fit our memory pages in principle. 
@@ -1056,16 +1047,6 @@ namespace mallocMC ptes = (PTE*) (page + numpages); regions = (uint32*) (ptes + numpages); - // if(linid == 0) printf("Heap info: wasting %d - // bytes\n",(((POINTEREQUIVALENT)memory) + memsize) - - // (POINTEREQUIVALENT)(regions + numregions)); - - // if(linid == 0 && alignmentstatus != 0){ - // printf("c Was shrinked automatically to:\n"); - // printf("c size_t memsize %llu byte\n", memsize); - // printf("c void *memory %p\n", page); - //} - for(uint32 i = linid; i < numpages; i += totalThreads) { ptes[i].init(); @@ -1116,11 +1097,7 @@ namespace mallocMC AlpakaAcc const& m_acc, T_DeviceAllocator* m_heap, void* m_heapmem, - size_t m_memsize) - { - m_heap->pool = m_heapmem; - m_heap->initDeviceFunction(m_acc, m_heapmem, m_memsize); - }; + size_t m_memsize) { m_heap->initDeviceFunction(m_acc, m_heapmem, m_memsize); }; using Dim = typename alpaka::trait::DimType::type; using Idx = typename alpaka::trait::IdxType::type; using VecType = alpaka::Vec; @@ -1199,7 +1176,7 @@ namespace mallocMC * @param stride the stride should be equal to the number of * different gids (and therefore of value max(gid)-1) */ - template + template ALPAKA_FN_ACC auto getAvailaibleSlotsDeviceFunction( AlpakaAcc const& acc, size_t slotSize, @@ -1223,7 +1200,7 @@ namespace mallocMC chunksize = alpaka::math::max( acc, (uint32) slotSize, - AlignmentPolicy::applyPadding(minChunkSize)); // ensure minimum chunk size + T_AlignmentPolicy::applyPadding(minChunkSize)); // ensure minimum chunk size slotcount += countFreeChunksInPage( acc, currentpage, @@ -1240,7 +1217,7 @@ namespace mallocMC { // 1 slot needs multiple pages if(gid > 0) return 0; // do this serially - uint32 const pagestoalloc = divup((uint32) slotSize, pagesize); + const uint32 pagestoalloc = ceilingDivision((uint32) slotSize, pagesize); uint32 freecount = 0; for(uint32 currentpage = _numpages; currentpage > 0;) { // this already includes all superblocks @@ -1295,9 +1272,9 @@ namespace mallocMC { auto const gid = alpaka::getIdx(acc).sum(); - auto const nWorker = alpaka::getWorkDiv(acc).prod(); - unsigned const temp = heapPtr->template getAvailaibleSlotsDeviceFunction< - typename T_DeviceAllocator::AlignmentPolicy>(acc, numBytes, gid, nWorker); + const auto nWorker = alpaka::getWorkDiv(acc).prod(); + const unsigned temp + = heapPtr->template getAvailaibleSlotsDeviceFunction(acc, numBytes, gid, nWorker); if(temp) alpaka::atomicOp(acc, slots, temp); }; @@ -1354,21 +1331,22 @@ namespace mallocMC * * @param slotSize the size of allocatable elements to count */ - template - ALPAKA_FN_ACC auto getAvailableSlotsAccelerator(AlpakaAcc const& acc, size_t slotSize) -> unsigned + template + ALPAKA_FN_ACC auto getAvailableSlotsAccelerator(const AlpakaAcc& acc, size_t slotSize) -> unsigned { int const wId = warpid_withinblock(acc); // do not use warpid-function, since // this value is not guaranteed to // be stable across warp lifetime - uint32 const activeThreads = popc(activemask()); + const uint32 activeThreads = alpaka::popcount(acc, alpaka::warp::activemask(acc)); + constexpr auto warpsize = warpSize; auto& activePerWarp = alpaka::declareSharedVar< - std::uint32_t[maxThreadsPerBlock / warpSize], + std::uint32_t[maxThreadsPerBlock / warpsize], __COUNTER__>(acc); // maximum number of warps in a block auto& warpResults - = alpaka::declareSharedVar(acc); + = alpaka::declareSharedVar], __COUNTER__>(acc); warpResults[wId] = 0; activePerWarp[wId] = 0; @@ -1383,16 +1361,13 @@ namespace mallocMC // printf("Block %d, id %d: activeThreads=%d // 
linearId=%d\n",blockIdx.x,threadIdx.x,activeThreads,linearId); - unsigned const temp = this->template getAvailaibleSlotsDeviceFunction( - acc, - slotSize, - linearId, - activeThreads); + const unsigned temp + = this->template getAvailaibleSlotsDeviceFunction(acc, slotSize, linearId, activeThreads); if(temp) alpaka::atomicOp(acc, &warpResults[wId], temp); alpaka::syncBlockThreads(acc); - threadfenceBlock(acc); + alpaka::mem_fence(acc, alpaka::memory_scope::Block{}); return warpResults[wId]; } @@ -1416,5 +1391,12 @@ namespace mallocMC } }; + template + struct Scatter + { + template + using AlignmentAwarePolicy = ScatterImpl; + }; + } // namespace CreationPolicies } // namespace mallocMC From e8529c4575342daf729dd381b72915b1b43e2f7b Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Fri, 8 Nov 2024 11:30:37 +0100 Subject: [PATCH 04/16] Update XMallocSIMD --- .../mallocMC/distributionPolicies/XMallocSIMD.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp b/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp index eb4f3d59..c4875a10 100644 --- a/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp +++ b/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp @@ -37,7 +37,7 @@ #include "XMallocSIMD.hpp" #include - +#include #include #include #include @@ -125,7 +125,8 @@ namespace mallocMC // init with initial counter auto& warp_sizecounter - = alpaka::declareSharedVar(acc); + = alpaka::declareSharedVar()], __COUNTER__>( + acc); warp_sizecounter[warpid] = 16; // second half: make sure that all coalesced allocations can fit @@ -133,7 +134,7 @@ namespace mallocMC bool const coalescible = bytes > 0 && bytes < (pagesize / 32); #if(MALLOCMC_DEVICE_COMPILE) - threadcount = popc(ballot(coalescible)); + threadcount = alpaka::popcount(alpaka::warp::ballot(acc, coalescible)); #else threadcount = 1; // TODO #endif @@ -153,7 +154,8 @@ namespace mallocMC template ALPAKA_FN_ACC auto distribute(AlpakaAcc const& acc, void* allocatedMem) -> void* { - auto& warp_res = alpaka::declareSharedVar(acc); + auto& warp_res + = alpaka::declareSharedVar()], __COUNTER__>(acc); char* myalloc = (char*) allocatedMem; if(req_size && can_use_coalescing) From 2596432b50481645a37d050c7620bb3edf2d84d6 Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Fri, 8 Nov 2024 11:31:45 +0100 Subject: [PATCH 05/16] Update utils --- src/include/mallocMC/mallocMC_utils.hpp | 241 ++++++++---------------- 1 file changed, 81 insertions(+), 160 deletions(-) diff --git a/src/include/mallocMC/mallocMC_utils.hpp b/src/include/mallocMC/mallocMC_utils.hpp index c1c9b24f..2a2f7260 100644 --- a/src/include/mallocMC/mallocMC_utils.hpp +++ b/src/include/mallocMC/mallocMC_utils.hpp @@ -5,12 +5,13 @@ Copyright (C) 2012 Institute for Computer Graphics and Vision, Graz University of Technology - Copyright (C) 2014 Institute of Radiation Physics, + Copyright (C) 2014-2024 Institute of Radiation Physics, Helmholtz-Zentrum Dresden - Rossendorf Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at Michael Kenzel - kenzel ( at ) icg.tugraz.at Carlchristian Eckert - c.eckert ( at ) hzdr.de + Julian Lenz - j.lenz ( at ) hzdr.de Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -34,16 +35,14 @@ #pragma once #include +#include +#include #ifdef _MSC_VER # include #endif -#include #include -#include -#include -#include #include /* HIP-clang is doing something 
wrong and uses the host path of the code when __HIP_DEVICE_COMPILE__ @@ -56,39 +55,25 @@ namespace mallocMC { - template - class __PointerEquivalent - { - public: - using type = unsigned int; - }; - template<> - class __PointerEquivalent<8> - { - public: - using type = unsigned long long; - }; + template + constexpr uint32_t warpSize = 1U; -#if defined(__CUDA_ARCH__) - constexpr auto warpSize = 32; // TODO -#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP) -// defined: -// https://github.com/llvm/llvm-project/blob/62ec4ac90738a5f2d209ed28c822223e58aaaeb7/clang/lib/Basic/Targets/AMDGPU.cpp#L400 -// overview wave front size: -// https://github.com/llvm/llvm-project/blob/efc063b621ea0c4d1e452bcade62f7fc7e1cc937/clang/test/Driver/amdgpu-macros.cl#L70-L115 -// gfx10XX has 32 threads per wavefront else 64 +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + template + constexpr uint32_t warpSize> = 32U; +#endif + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED # if(HIP_VERSION_MAJOR >= 4) - constexpr auto warpSize = __AMDGCN_WAVEFRONT_SIZE; + template + constexpr uint32_t warpSize> = __AMDGCN_WAVEFRONT_SIZE; # else - constexpr auto warpSize = 64; + template + constexpr uint32_t warpSize> = 64; # endif -#else - constexpr auto warpSize = 1; #endif - using PointerEquivalent = mallocMC::__PointerEquivalent::type; - ALPAKA_FN_ACC inline auto laneid() { #if defined(__CUDA_ARCH__) @@ -98,7 +83,7 @@ namespace mallocMC #elif defined(__HIP_DEVICE_COMPILE__) && defined(__HIP__) return __lane_id(); #else - return 0u; + return 0U; #endif } @@ -109,82 +94,87 @@ namespace mallocMC * * @return current index of the warp */ - ALPAKA_FN_ACC inline auto warpid() + template + ALPAKA_FN_ACC inline auto warpid(TAcc const& /*acc*/) -> uint32_t { -#if defined(__CUDA_ARCH__) - std::uint32_t mywarpid; + return 0U; + } + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + template + // ALPAKA_FN_ACC resolves to `__host__ __device__` if we're not in CUDA_ONLY_MODE. But the assembly instruction is + // specific to the device and cannot be compiled on the host. 
So, we need an explicit `__device__` here.` + __device__ inline auto warpid(alpaka::AccGpuCudaRt const& /*acc*/) -> uint32_t + { + std::uint32_t mywarpid = 0; asm("mov.u32 %0, %%warpid;" : "=r"(mywarpid)); return mywarpid; -#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP) + } +#endif + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + template + ALPAKA_FN_ACC inline auto warpid(alpaka::AccGpuHipRt const& /*acc*/) -> uint32_t + { // get wave id // https://github.com/ROCm-Developer-Tools/HIP/blob/f72a669487dd352e45321c4b3038f8fe2365c236/include/hip/hcc_detail/device_functions.h#L974-L1024 return __builtin_amdgcn_s_getreg(GETREG_IMMED(3, 0, 4)); -#else - return 0u; + } #endif + + template + ALPAKA_FN_ACC inline auto smid(TAcc const& /*acc*/) -> uint32_t + { + return 0U; } - ALPAKA_FN_ACC inline auto smid() +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + template + ALPAKA_FN_ACC inline auto smid(alpaka::AccGpuCudaRt const& /*acc*/) -> uint32_t { -#if defined(__CUDA_ARCH__) - std::uint32_t mysmid; + std::uint32_t mysmid = 0; asm("mov.u32 %0, %%smid;" : "=r"(mysmid)); return mysmid; -#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP) - return __smid(); -#else - return 0u; + } #endif + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + template + ALPAKA_FN_ACC inline auto smid(alpaka::AccGpuHipRt const& /*acc*/) -> uint32_t + { + return __smid(); } +#endif - ALPAKA_FN_ACC inline auto lanemask_lt() + template + ALPAKA_FN_ACC inline auto lanemask_lt(TAcc const& /*acc*/) + { + return 0U; + } +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + template + ALPAKA_FN_ACC inline auto lanemask_lt(alpaka::AccGpuCudaRt const& /*acc*/) { -#if defined(__CUDA_ARCH__) std::uint32_t lanemask; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask)); return lanemask; -#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP) - return __lanemask_lt(); -#else - return 0u; -#endif } - - ALPAKA_FN_ACC inline auto ballot(int pred) - { -#if defined(__CUDA_ARCH__) - return __ballot_sync(__activemask(), pred); -#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP) - // return value is 64bit for HIP-clang - return __ballot(pred); -#else - return 1u; #endif - } - ALPAKA_FN_ACC inline auto activemask() +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + template + ALPAKA_FN_ACC inline auto lanemask_lt(alpaka::AccGpuHipRt const& /*acc*/) { -#if defined(__CUDA_ARCH__) - return __activemask(); -#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP) - // return value is 64bit for HIP-clang - return ballot(1); -#else - return 1u; -#endif + return __lanemask_lt(); } +#endif - template - ALPAKA_FN_HOST_ACC inline auto divup(T a, T b) -> T - { - return (a + b - 1) / b; - } /** the maximal number threads per block, valid for sm_2.X - sm_7.5 * * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities */ - constexpr uint32_t maxThreadsPerBlock = 1024; + constexpr uint32_t maxThreadsPerBlock = 1024U; /** warp id within a cuda block * @@ -199,96 +189,27 @@ namespace mallocMC auto const localId = alpaka::mapIdx<1>( alpaka::getIdx(acc), alpaka::getWorkDiv(acc))[0]; - return localId / warpSize; + return localId / warpSize; } - template - ALPAKA_FN_ACC inline auto ffs(T mask) -> std::uint32_t + template && std::is_integral_v>> + ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto ceilingDivision(T const numerator, U const denominator) -> T { -#if defined(__CUDA_ARCH__) - return ::__ffs(mask); -#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP) - // return value is 64bit for HIP-clang - return ::__ffsll(static_cast(mask)); -#else - if(mask == 0) - return 0; - auto i = 1u; - while((mask & 1) == 0) - { - mask >>= 
1; - i++; - } - return i; -#endif - } - - template - ALPAKA_FN_ACC inline auto popc(T mask) -> std::uint32_t - { -#if defined(__CUDA_ARCH__) - return ::__popc(mask); -#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP) - // return value is 64bit for HIP-clang - return ::__popcll(static_cast(mask)); -#else - // cf. - // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan - std::uint32_t count = 0; - while(mask) - { - count++; - mask &= mask - 1; - } - return count; -#endif + return (numerator + (denominator - 1)) / denominator; } - // Threadfence implementations will maybe moved later into alpaka - template - struct ThreadFence - { - // CPU only implementation - static void device() - { - std::atomic_thread_fence(std::memory_order_seq_cst); - } - - static void block() - { - std::atomic_thread_fence(std::memory_order_seq_cst); - } - }; - - template - struct ThreadFence, void> - { - static ALPAKA_FN_ACC void device() - { -#if MALLOCMC_DEVICE_COMPILE - __threadfence(); -#endif - } - - static ALPAKA_FN_ACC void block() - { -#if MALLOCMC_DEVICE_COMPILE - __threadfence_block(); -#endif - } - }; - - ALPAKA_NO_HOST_ACC_WARNING - template - ALPAKA_FN_ACC void threadfenceDevice(T_Acc const& acc) + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto indexOf( + void const* const pointer, + void const* const start, + T_size const stepSize) -> std::make_signed_t { - ThreadFence::device(); + return std::distance(reinterpret_cast(start), reinterpret_cast(pointer)) / stepSize; } - ALPAKA_NO_HOST_ACC_WARNING - template - ALPAKA_FN_ACC void threadfenceBlock(T_Acc const& acc) + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto atomicLoad(TAcc const& acc, T& target) { - ThreadFence::block(); + return alpaka::atomicCas(acc, &target, static_cast(0U), static_cast(0U)); } } // namespace mallocMC From 0bdd7b5117fafe31ca27f96cccd16e51d0e96287 Mon Sep 17 00:00:00 2001 From: Third Party Date: Fri, 8 Nov 2024 12:20:33 +0100 Subject: [PATCH 06/16] Run clang-format --- .../mallocMC/creationPolicies/OldMalloc.hpp | 2 +- .../mallocMC/creationPolicies/Scatter.hpp | 53 ++++++++++--------- src/include/mallocMC/device_allocator.hpp | 4 +- .../distributionPolicies/XMallocSIMD.hpp | 1 + src/include/mallocMC/mallocMC_utils.hpp | 3 +- 5 files changed, 33 insertions(+), 30 deletions(-) diff --git a/src/include/mallocMC/creationPolicies/OldMalloc.hpp b/src/include/mallocMC/creationPolicies/OldMalloc.hpp index 22d643c9..13ee173d 100644 --- a/src/include/mallocMC/creationPolicies/OldMalloc.hpp +++ b/src/include/mallocMC/creationPolicies/OldMalloc.hpp @@ -55,7 +55,7 @@ namespace mallocMC static constexpr auto providesAvailableSlots = false; template - ALPAKA_FN_ACC auto create(const AlpakaAcc& acc, uint32 bytes) const -> void* + ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32 bytes) const -> void* { return ::malloc(static_cast(bytes)); } diff --git a/src/include/mallocMC/creationPolicies/Scatter.hpp b/src/include/mallocMC/creationPolicies/Scatter.hpp index 84cbcd2d..38117491 100644 --- a/src/include/mallocMC/creationPolicies/Scatter.hpp +++ b/src/include/mallocMC/creationPolicies/Scatter.hpp @@ -38,6 +38,7 @@ #include #include #include + #include #include #include /* uint32_t */ @@ -309,7 +310,7 @@ namespace mallocMC // wrap around the bitfields from the current spot to the left bitfield = (high_part | low_part) & selection_mask; // compute the step from the current spot in the bitfield - const uint32 step = alpaka::ffs(acc, static_cast>(~bitfield)); + uint32 const step = alpaka::ffs(acc, static_cast>(~bitfield)); // 
and return the new spot return (spot + step) % spots; } @@ -418,7 +419,7 @@ namespace mallocMC uint32 const mask = _ptes[page].bitmask; if((mask & (1u << spot)) != 0) spot = nextspot(acc, mask, spot, segments); - const uint32 tries = segments - alpaka::popcount(acc, mask); + uint32 const tries = segments - alpaka::popcount(acc, mask); uint32* onpagemasks = onPageMasksPosition(page, segments); for(uint32 i = 0; i < tries; ++i) { @@ -550,18 +551,18 @@ namespace mallocMC * obtain a free chunk */ template - ALPAKA_FN_ACC auto allocChunked(const AlpakaAcc& acc, uint32 bytes) -> void* + ALPAKA_FN_ACC auto allocChunked(AlpakaAcc const& acc, uint32 bytes) -> void* { // use the minimal allocation size to increase the hit rate for small allocations. - const uint32 paddedMinChunkSize = T_AlignmentPolicy::applyPadding(minChunkSize); - const uint32 minAllocation = alpaka::math::max(acc, bytes, paddedMinChunkSize); - const uint32 numpages = _numpages; - const uint32 pagesperblock = numpages / _accessblocks; - const uint32 reloff = warpSize * minAllocation / pagesize; - const uint32 start_page_in_block = (minAllocation * hashingK + hashingDistMP * smid(acc) + uint32 const paddedMinChunkSize = T_AlignmentPolicy::applyPadding(minChunkSize); + uint32 const minAllocation = alpaka::math::max(acc, bytes, paddedMinChunkSize); + uint32 const numpages = _numpages; + uint32 const pagesperblock = numpages / _accessblocks; + uint32 const reloff = warpSize * minAllocation / pagesize; + uint32 const start_page_in_block = (minAllocation * hashingK + hashingDistMP * smid(acc) + (hashingDistWP + hashingDistWPRel * reloff) * warpid(acc)) - % pagesperblock; - const uint32 maxchunksize = alpaka::math::min( + % pagesperblock; + uint32 const maxchunksize = alpaka::math::min( acc, +pagesize, /* this clumping means that allocations of paddedMinChunkSize could have a waste exceeding the @@ -774,8 +775,8 @@ namespace mallocMC { uint32 const region = page / regionsize; alpaka::atomicOp(acc, (uint32*) (_regions + region), 0u); - const uint32 pagesperblock = _numpages / _accessblocks; - const uint32 block = page / pagesperblock; + uint32 const pagesperblock = _numpages / _accessblocks; + uint32 const block = page / pagesperblock; if(warpid(acc) + laneid() == 0) alpaka::atomicOp(acc, (uint32*) &_firstfreeblock, block); } @@ -825,7 +826,7 @@ namespace mallocMC uint32 endpage, uint32 bytes) -> void* { - const uint32 pagestoalloc = ceilingDivision(bytes, pagesize); + uint32 const pagestoalloc = ceilingDivision(bytes, pagesize); uint32 freecount = 0; bool left_free = false; for(uint32 search_page = startpage + 1; search_page > endpage;) @@ -901,11 +902,11 @@ namespace mallocMC // only one thread per warp can acquire the mutex void* res = 0; // based on the alpaka backend the lanemask type can be 64bit - const auto mask = alpaka::warp::activemask(acc); - const uint32_t num = alpaka::popcount(acc, mask); + auto const mask = alpaka::warp::activemask(acc); + uint32_t const num = alpaka::popcount(acc, mask); // based on the alpaka backend the lanemask type can be 64bit - const auto lanemask = lanemask_lt(acc); - const uint32_t local_id = alpaka::popcount(acc, lanemask & mask); + auto const lanemask = lanemask_lt(acc); + uint32_t const local_id = alpaka::popcount(acc, lanemask & mask); for(unsigned int active = 0; active < num; ++active) if(active == local_id) res = allocPageBasedSingle(acc, bytes); @@ -921,7 +922,7 @@ namespace mallocMC template ALPAKA_FN_ACC void deallocPageBased(AlpakaAcc const& acc, void* mem, uint32 page, uint32 bytes) { 
- const uint32 pages = ceilingDivision(bytes, pagesize); + uint32 const pages = ceilingDivision(bytes, pagesize); for(uint32 p = page; p < page + pages; ++p) _page[p].init(); @@ -940,7 +941,7 @@ namespace mallocMC * @return pointer to the allocated memory */ template - ALPAKA_FN_ACC auto create(const AlpakaAcc& acc, uint32 bytes) -> void* + ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32 bytes) -> void* { if(bytes == 0) return 0; @@ -1217,7 +1218,7 @@ namespace mallocMC { // 1 slot needs multiple pages if(gid > 0) return 0; // do this serially - const uint32 pagestoalloc = ceilingDivision((uint32) slotSize, pagesize); + uint32 const pagestoalloc = ceilingDivision((uint32) slotSize, pagesize); uint32 freecount = 0; for(uint32 currentpage = _numpages; currentpage > 0;) { // this already includes all superblocks @@ -1272,8 +1273,8 @@ namespace mallocMC { auto const gid = alpaka::getIdx(acc).sum(); - const auto nWorker = alpaka::getWorkDiv(acc).prod(); - const unsigned temp + auto const nWorker = alpaka::getWorkDiv(acc).prod(); + unsigned const temp = heapPtr->template getAvailaibleSlotsDeviceFunction(acc, numBytes, gid, nWorker); if(temp) alpaka::atomicOp(acc, slots, temp); @@ -1332,13 +1333,13 @@ namespace mallocMC * @param slotSize the size of allocatable elements to count */ template - ALPAKA_FN_ACC auto getAvailableSlotsAccelerator(const AlpakaAcc& acc, size_t slotSize) -> unsigned + ALPAKA_FN_ACC auto getAvailableSlotsAccelerator(AlpakaAcc const& acc, size_t slotSize) -> unsigned { int const wId = warpid_withinblock(acc); // do not use warpid-function, since // this value is not guaranteed to // be stable across warp lifetime - const uint32 activeThreads = alpaka::popcount(acc, alpaka::warp::activemask(acc)); + uint32 const activeThreads = alpaka::popcount(acc, alpaka::warp::activemask(acc)); constexpr auto warpsize = warpSize; auto& activePerWarp = alpaka::declareSharedVar< @@ -1361,7 +1362,7 @@ namespace mallocMC // printf("Block %d, id %d: activeThreads=%d // linearId=%d\n",blockIdx.x,threadIdx.x,activeThreads,linearId); - const unsigned temp + unsigned const temp = this->template getAvailaibleSlotsDeviceFunction(acc, slotSize, linearId, activeThreads); if(temp) alpaka::atomicOp(acc, &warpResults[wId], temp); diff --git a/src/include/mallocMC/device_allocator.hpp b/src/include/mallocMC/device_allocator.hpp index f9822a3d..0f6fe090 100644 --- a/src/include/mallocMC/device_allocator.hpp +++ b/src/include/mallocMC/device_allocator.hpp @@ -76,7 +76,7 @@ namespace mallocMC } bytes = AlignmentPolicy::applyPadding(bytes); DistributionPolicy distributionPolicy(acc); - const uint32 req_size = distributionPolicy.collect(acc, bytes); + uint32 const req_size = distributionPolicy.collect(acc, bytes); void* memBlock = CreationPolicy::template AlignmentAwarePolicy::create(acc, req_size); if(CreationPolicy::isOOM(memBlock, req_size)) { @@ -86,7 +86,7 @@ namespace mallocMC } template - ALPAKA_FN_ACC void free(const AlpakaAcc& acc, void* pointer) + ALPAKA_FN_ACC void free(AlpakaAcc const& acc, void* pointer) { if(pointer != nullptr) { diff --git a/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp b/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp index c4875a10..fbfdd2d3 100644 --- a/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp +++ b/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp @@ -38,6 +38,7 @@ #include #include + #include #include #include diff --git a/src/include/mallocMC/mallocMC_utils.hpp b/src/include/mallocMC/mallocMC_utils.hpp index 
2a2f7260..ad43eb49 100644 --- a/src/include/mallocMC/mallocMC_utils.hpp +++ b/src/include/mallocMC/mallocMC_utils.hpp @@ -36,6 +36,7 @@ #include #include + #include #ifdef _MSC_VER @@ -104,7 +105,7 @@ namespace mallocMC template // ALPAKA_FN_ACC resolves to `__host__ __device__` if we're not in CUDA_ONLY_MODE. But the assembly instruction is // specific to the device and cannot be compiled on the host. So, we need an explicit `__device__` here.` - __device__ inline auto warpid(alpaka::AccGpuCudaRt const& /*acc*/) -> uint32_t + inline __device__ auto warpid(alpaka::AccGpuCudaRt const& /*acc*/) -> uint32_t { std::uint32_t mywarpid = 0; asm("mov.u32 %0, %%warpid;" : "=r"(mywarpid)); From 745cb2a33313596a0109578bdbd73478156db0ab Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Tue, 12 Nov 2024 14:32:02 +0100 Subject: [PATCH 07/16] Add FlatterScatter --- .../creationPolicies/FlatterScatter.hpp | 456 ++++++++++ .../FlatterScatter/AccessBlock.hpp | 823 ++++++++++++++++++ .../FlatterScatter/BitField.hpp | 533 ++++++++++++ .../FlatterScatter/DataPage.hpp | 42 + .../FlatterScatter/PageInterpretation.hpp | 343 ++++++++ .../FlatterScatter/wrappingLoop.hpp | 73 ++ src/include/mallocMC/mallocMC.hpp | 1 + 7 files changed, 2271 insertions(+) create mode 100644 src/include/mallocMC/creationPolicies/FlatterScatter.hpp create mode 100644 src/include/mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp create mode 100644 src/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp create mode 100644 src/include/mallocMC/creationPolicies/FlatterScatter/DataPage.hpp create mode 100644 src/include/mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp create mode 100644 src/include/mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp diff --git a/src/include/mallocMC/creationPolicies/FlatterScatter.hpp b/src/include/mallocMC/creationPolicies/FlatterScatter.hpp new file mode 100644 index 00000000..c57c3da4 --- /dev/null +++ b/src/include/mallocMC/creationPolicies/FlatterScatter.hpp @@ -0,0 +1,456 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz, Rene Widera + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +#pragma once + +#include "mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace mallocMC::CreationPolicies::FlatterScatterAlloc +{ + /** + * @class Heap + * @brief Main interface to our heap memory. + * + * This class stores the heap pointer and the heap size and provides the high-level functionality to interact with + * the memory within kernels. It is wrapped in a thin layer of creation policy to be instantiated as base class of + * the `DeviceAllocator` for the user. + * + * @tparam T_HeapConfig Struct containing information about the heap. + * @tparam T_HashConfig Struct providing a hash function for scattering and the blockStride property. + * @tparam T_AlignmentPolicy The alignment policy used in the current configuration. + */ + template + struct Heap + { + using MyAccessBlock = AccessBlock; + + static_assert( + T_HeapConfig::accessblocksize + < std::numeric_limits>::max(), + "Your access block size must be smaller than the maximal value of its signed type because we are using " + "differences in the code occasionally."); + + static_assert( + T_HeapConfig::pagesize < std::numeric_limits>::max(), + "Your page size must be smaller than the maximal value of its signed type because we are using " + "differences in the code occasionally."); + + static_assert( + T_HeapConfig::accessblocksize == sizeof(MyAccessBlock), + "The real access block must have the same size as configured in order to make alignment more easily " + "predictable."); + + size_t heapSize{}; + MyAccessBlock* accessBlocks{}; + uint32_t volatile block = 0U; + + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto init() -> void + { + for(uint32_t i = 0; i < numBlocks(); ++i) + { + accessBlocks[i].init(); + } + } + + /** + * @brief Number of access blocks in the heap. This is a runtime quantity because it depends on the given heap + * size. + * + * @return Number of access blocks in the heap. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto numBlocks() const -> uint32_t + { + return heapSize / T_HeapConfig::accessblocksize; + } + + /** + * @brief The dummy value to indicate the case of no free blocks found. + * + * @return An invalid block index for identifying such case. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto noFreeBlockFound() const -> uint32_t + { + return numBlocks(); + } + + /** + * @brief Compute a starting index to search the access blocks for a valid piece of memory. + * + * @param blockValue Current starting index to compute the next one from. + * @param hashValue A hash value to provide some entropy for scattering the requests. + * @return An index to start search the access blocks from. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto startBlockIndex( + auto const& /*acc*/, + uint32_t const blockValue, + uint32_t const hashValue) + { + return ((hashValue % T_HashConfig::blockStride) + (blockValue * T_HashConfig::blockStride)) % numBlocks(); + } + + /** + * @brief Create a pointer to memory of (at least) `bytes` number of bytes.. + * + * @param bytes Size of the allocation in number of bytes. + * @return Pointer to the memory, nullptr if no usable memory was found. 
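         *
         * A minimal in-kernel usage sketch (illustrative only; `acc` is the alpaka accelerator passed to
         * the kernel and `heap` an assumed pointer to an initialised Heap, neither name is introduced by
         * this patch):
         *
         *     void* chunk = heap->create(acc, 256U);
         *     if(chunk != nullptr)
         *     {
         *         // ... use the (at least) 256 bytes behind `chunk` ...
         *         heap->destroy(acc, chunk);
         *     }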
+ */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32_t const bytes) -> void* + { + auto blockValue = block; + auto hashValue = T_HashConfig::template hash(acc, bytes); + auto startIdx = startBlockIndex(acc, blockValue, hashValue); + return wrappingLoop( + acc, + startIdx, + numBlocks(), + static_cast(nullptr), + [this, bytes, startIdx, &hashValue, blockValue](auto const& localAcc, auto const index) mutable + { + auto ptr = accessBlocks[index].create(localAcc, bytes, hashValue); + if(!ptr && index == startIdx) + { + // This is not thread-safe but we're fine with that. It's just a fuzzy thing to occasionally + // increment and it's totally okay if its value is not quite deterministic. + if(blockValue == block) + { + block = blockValue + 1; + } + } + return ptr; + }); + } + + /** + * @brief Counterpart free'ing operation to `create`. Destroys the memory at the pointer location. + * + * @param pointer A valid pointer created by `create()`.` + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto destroy(AlpakaAcc const& acc, void* pointer) -> void + { + // indexOf requires the access block size instead of blockSize in case the reinterpreted AccessBlock + // object is smaller than blockSize. + auto blockIndex = indexOf(pointer, accessBlocks, sizeof(MyAccessBlock)); + accessBlocks[blockIndex].destroy(acc, pointer); + } + + /** + * @brief Queries all access blocks how many chunks of the given chunksize they could allocate. This is + * single-threaded and NOT THREAD-SAFE but acquiring such distributed information while other threads operate + * on the heap is of limited value anyways. + * + * @param chunkSize Target would-be-created chunk size in number of bytes. + * @return The number of allocations that would still be possible with this chunk size. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getAvailableSlotsDeviceFunction(auto const& acc, uint32_t const chunkSize) + -> size_t + { + // TODO(lenz): Not thread-safe. + return std::transform_reduce( + accessBlocks, + accessBlocks + numBlocks(), + 0U, + std::plus{}, + [&acc, chunkSize](auto& accessBlock) { return accessBlock.getAvailableSlots(acc, chunkSize); }); + } + + /** + * @brief Forwards to `getAvailableSlotsDeviceFunction` for interface compatibility reasons. See there for + * details. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getAvailableSlotsAccelerator(auto const& acc, uint32_t const chunkSize) + -> size_t + { + return getAvailableSlotsDeviceFunction(acc, chunkSize); + } + + protected: + // This class is supposed to be instantiated as a parent for the `DeviceAllocator`. + Heap() = default; + }; + + constexpr uint32_t defaultBlockSize = 128U * 1024U * 1024U; + constexpr uint32_t defaultPageSize = 128U * 1024U; + + /** + * @class DefaultHeapConfig + * @brief An example configuration for the heap. + * + * A heap configuration is supposed to provide the physical dimensions of the objects in the heap (i.e. access + * block and page) as well as a function that describes how much space you are willing to waste by allowing to + * allocate larger chunks that necessary. + * + * @tparam T_blockSize The size of one access block in bytes. + * @tparam T_pageSize The size of one page in bytes. 
+ * @return + */ + template< + uint32_t T_blockSize = defaultBlockSize, + uint32_t T_pageSize = defaultPageSize, + uint32_t T_wasteFactor = 2U> + struct DefaultHeapConfig + { + static constexpr uint32_t const accessblocksize = T_blockSize; + static constexpr uint32_t const pagesize = T_pageSize; + static constexpr uint32_t const wastefactor = T_wasteFactor; + static constexpr bool const resetfreedpages = true; + + /** + * @brief Determine whether we want to allow an allocation of numBytes on a page with chunk size `chunkSize`. + * + * This function is given the currently requested allocation size numBytes and the set chunk size of a page. It + * answers the question whether we should consider this page for allocating this memory. It must necessarily + * return false if chunkSize < numBytes in order to avoid memory corruption. It may return true in cases where + * chunkSize > numBytes to trade off a bit of wasted memory for a performance boost while searching available + * memory. + * + * @param chunkSize Currently set chunk size of a page in number of bytes. + * @param numBytes Allocation size in number of bytes. + * @return true if the algorithm shall consider this page for allocation and false otherwise. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto isInAllowedRange( + auto const& /*acc*/, + uint32_t const chunkSize, + uint32_t const numBytes) + { + return (chunkSize >= numBytes && chunkSize <= wastefactor * numBytes); + } + }; + + /** + * @class DefaultFlatterScatterHashConfig + * @brief An example configuration for the hash scattering. + * + * A scatter configuration is supposed to provide two pieces of information: A static function called `hash` and + * the compile-time constant `blockStride`. These are used by the creation policy to scatter the requests for + * memory within the heap. + * + */ + struct DefaultFlatterScatterHashConfig + { + public: + static constexpr uint32_t blockStride = 4; + + /** + * @brief Hash function to provide entropy for scattering memory requests. + * + * @param numBytes Number of bytes requested. + * @return A hash value. + */ + // TAcc is to be deduced, so we put it last. + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto hash(TAcc const& acc, uint32_t const numBytes) -> uint32_t + { + uint32_t const relative_offset = warpSize * numBytes / T_pageSize; + return ( + numBytes * hashingK + hashingDistMP * smid(acc) + + (hashingDistWP + hashingDistWPRel * relative_offset) * warpid(acc)); + } + + private: + static constexpr uint32_t hashingK = 38183; + static constexpr uint32_t hashingDistMP = 17497; + static constexpr uint32_t hashingDistWP = 1; + static constexpr uint32_t hashingDistWPRel = 1; + }; + + /** + * @class InitKernel + * @brief Kernel to initialise the heap memory. + * + * Used by the creation policy during initialisation. + */ + struct InitKernel + { + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto operator()( + auto const& /*unused*/, + Heap* m_heap, + void* m_heapmem, + size_t const m_memsize) const + { + m_heap->accessBlocks + = static_cast::MyAccessBlock*>(m_heapmem); + m_heap->heapSize = m_memsize; + m_heap->init(); + } + }; + +} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc + +namespace mallocMC::CreationPolicies +{ + /** + * @class FlatterScatter + * @brief A creation policy scattering memory requests in a flat hierarchy. + * + * This creation policy is a variation on the original ScatterAlloc algorithm and the one previously implemented in + * mallocMC. 
It provides a multi-level hierarchy of Heap, AccessBlock and DataPage that is traversed using the + * metadata held by each level to find a suitable memory location to satisfy the request. + * + * It uses a externally provided hash function to compute a single hash value for each request given its requested + * number of bytes and the accelerator. This is internally used to scatter the requests over the available memory + * and thereby improve the success rate for multi-threaded requests because different threads will start searching + * in different locations. + * + * Implemented as a thin wrapper around `Heap` that mainly provides interface compatibility with the calling code. + */ + template + struct FlatterScatterImpl + { + template + using AlignmentAwarePolicy = FlatterScatterAlloc::Heap; + + static auto classname() -> std::string + { + return "FlatterScatter"; + } + + static constexpr auto const providesAvailableSlots = true; + + /** + * @brief Check if a pointer returned from `create()` signals out-of-memory. + * + * @param pointer Pointer returned by `create()`. + * @return The boolean answer to this question. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto isOOM(void* pointer, uint32_t const /*unused size*/) -> bool + { + return pointer == nullptr; + } + + /** + * @brief initialise a raw piece of memory for use by the `Heap`. + * + * @param dev The alpaka device. + * @param queue The alpaka queue. + * @param heap The pointer to the `Heap` object located on the device. + * @param pool The pointer to the provided memory pool to be used by the `Heap` object. + * @param memsize The size of the pool memory in bytes. + */ + template + static void initHeap(auto& dev, auto& queue, auto* heap, void* pool, size_t memsize) + { + using Dim = typename alpaka::trait::DimType::type; + using Idx = typename alpaka::trait::IdxType::type; + using VecType = alpaka::Vec; + + auto poolView = alpaka::createView(dev, reinterpret_cast(pool), alpaka::Vec(memsize)); + alpaka::memset(queue, poolView, 0U); + alpaka::wait(queue); + + auto workDivSingleThread + = alpaka::WorkDivMembers{VecType::ones(), VecType::ones(), VecType::ones()}; + alpaka::exec(queue, workDivSingleThread, FlatterScatterAlloc::InitKernel{}, heap, pool, memsize); + alpaka::wait(queue); + } + + /** + * @brief Count the number of possible allocation for the given slotSize directly from the host. + * + * This method implements the infrastructure to call `getAvailableSlotsDeviceFunction` on the `Heap` class. See + * there for details, particularly concerning the thread-safety of this. + * + * @param dev The alpaka device. + * @param queue The alpaka queue. + * @param slotSize The would-be-created memory size in number of bytes. + * @param heap Pointer to the `Heap` object that's supposed to handle the request. + * @return The number of allocations that would be successful with this slotSize. 
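         *
         * A host-side sketch (illustrative only; `dev`, `queue` and `devHeap` are assumed to be the alpaka
         * device, a queue on that device and the device-side heap pointer, while `MyPolicy` stands for the
         * fully instantiated FlatterScatterImpl):
         *
         *     unsigned const slots = MyPolicy::getAvailableSlotsHost<Acc>(dev, queue, 1024U, devHeap);
         *     // `slots` further allocations of 1024 bytes each would currently succeed (a snapshot only,
         *     // not thread-safe with respect to concurrent allocations).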
+ */ + template + static auto getAvailableSlotsHost( + AlpakaDevice& dev, + AlpakaQueue& queue, + uint32_t const slotSize, + T_DeviceAllocator* heap) -> unsigned + { + using Dim = typename alpaka::trait::DimType::type; + using Idx = typename alpaka::trait::IdxType::type; + using VecType = alpaka::Vec; + + auto d_slots = alpaka::allocBuf(dev, uint32_t{1}); + alpaka::memset(queue, d_slots, 0, uint32_t{1}); + auto d_slotsPtr = alpaka::getPtrNative(d_slots); + + auto getAvailableSlotsKernel = [heap, slotSize, d_slotsPtr] ALPAKA_FN_ACC(AlpakaAcc const& acc) -> void + { *d_slotsPtr = heap->getAvailableSlotsDeviceFunction(acc, slotSize); }; + + alpaka::wait(queue); + alpaka::exec( + queue, + alpaka::WorkDivMembers{VecType::ones(), VecType::ones(), VecType::ones()}, + getAvailableSlotsKernel); + alpaka::wait(queue); + + auto const platform = alpaka::Platform{}; + auto const hostDev = alpaka::getDevByIdx(platform, 0); + + auto h_slots = alpaka::allocBuf(hostDev, Idx{1}); + alpaka::memcpy(queue, h_slots, d_slots); + alpaka::wait(queue); + + return *alpaka::getPtrNative(h_slots); + } + }; + + template< + typename T_HeapConfig = FlatterScatterAlloc::DefaultHeapConfig<>, + typename T_HashConfig = FlatterScatterAlloc::DefaultFlatterScatterHashConfig, + typename T_AlignmentPolicy = void> + struct FlatterScatter + { + template + using AlignmentAwarePolicy = FlatterScatterImpl; + + struct Properties + { + using HeapConfig = T_HeapConfig; + using HashConfig = T_HashConfig; + }; + }; + + +} // namespace mallocMC::CreationPolicies diff --git a/src/include/mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp b/src/include/mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp new file mode 100644 index 00000000..70e60bf7 --- /dev/null +++ b/src/include/mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp @@ -0,0 +1,823 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz, Rene Widera + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +#pragma once + +#include "mallocMC/creationPolicies/FlatterScatter/BitField.hpp" +#include "mallocMC/creationPolicies/FlatterScatter/DataPage.hpp" +#include "mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp" +#include "mallocMC/mallocMC_utils.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace mallocMC::CreationPolicies::FlatterScatterAlloc +{ + + /** + * @class PageTable + * @brief Storage for AccessBlock's metadata + */ + template + struct PageTable + { + uint32_t chunkSizes[T_numPages]{}; + uint32_t fillingLevels[T_numPages]{}; + + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto cleanup() -> void + { + std::fill(std::begin(chunkSizes), std::end(chunkSizes), 0U); + std::fill(std::begin(fillingLevels), std::end(fillingLevels), 0U); + } + }; + + /** + * @class Padding + * @brief Empty memory to pad the AccessBlock to the correct size + */ + template + struct Padding + { + char padding[T_size]{}; + }; + + /** + * @brief The C++ standard disallows zero-size arrays, so we specialise for this case. + */ + template<> + struct Padding<0U> + { + }; + + /** + * @class AccessBlock + * @brief Coarsest memory division unit containing fixed-size pages of raw memory and metadata about their chunk + * size and filling level + * + * @tparam T_HeapConfig A struct with compile-time information about the setup + * @tparam T_AlignmentPolicy The alignment policy in use for optimisation purposes + */ + template + class AccessBlock + { + protected: + static constexpr uint32_t const blockSize = T_HeapConfig::accessblocksize; + static constexpr uint32_t const pageSize = T_HeapConfig::pagesize; + static constexpr uint32_t const wasteFactor = T_HeapConfig::wastefactor; + static constexpr bool const resetfreedpages = T_HeapConfig::resetfreedpages; + + using MyPageInterpretation = PageInterpretation; + + // This class is supposed to be reinterpeted on a piece of raw memory and not instantiated directly. We set it + // protected, so we can still test stuff in the future easily. + AccessBlock() + { + init(); + } + + public: + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto init() -> void + { + pageTable.cleanup(); + constexpr uint32_t dummyChunkSize = 1U; + for(auto& page : pages) + { + MyPageInterpretation(page, dummyChunkSize).cleanupFull(); + } + } + + /** + * @brief Compute the number of pages in the access block taking into account the space needed for metadata. + * + * @return The number of pages in the access block. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto numPages() -> uint32_t + { + constexpr auto numberOfPages = blockSize / (pageSize + sizeof(PageTable<1>)); + // check that the page table entries does not have a padding + static_assert(sizeof(PageTable) == numberOfPages * sizeof(PageTable<1>)); + return numberOfPages; + } + + /** + * @brief Answers the question: How many successful allocations with the given size are still possible? + * CAUTION: Not thread-safe! + * + * This method looks up the metadata for all its pages and computes the number of available slots with the + * given chunk size. By doing so, the information this method is queried for is inherently not thread-safe + * because if other threads are (de-)allocating memory during this look up, the information about each + * individual page will be stale as soon as it is retrieved. 
However, beyond this inherent non-thread-safety we + * made no effort so far to leverage parallelism or make it use atomics, i.e., move into the direction of + * consistency in the multi-threaded case. It is supposed to run in a single thread without any interference. + * + * @param chunkSize The number of bytes the would-be allocations request + * @return The number of available slots with this chunk size. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getAvailableSlots(auto const& acc, uint32_t const chunkSize) const + -> uint32_t + { + if(chunkSize < multiPageThreshold()) + { + return getAvailableChunks(acc, chunkSize); + } + return getAvailableMultiPages(acc, chunkSize); + } + + /** + * @brief Compute the index of the page a pointer points to. + * + * @param pointer Memory location inside of the data part of this access block. + * @return The index of the page this pointer points to. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto pageIndex(void* pointer) const -> int32_t + { + return indexOf(pointer, pages, pageSize); + } + + /** + * @brief Verifies that a pointer points to a valid piece of memory. CAUTION: Not thread-safe! + * + * This method checks if a pointer is valid, meaning that it points to a chunk of memory that is marked as + * allocated. The information it provides is inherently not thread-safe because if other threads are operating + * on the memory, the retrieved information is stale the moment it was looked up. It is, however, consistent in + * that it uses atomics to retrieve this information, so if the pointer is valid and does not get destroyed + * between looking up the answer and using it (for example in the scenario where I'm the only one knowing about + * this pointer), the answer is valid. + * + * @param pointer Pointer to validate + * @return true if the pointer is valid else false + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isValid(TAcc const& acc, void* const pointer) -> bool + { + if(pointer == nullptr) + { + return false; + } + auto const index = pageIndex(pointer); + auto chunkSize = atomicLoad(acc, pageTable.chunkSizes[index]); + if(chunkSize >= pageSize) + { + return true; + } + return chunkSize == 0U or atomicLoad(acc, pageTable.fillingLevels[index]) == 0U + ? false + : interpret(index, chunkSize).isValid(acc, pointer); + } + + /** + * @brief Allocate a piece of memory of the given size. + * + * This method attempts to allocate a piece of memory of (at least) numBytes bytes. The actual size might be + * larger (depending on the user-provided compile-time configuration of the AccessBlock) but is not + * communicated, so it is not allowed to access the pointer outside the requested range. It returns a nullptr + * if there is no memory available. The hashValue is used to scatter memory accesses. A cheap operation will be + * performed to transform it into a page index to start the search at. It is also handed to the lower levels to + * be used similarly. Having it default to 0 makes it easier for testing. The effect of this method is reverted + * by the destroy method. + * + * @param numBytes Required size of memory in bytes + * @param hashValue Optional number to scatter memory access. 
+ * @return A pointer to an allocated piece of memory or nullptr if no memory is available + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto create( + TAcc const& acc, + uint32_t const numBytes, + uint32_t const hashValue = 0U) -> void* + { + void* pointer{nullptr}; + if(numBytes >= multiPageThreshold()) + { + pointer = createOverMultiplePages(acc, numBytes, hashValue); + } + else + { + pointer = createChunk(acc, numBytes, hashValue); + } + return pointer; + } + + /** + * @brief Free up the memory a valid pointer points to. + * + * This method attempts to destroy the memory of a valid pointer created by the create method. It reverses the + * effect of the create method and makes the allocated memory available for re-allocation. After calling this + * method on a pointer it is invalid and may no longer be used for memory access. Invalid pointers are ignored + * and a failure of this method is not communicated in production. In debug mode various exceptions can be + * thrown for different forms of invalid pointers. + * + * @param pointer A pointer created by the create method. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto destroy(TAcc const& acc, void* const pointer) -> void + { + auto const index = pageIndex(pointer); + if(index >= static_cast(numPages()) || index < 0) + { +#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) + throw std::runtime_error{ + "Attempted to destroy an invalid pointer! Pointer does not point to any page."}; +#endif // NDEBUG + return; + } + auto const chunkSize = atomicLoad(acc, pageTable.chunkSizes[index]); + if(chunkSize >= multiPageThreshold()) + { + destroyOverMultiplePages(acc, index, chunkSize); + } + else + { + destroyChunk(acc, pointer, index, chunkSize); + } + } + + private: + DataPage pages[numPages()]{}; + PageTable pageTable{}; + Padding padding{}; + + /** + * @brief The number of bytes at which allocation switch to "multi-page mode", i.e., allocate full pages. + * + * It is obvious that this number can be at most page size subtracted by the size of one bit mask. There is, + * however, no strict lower bound because we theoretically disregard the lower levels completely by this + * switch. If we reasonably assume that our lower hierarchy levels add value (i.e. performance) to our + * implementation, a reasonable lower bound would be the size at which only a single allocation fits onto a + * page. This method could be used for fine-tuning performance in that sense. + * + * @return The number of bytes at which to switch to multi-page mode. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto multiPageThreshold() -> uint32_t + { + return ceilingDivision(pageSize - sizeof(BitMaskStorageType<>), 2U); + } + + /** + * @brief Convenience method that creates a PageInterpretation from a page identified by its page index and a + * chunk size. + * + * @param pageIndex Identifies the page in the array of raw pages. + * @param chunkSize Chunk size for which to interpret the page. + * @return A page interpretation of the requested page. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto interpret(uint32_t const pageIndex, uint32_t const chunkSize) + { + return MyPageInterpretation(pages[pageIndex], chunkSize); + } + + /** + * @brief Branch of getAvailableSlots for chunk sizes below the multi-page threshold. See there for details. + * + * @param chunkSize Would-be allocation size to test for. + * @return Number of allocations that would succeed with this size. 
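+ *
+ * @note A small worked example of the counting logic (the numbers are purely illustrative): for a 256-byte
+ * request and three pages with (chunkSize, fillingLevel) = (0, 0), (256, 10) and (128, 3), the first page
+ * contributes numChunks(256) slots (it is still unset and could adopt the requested chunk size), the second
+ * contributes numChunks(256) - 10, and the third contributes nothing because a 128-byte chunk cannot serve a
+ * 256-byte request.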
+ */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getAvailableChunks(auto const& acc, uint32_t const chunkSize) const + -> uint32_t + { + // TODO(lenz): This is not thread-safe! + return std::transform_reduce( + std::cbegin(pageTable.chunkSizes), + std::cend(pageTable.chunkSizes), + std::cbegin(pageTable.fillingLevels), + 0U, + std::plus{}, + [this, &acc, chunkSize](auto const localChunkSize, auto const fillingLevel) + { + auto const numChunks + = MyPageInterpretation::numChunks(localChunkSize == 0 ? chunkSize : localChunkSize); + return ((this->isInAllowedRange(acc, localChunkSize, chunkSize) or localChunkSize == 0U) + and fillingLevel < numChunks) + ? numChunks - fillingLevel + : 0U; + }); + } + + /** + * @brief Branch of getAvailableSlots for chunk sizes above the multi-page threshold. See there for details. + * + * @param chunkSize Would-be allocation size to test for. + * @return Number of allocations that would succeed with this size. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getAvailableMultiPages(auto const& /*acc*/, uint32_t const chunkSize) const + -> uint32_t + { + // TODO(lenz): This is not thread-safe! + auto numPagesNeeded = ceilingDivision(chunkSize, pageSize); + if(numPagesNeeded > numPages()) + { + return 0U; + } + uint32_t sum = 0U; + for(uint32_t i = 0; i < numPages() - numPagesNeeded + 1;) + { + if(std::all_of( + pageTable.chunkSizes + i, + pageTable.chunkSizes + i + numPagesNeeded, + [](auto const& val) { return val == 0U; })) + { + sum += 1; + i += numPagesNeeded; + } + else + { + ++i; + } + } + return sum; + } + + /** + * @brief Creation algorithm in multi-page mode. + * + * In this mode, we have decided to ignore all the lower level hierarchy. The algorithm simplifies accordingly + * and a few optimisations can be done. It can however be quite cumbersome to find a sufficient number of + * contiguous pages, so this will likely be most performant for small sizes. + * + * @param numBytes Required allocation size in number of bytes. + * @param hashValue A hash value used to scatter memory access. + * @return Pointer to a valid piece of memory or nullptr if no such memory was found. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto createOverMultiplePages( + auto const& acc, + uint32_t const numBytes, + uint32_t hashValue) -> void* + { + auto numPagesNeeded = ceilingDivision(numBytes, +pageSize); + if(numPagesNeeded > numPages()) + { + return static_cast(nullptr); + } + + // We take a little head start compared to the chunked case in order to not have them interfere with our + // laborious search for contiguous pages. + auto startIndex = startPageIndex(acc, hashValue) + numPagesNeeded; + return wrappingLoop( + acc, + startIndex, + numPages() - (numPagesNeeded - 1), + static_cast(nullptr), + [&](auto const& localAcc, auto const& firstIndex) + { + void* result{nullptr}; + auto numPagesAcquired = acquirePages(localAcc, firstIndex, numPagesNeeded); + if(numPagesAcquired == numPagesNeeded) + { + // At this point, we have acquired all the pages we need and nobody can mess with them anymore. + // We still have to set the chunk size correctly. + setChunkSizes(localAcc, firstIndex, numPagesNeeded, numBytes); + result = &pages[firstIndex]; + } + else + { + releasePages(localAcc, firstIndex, numPagesAcquired); + } + return result; + }); + } + + /** + * @brief Short-circuiting acquisition of multiple contiguous pages. + * + * The algorithm attempts to acquire the requested number of pages starting from firstIndex locking them by + * setting their filling level to page size. 
It returns when either all requested pages are acquired or an + * already occupied page was hit. In either case, it returns the number of successful acquisitions. This method + * does not clean up after itself, i.e., it does not release the pages in case of failure. + * + * @param firstIndex Start index of the array of contiguous pages. + * @param numPagesNeeded Number of pages to be acquired. + * @return Number of pages that were successfully acquired. This is smaller than numPagesNeeded if the method + * failed. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto acquirePages( + auto const& acc, + uint32_t const firstIndex, + uint32_t const numPagesNeeded) -> uint32_t + { + uint32_t index = 0U; + uint32_t oldFilling = 0U; + for(index = 0U; index < numPagesNeeded; ++index) + { + oldFilling = alpaka::atomicCas(acc, &pageTable.fillingLevels[firstIndex + index], 0U, +pageSize); + if(oldFilling != 0U) + { + break; + } + } + return index; + } + + /** + * @brief Counterpart to acquirePages for doing the clean-up in case of failure. + * + * This method starts from page firstIndex and releases the lock of numPagesAcquired contiguous pages. This is + * supposed to be called in the case of failure of acquirePages to release the already acquired pages. + * + * @param firstIndex Start index of the array of contiguous pages. + * @param numPagesAcquired Number of pages to be released. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto releasePages( + auto const& acc, + uint32_t const firstIndex, + uint32_t const numPagesAcquired) -> void + { + for(uint32_t index = 0U; index < numPagesAcquired; ++index) + { + alpaka::atomicSub(acc, &pageTable.fillingLevels[firstIndex + index], +pageSize); + } + } + + /** + * @brief Set the chunk sizes of a contiguous array of pages. + * + * This function assumes that all the pages are locked by the current thread and performs a hard set operation + * without checking the previous content. + * + * @param firstIndex Start index of the contiguous array of pages. + * @param numPagesNeeded The number of pages to set the chunk size on. + * @param numBytes Chunk size to be set in number of bytes. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto setChunkSizes( + auto const& acc, + uint32_t const firstIndex, + uint32_t const numPagesNeeded, + uint32_t const numBytes) -> void + { + for(uint32_t numPagesAcquired = 0U; numPagesAcquired < numPagesNeeded; ++numPagesAcquired) + { + // At this point in the code, we have already locked all the pages. So, we literally don't care what + // other threads thought this chunk size would be because we are the only ones legitimately messing + // with this page. This chunk size may be non-zero because we could have taken over a page before it + // was properly cleaned up. That is okay for us because we're handing out uninitialised memory anyways. + // But it is very important to record the correct chunk size here, so the destroy method later on knows + // how to handle this memory. + alpaka::atomicExch(acc, &pageTable.chunkSizes[firstIndex + numPagesAcquired], numBytes); + } + } + + /** + * @brief Special return value for an unsuccessful search of available pages. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto noFreePageFound() + { + return numPages(); + } + + /** + * @brief Compute an index where to start searching for a free page from a hash value. + * + * @param hashValue Hash value to introduce some entropy here. + * @return Start index for searching a free page. 
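+ *
+ * @note Worked example with made-up numbers: for hashValue = 0x12345678 the shift below discards the low byte,
+ * leaving 0x123456 = 1193046, so with numPages() = 100 the search would start at page 1193046 % 100 = 46.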
+ */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto startPageIndex(auto const& /*acc*/, uint32_t const hashValue) + { + return (hashValue >> 8U) % numPages(); + } + + /** + * @brief Helper that combines the necessary checks to ensure a page index is valid. + * + * @param index The page index to check. + * @return true if the page index is valid and false otherwise + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isValidPageIdx(uint32_t const index) const -> bool + { + return index != noFreePageFound() && index < numPages(); + } + + /** + * @brief Main algorithm to create a chunk of memory on a page. + * + * This is the main algorithm for creating a chunk of memory. It searches for a free page and instructs it to + * create some memory. If successful, it returns this pointer. If not, it searches on. + * + * @param numBytes Number of bytes required. + * @param hashValue A hash value used to scatter the memory accesses. + * @return A pointer to a valid piece of memory or nullptr if no available memory could be found. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto createChunk( + TAcc const& acc, + uint32_t const numBytes, + uint32_t const hashValue) -> void* + { + auto index = startPageIndex(acc, hashValue); + + // Under high pressure, this loop could potentially run for a long time because the information where and + // when we started our search is not maintained and/or used. This is a feature, not a bug: Given a + // consistent state, the loop will terminate once a free chunk is found or when all chunks are filled for + // long enough that `choosePage` could verify that each page is filled in a single run. + // + // The seemingly non-terminating behaviour that we wrap around multiple times can only occur (assuming a + // consistent, valid state of the data) when there is high demand for memory such that pages that appear + // free to `choosePage` are repeatedly found but then the free chunks are scooped away by other threads. + // + // In the latter case, it is considered desirable to wrap around multiple times until the thread was fast + // enough to acquire some memory. + void* pointer = nullptr; + do + { + // TODO(lenz): This can probably be index++. + index = (index + 1) % numPages(); + uint32_t chunkSize = numBytes; + index = choosePage(acc, numBytes, index, chunkSize); + if(isValidPageIdx(index)) + { + pointer = MyPageInterpretation{pages[index], chunkSize}.create(acc, hashValue); + if(pointer == nullptr) + { + leavePage(acc, index); + } + } + } while(isValidPageIdx(index) and pointer == nullptr); + return pointer; + } + + /** + * @brief Main loop running over all pages checking for available ones. + * + * It is important to stress that the information about availability of the returned page is already stale when + * it is returned. Thus, it can well happen that an actual allocation attempt on this page still fails, e.g., + * because another thread was faster and scooped away that piece of memory. + * + * @param numBytes Required allocation size in number of bytes. + * @param startIndex Index of the page to start the search from. + * @param chunkSizeCache A memory location to store a local copy of the current chunk size. Used for + * optimisation by reducing the number of atomic lookups. + * @return A page index to a potntially available page or noFreePageFound() if none was found. 
+ */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto choosePage( + TAcc const& acc, + uint32_t const numBytes, + uint32_t const startIndex, + uint32_t& chunkSizeCache) -> uint32_t + { + return wrappingLoop( + acc, + startIndex, + numPages(), + noFreePageFound(), + [this, numBytes, &chunkSizeCache](auto const& localAcc, auto const index) { + return this->thisPageIsSuitable(localAcc, index, numBytes, chunkSizeCache) ? index + : noFreePageFound(); + }); + } + + /** + * @brief Helper function combining checks to match the requested number of bytes with a found chunk size + * taking into account the waste factor. + * + * @param chunkSize Actually found chunk sizes of a page in number of bytes + * @param numBytes Requested allocation size in number of bytes. + * @return + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isInAllowedRange( + auto const& acc, + uint32_t const chunkSize, + uint32_t const numBytes) const + { + return T_HeapConfig::isInAllowedRange(acc, chunkSize, numBytes); + } + + /** + * @brief Checks if a page is usable for allocation of numBytes and enters it. + * + * This method looks up the metdata of the page identified by its index to check if we can hope for a + * successful allocation there. In doing so, it enters the page (i.e. increments its filling level) and, if + * necessary, already sets the correct chunk size. In a multi-threaded context the separate concerns of + * checking and setting cannot be split because the information used for the check would already be stale at + * the time of setting anything. If it returns true, the filling level and chunk sizes are thus suitable for + * proceeding further and the caller is responsible for cleaning up appropriately if a failure at a later stage + * occurs. If it returns false, it has already cleaned up everything itself and there is no further action + * required on the caller's side. + * + * @param index Index to identify the page among the raw data pages. + * @param numBytes Requested allocation size in number of bytes. + * @param chunkSizeCache A memory location to store a local copy of the current chunk size. Used for + * optimisation by reducing the number of atomic lookups. + * @return true if the page is suitable and false otherwise + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto thisPageIsSuitable( + TAcc const& acc, + uint32_t const index, + uint32_t const numBytes, + uint32_t& chunkSizeCache) -> bool + { + bool suitable = false; + auto oldFilling = enterPage(acc, index); + + // At this point, we're only testing against our desired `numBytes`. Due to the `wastefactor` the actual + // `chunkSize` of the page might be larger and, thus, the actual `numChunks` might be smaller than what + // we're testing for here. But if this fails already, we save one atomic. + if(oldFilling < MyPageInterpretation::numChunks(numBytes)) + { + uint32_t oldChunkSize = alpaka::atomicCas(acc, &pageTable.chunkSizes[index], 0U, numBytes); + chunkSizeCache = oldChunkSize == 0U ? numBytes : oldChunkSize; + + // Now that we know the real chunk size of the page, we can check again if our previous assessment was + // correct. But first we need to make sure that we are actually in chunked mode. This will be redundant + // with the second check in most situations because we usually would choose a multi-page threshold that + // would not switch to multi-page mode while more than one chunk fits on the page but this is a design + // decision that could change in the future. 
+ if(oldChunkSize < multiPageThreshold() + and oldFilling < MyPageInterpretation::numChunks(chunkSizeCache)) + { + suitable = isInAllowedRange(acc, chunkSizeCache, numBytes); + } + } + if(not suitable) + { + leavePage(acc, index); + } + return suitable; + } + + /** + * @brief Counterpart to createChunk freeing up a piece of memory in the chunked mode. See destroy for details. + * + * This is the most difficult part of the algorithm. We will successively remove our metadata from the various + * levels and must be extra careful which information we can still rely on. Most of this complexity is captured + * in leavePage. + * + * @param pointer Pointer to a valid piece of memory created by createChunk. + * @param pageIndex Index of the page the pointer points to. Supplying this is an optimisation because it was + * already computed on a higher level in the call stack. This information would already be contained in + * pointer. + * @param chunkSize Chunk size of the page we're operating on. This is potentially different from the size of + * memory the pointer points to due to the waste factor. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC void destroyChunk( + TAcc const& acc, + void* pointer, + uint32_t const pageIndex, + uint32_t const chunkSize) + { + auto page = interpret(pageIndex, chunkSize); + page.destroy(acc, pointer); + leavePage(acc, pageIndex); + } + + /** + * @brief Enter a page for any purpose. + * + * This method is very important. We maintain the invariant that any potentially writing access to a page + * starts by entering and ends by leaving a page. These are currently implemented as updating the filling level + * accordingly. You are not allowed to touch a page unless you have entered it (although multi-page mode uses a + * shortcut here). This implies that we always have to check the filling level before checking for the chunk + * size. + * + * @param pageIndex Identifies the page in the array of raw data pages. + * @return The old filling level for further checks. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto enterPage(TAcc const& acc, uint32_t const pageIndex) -> uint32_t + { + auto const oldFilling = alpaka::atomicAdd(acc, &pageTable.fillingLevels[pageIndex], 1U); + // We assume that this page has the correct chunk size. If not, the chunk size is either 0 (and oldFilling + // must be 0, too) or the next check will fail. + return oldFilling; + } + + /** + * @brief Leave a page after any potentially modifying operation on it. + * + * This method must be called whenever you have entered a page (using enterPage()). This is a very subtle and + * error-prone method because we are successively removing metadata and need to be extra careful which + * information and guards we can still trust. In the simplest case, there's not much to do but decrease the + * filling level but potentially we're the last thread on the page and need to clean up remaining metadata for + * the threads to come. In that case, we explicitly allow for threads to take over the page as-is to spare us + * the trouble of cleaning up. But doing so opens up many subtle ways of reordering memory accesses. Also, we + * cannot rely in much previous information (like chunk sizes looked up earlier) because other threads might + * have already updated them. Be warned! + * + * @param pageIndex Identifies the page in the array of raw data pages. 
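+ *
+ * @note The resulting usage pattern can be sketched as follows (illustrative pseudo-usage, not code taken from
+ * this file):
+ * @code
+ * auto const oldFilling = enterPage(acc, pageIndex); // announce the access by incrementing the filling level
+ * // ... inspect the chunk size, touch the bit field, hand out or reclaim a chunk ...
+ * leavePage(acc, pageIndex); // retract the announcement and clean up if we were the last ones on the page
+ * @endcode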
+ */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC void leavePage(TAcc const& acc, uint32_t const pageIndex) + { + // This outermost atomicSub is an optimisation: We can fast-track this if we are not responsible for the + // clean-up. Using 0U -> 1U in the atomicCAS and comparison further down would have the same effect (if the + // else branch contained the simple subtraction). It's a matter of which case shall have one operation + // less. + auto originalFilling = alpaka::atomicSub(acc, &pageTable.fillingLevels[pageIndex], 1U); + + if constexpr(resetfreedpages) + { + if(originalFilling == 1U) + { + // CAUTION: This section has caused a lot of headaches in the past. We're in a state where the + // filling level is 0 but we have not properly cleaned up the page and the metadata yet. This is on + // purpose because another thread might still take over this page and spare us the trouble of + // freeing everything up properly. But this other thread must take into account the possibility + // that it acquired a second-hand page. Look here if you run into another deadlock. It might well + // be related to this section. + + auto lock = pageSize; + auto latestFilling = alpaka::atomicCas(acc, &pageTable.fillingLevels[pageIndex], 0U, lock); + if(latestFilling == 0U) + { + auto chunkSize = atomicLoad(acc, pageTable.chunkSizes[pageIndex]); + if(chunkSize != 0) + { + // At this point it's guaranteed that the fiilling level is numChunks and thereby locked. + // Furthermore, chunkSize cannot have changed because we maintain the invariant that the + // filling level is always considered first, so no other thread can have passed that + // barrier to reset it. + MyPageInterpretation{pages[pageIndex], chunkSize}.cleanupUnused(); + alpaka::mem_fence(acc, alpaka::memory_scope::Device{}); + + // It is important to keep this after the clean-up line above: Otherwise another thread + // with a smaller chunk size might circumvent our lock and already start allocating before + // we're done cleaning up. + alpaka::atomicCas(acc, &pageTable.chunkSizes[pageIndex], chunkSize, 0U); + } + + // TODO(lenz): Original version had a thread fence at this point in order to invalidate + // potentially cached bit masks. Check if that's necessary! + + // At this point, there might already be another thread (with another chunkSize) on this page + // but that's fine. It won't see the full capacity but we can just subtract what we've added + // before: + alpaka::atomicSub(acc, &pageTable.fillingLevels[pageIndex], lock); + } + } + } + } + + /** + * @brief Counterpart to createOverMultiplePages, freeing up memory in multi-page mode. + * + * This method is way simpler than its chunked version because in multi-page mode we have a hard lock on the + * pages we acquired and are free to manipulate them to our will. We just make sure that releasing this lock is + * the last operation we perform. + * + * @param pageIndex Identifies the first page in the array of raw data pages. + * @param chunkSize The chunk size set on that first page (i.e. the original allocation size). + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC void destroyOverMultiplePages( + auto const& acc, + uint32_t const pageIndex, + uint32_t const chunkSize) + { + auto numPagesNeeded = ceilingDivision(chunkSize, pageSize); + for(uint32_t i = 0; i < numPagesNeeded; ++i) + { + auto myIndex = pageIndex + i; + // Everything inside the following scope is done to reset the free'd pages. 
As opposed to the chunked + // case, we decided to always perform a reset in multi-page mode regardless of the value of + // `resetfreedpages`. If you want to reinstate the old behaviour or add a second parameter + // specifically for multi-page mode, e.g., resetreedpages_multipage, just put an `if constexpr` around + // here again. + { + MyPageInterpretation{pages[myIndex], T_AlignmentPolicy::Properties::dataAlignment}.cleanupFull(); + alpaka::mem_fence(acc, alpaka::memory_scope::Device{}); + alpaka::atomicCas(acc, &pageTable.chunkSizes[myIndex], chunkSize, 0U); + } + alpaka::atomicSub(acc, &pageTable.fillingLevels[myIndex], +pageSize); + } + } + }; + +} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc diff --git a/src/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp b/src/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp new file mode 100644 index 00000000..c7596c07 --- /dev/null +++ b/src/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp @@ -0,0 +1,533 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz, Rene Widera + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include "mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp" +#include "mallocMC/mallocMC_utils.hpp" + +#include +#include + +#include + +#include +#include +#include +#include + +namespace mallocMC::CreationPolicies::FlatterScatterAlloc +{ + namespace detail + { + template + struct BitMaskStorageTypes + { + using type = void; + }; + + template<> + struct BitMaskStorageTypes<16U> + { + using type = uint16_t; + }; + + template<> + struct BitMaskStorageTypes<32U> + { + using type = uint32_t; + }; + + template<> + struct BitMaskStorageTypes<64U> + { + using type = uint64_t; + }; + } // namespace detail + + /** + * @brief Number of bits in a bit mask. Most likely you want a power of two here. + */ + constexpr uint32_t const BitMaskSize = 32U; + + /** + * @brief Type to store the bit masks in. It's implemented as a template in order to facilitate changing the type + * depending on BitMaskSize. Use it with its default template argument in order to make your code agnostic of the + * number configured in BitMaskSize. (Up to providing a template implementation, of course.) + */ + template + using BitMaskStorageType = detail::BitMaskStorageTypes::type; + + /** + * @brief Represents a completely filled bit mask, i.e., all bits are one. 
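+ *
+ * @note For instance, assuming the default BitMaskSize of 32 (and hence a uint32_t storage type):
+ * @code
+ * static_assert(allOnes<32U> == 0xFFFFFFFFU); // every one of the 32 bits is set
+ * @endcode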
+ */ + template + static constexpr BitMaskStorageType const allOnes = std::numeric_limits>::max(); + + /** + * @brief Return the bit mask's underlying type with a single bit set (=1) at position index and all others unset + * (=0). + * + * @param index Position of the single bit set. + * @return Bit mask's underlying type with one bit set. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto singleBit(uint32_t const index) -> BitMaskStorageType + { + return BitMaskStorageType{1U} << index; + } + + /** + * @brief Return the bit mask's underlying type with all bits up to index from the right are set (=1) and all + * higher bits are unset (=0). + * + * @param index Number of set bits. + * @return Bit mask's underlying type with index bits set. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto allOnesUpTo(uint32_t const index) -> BitMaskStorageType + { + return index == 0 ? 0 : (allOnes >> (size - index)); + } + + /** + * @class BitMaskImpl + * @brief Represents a bit mask basically wrapping the BitMaskStorageType<>. + * + * This class basically provides a convenience interface to the (typically integer) type BitMaskStorageType<> for + * bit manipulations. It was originally modelled closely after std::bitset which is not necessarily available on + * device for all compilers, etc. + * + * Convention: We start counting from the right, i.e., if mask[0] == 1 and all others are 0, then mask = 0...01 + * + * CAUTION: This convention is nowhere checked and we might have an implicit assumption on the endianess here. We + * never investigated because all architectures we're interested in have the same endianess and it works on them. + * + */ + template + struct BitMaskImpl + { + BitMaskStorageType mask{}; + + /** + * @return An invalid bit index indicating the failure of a search in the bit mask. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto noFreeBitFound() -> uint32_t + { + return MyBitMaskSize; + } + + /** + * @brief Look up if the index-th bit is set. + * + * @param index Bit position to check. + * @return true if bit is set else false. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto const index) -> bool + { + return (atomicLoad(acc, mask) & singleBit(index)) != BitMaskStorageType{0U}; + } + + /** + * @brief Set all bits (to 1). + * + * @return Previous mask. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto set(TAcc const& acc) -> BitMaskStorageType + { + return alpaka::atomicOr( + acc, + &mask, + static_cast>(+allOnes)); + } + + /** + * @brief Set the index-th bit (to 1). + * + * @param index Bit position to set. + * @return Previous mask. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto set(TAcc const& acc, auto const index) + { + return alpaka::atomicOr(acc, &mask, singleBit(index)); + } + + /** + * @brief Unset the index-th bit (set it to 0). + * + * @param index Bit position to unset. + * @return Previous mask. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto unset(TAcc const& acc, auto const index) + { + return alpaka::atomicAnd( + acc, + &mask, + static_cast>( + allOnes ^ singleBit(index))); + } + + /** + * @brief Flip all bits in the mask. + * + * @return Previous mask. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto flip(TAcc const& acc) + { + return alpaka::atomicXor( + acc, + &mask, + static_cast>(+allOnes)); + } + + /** + * @brief Flip the index-th bits in the mask. + * + * @param index Bit position to flip. + * @return Previous mask. 
+ */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto flip(TAcc const& acc, auto const index) + { + return alpaka::atomicXor( + acc, + &mask, + static_cast>(singleBit(index))); + } + + /** + * @brief Compare with another mask represented by a BitMaskStorageType<>. CAUTION: This does not use atomics + * and is not thread-safe! + * + * This is not implemented thread-safe because to do so we'd need to add the accelerator as a function argument + * and that would not abide by the interface for operator==. It's intended use is to make (single-threaded) + * tests more readable, so that's not an issue. + * + * @param other Mask to compare with. + * @return true if all bits are identical else false. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto operator==(BitMaskStorageType const other) const -> bool + { + return (mask == other); + } + + /** + * @brief Spaceship operator comparing with other bit masks. CAUTION: This does not use atomics and is not + * thread-safe! See operator== for an explanation. + * + * @param other Bit mask to compare with. + * @return Positive if this mask > other mask, 0 for equality, negative otherwise. + */ + // My version of clang cannot yet handle the spaceship operator apparently: + // clang-format off + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto operator<=> (BitMaskImpl const other) const + // clang-format on + { + return (mask - other.mask); + } + + /** + * @brief Check if no bit is set (=1). + * + * @return true if no bit is set else false. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto none() const -> bool + { + return mask == 0U; + } + + /** + * @brief Interface to the main algorithm of finding a free bit. + * + * This algorithm searches for an unset bit and returns its position as an index (which is supposed to be + * translated into a corresponding chunk by the PageInterpretation). Upon doing so, it also sets this bit + * because in a multi-threaded context we cannot separate the concerns of retrieving information and acting on + * the information. It takes a start index that acts as an initial guess but (in the current implementation) it + * does not implement a strict wrapping loop as the other stages do because this would waste valuable + * information obtained from the collective operation on all bits in the mask. + * + * Additionally, it copes with partial masks by ignoring all bit positions beyond numValidBits. + * + * @param numValidBits Bit positions beyond this number will be ignored. + * @param initialGuess Initial guess for the first look up. + * @return Bit position of a free bit or noFreeBitFound() in the case of none found. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto firstFreeBit( + TAcc const& acc, + uint32_t const numValidBits = MyBitMaskSize, + uint32_t const initialGuess = 0) -> uint32_t + { + return firstFreeBitWithInitialGuess(acc, initialGuess % MyBitMaskSize, numValidBits); + } + + private: + /** + * @brief Implementation of the main search algorithm. See the public firstFreeBit method for general details. + * This version assumes a valid range of the input values. + * + * @param initialGuess Initial guess for the first look up must be in the range [0;MyBitMaskSize). + * @param endIndex Maximal position to consider. Bits further out will be ignored. + * @return Bit position of a free bit or noFreeBitFound() in the case of none found. 
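+ *
+ * @note One probe step in numbers (assuming BitMaskSize = 32): if the search starts at bit i = 0 while the mask
+ * already reads 0b0111, the atomicOr returns oldMask = 0b0111, so bit 0 was taken by another thread;
+ * ffs(~oldMask) = 4 then moves the search directly to i = 3, skipping the occupied bits 1 and 2 in one step.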
+ */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto firstFreeBitWithInitialGuess( + TAcc const& acc, + uint32_t const initialGuess, + uint32_t const endIndex) -> uint32_t + { + auto result = noFreeBitFound(); + BitMaskStorageType oldMask = 0U; + + // This avoids a modulo that's not a power of two and is faster thereby. + auto const selectedStartBit = initialGuess >= endIndex ? 0U : initialGuess; + for(uint32_t i = selectedStartBit; i < endIndex and result == noFreeBitFound();) + { + oldMask = alpaka::atomicOr(acc, &mask, singleBit(i)); + if((oldMask & singleBit(i)) == 0U) + { + result = i; + } + + // In case of no free bit found, this will return -1. Storing it in a uint32_t will underflow and + // result in 0xffffffff but that's okay because it also ends the loop as intended. + i = alpaka::ffs(acc, static_cast>>(~oldMask)) - 1; + } + + return result; + } + }; + + using BitMask = BitMaskImpl; + + /** + * @class BitFieldFlat + * @brief Represents a (non-owning) bit field consisting of multiple bit masks. + * + * This class interprets a piece of memory as an array of bit masks and provides convenience functionality to act + * on them as a long array of bits. Most importantly, it provides an interface to find a free bit. It is a + * non-owning view of the memory! + * + * Please note, that methods usually (unless stated otherwise) refer to bits counting all bits from the start of + * the bit field, so if BitMask size is 32 and index=34=31+3, we're checking for the third bit of the second mask + * (if masks was a matrix this would be equivalent to: masks[1][2]). + * + */ + template + struct BitFieldFlatImpl + { + std::span> data; + + /** + * @brief Check if the index-th bit in the bit field is set (=1). + * + * @param index Bit position to check. + * @return true if bit is set else false. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto get(TAcc const& acc, uint32_t index) const -> bool + { + return data[index / MyBitMaskSize](acc, index % MyBitMaskSize); + } + + /** + * @brief Get the index-th mask NOT bit (counting in number of masks and not bits). + * + * @param index Position of the mask. + * @return Requested mask. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto getMask(uint32_t const index) const -> BitMaskImpl& + { + return data[index]; + } + + /** + * @brief Set the index-th bit (to 1). + * + * @param index Position of the bit. + * @return + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC void set(TAcc const& acc, uint32_t const index) const + { + data[index / MyBitMaskSize].set(acc, index % MyBitMaskSize); + } + + /** + * @brief Counterpart to set, unsetting (to 0) to index-th bit. + * + * @tparam TAcc + * @param acc + * @param index + * @return + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC void unset(TAcc const& acc, uint32_t const index) const + { + data[index / MyBitMaskSize].unset(acc, index % MyBitMaskSize); + } + + /** + * @return Begin iterator to the start of the array of masks, iterating over masks NOT bits. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto begin() const + { + return std::begin(data); + } + + /** + * @return End iterator to the start of the array of masks, iterating over masks NOT bits. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto end() const + { + return std::end(data); + } + + /** + * @brief Count the number of masks. + * + * @return Number of masks. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto numMasks() const + { + return data.size(); + } + + /** + * @brief Count the number of bits in the array of masks. 
+ * + * This does not take into account if bits are valid or not, so this is always a multiple of the MyBitMaskSize + * currently. + * + * @return Number of bits. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto numBits() const + { + return numMasks() * MyBitMaskSize; + } + + /** + * @brief Main algorithm for finding and setting a free bit in the bit field. + * + * This iterates through the masks wrapping around from the given startIndex. The information of how many bits + * are valid is passed through the lower levels which automatically discard out of range results (accounting of + * partially filled masks). As always, we can't separate the concerns of retrieving information and acting on + * it in a multi-threaded context, so if a free bit is found it is immediately set. + * + * @param numValidBits Number of valid bits in the bit field (NOT masks, i.e. it's equal to numChunks() on the + * page). Should typically be a number from the range [MyBitMaskSize * (numMasks()-1) + 1, MyBitMaskSize * + * numMasks()) although other numbers shouldn't hurt. + * @param startIndex Bit mask (NOT bit) to start the search at. + * @return The index of the free bit found (and set) or noFreeBitFound() if none was found. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto firstFreeBit( + TAcc const& acc, + uint32_t numValidBits, + uint32_t const startIndex = 0U) -> uint32_t + { + return wrappingLoop( + acc, + startIndex % numMasks(), + numMasks(), + noFreeBitFound(), + [this, numValidBits](TAcc const& localAcc, auto const index) + { + auto tmp = this->firstFreeBitAt(localAcc, numValidBits, index); + return tmp; + }); + } + + /** + * @return Special invalid bit index to indicate that no free bit was found. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto noFreeBitFound() const -> uint32_t + { + return numBits(); + } + + private: + /** + * @return Position inside of a mask to start the search at. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto startBitIndex() + { + return laneid(); + } + + /** + * @brief Helper function checking if we're in the last mask. + * + * @param numValidBits Number of valid bits in the bit field. The mask containing this bit is the last mask. + * @param maskIndex Index of the mask under consideration (NOT bit). + * @return true if the mask is the last mask else false. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto isThisLastMask( + uint32_t const numValidBits, + uint32_t const maskIndex) + { + // >= in case index == numValidBits - MyBitMaskSize + return (maskIndex + 1) * MyBitMaskSize >= numValidBits; + } + + /** + * @brief Implementation of the main algorithm asking a mask of a free bit and checking if the answer is valid. + * + * @param numValidBits Number of valid bits in the bit field. + * @param maskIdx Index of the maks under consideration. + * @return Index of the free bit found IN THE BITFIELD (not only in the mask, so this value can be larger than + * MyBitMaskSize) or noFreeBitFound() if none was found. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto firstFreeBitAt( + TAcc const& acc, + uint32_t const numValidBits, + uint32_t const maskIdx) -> uint32_t + { + auto numValidBitsInLastMask = (numValidBits ? ((numValidBits - 1U) % MyBitMaskSize + 1U) : 0U); + auto indexInMask = getMask(maskIdx).firstFreeBit( + acc, + isThisLastMask(numValidBits, maskIdx) ? 
numValidBitsInLastMask : MyBitMaskSize, + startBitIndex()); + if(indexInMask < BitMaskImpl::noFreeBitFound()) + { + uint32_t freeBitIndex = indexInMask + MyBitMaskSize * maskIdx; + if(freeBitIndex < numValidBits) + { + return freeBitIndex; + } + } + return noFreeBitFound(); + } + }; + + using BitFieldFlat = BitFieldFlatImpl; +} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc diff --git a/src/include/mallocMC/creationPolicies/FlatterScatter/DataPage.hpp b/src/include/mallocMC/creationPolicies/FlatterScatter/DataPage.hpp new file mode 100644 index 00000000..9f20c7d0 --- /dev/null +++ b/src/include/mallocMC/creationPolicies/FlatterScatter/DataPage.hpp @@ -0,0 +1,42 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include + +namespace mallocMC::CreationPolicies::FlatterScatterAlloc +{ + /** + * @class DataPage + * @brief Raw piece of memory of size T_pageSize + */ + template + struct DataPage + { + char data[T_pageSize]{}; + }; +} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc diff --git a/src/include/mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp b/src/include/mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp new file mode 100644 index 00000000..3f0bf82c --- /dev/null +++ b/src/include/mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp @@ -0,0 +1,343 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz, Rene Widera + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include "mallocMC/creationPolicies/FlatterScatter/BitField.hpp" +#include "mallocMC/creationPolicies/FlatterScatter/DataPage.hpp" +#include "mallocMC/mallocMC_utils.hpp" + +#include + +#include +#include +#include + +namespace mallocMC::CreationPolicies::FlatterScatterAlloc +{ + /** + * @class PageInterpretation + * @brief Represent our interpretation of a raw data page. + * + * This class takes a reference to a raw data page and a chunk size and provides an interface to this raw memory to + * use is as a data page filled with chunks and corresponding bit masks indicating their filling. It furthermore + * provides static helper functions that implement formulae not tied to a particular piece of memory like the + * number of chunks given a chunk sizes (and the implicit page size). + * + * @param data Raw data page reference. + * @param chunkSize Chunk sizes to interpret this memory with. + */ + template + struct PageInterpretation + { + private: + DataPage& data; + uint32_t const chunkSize; + + public: + ALPAKA_FN_INLINE ALPAKA_FN_ACC PageInterpretation(DataPage& givenData, uint32_t givenChunkSize) + : data(givenData) + , chunkSize(givenChunkSize) + { + } + + /** + * @brief Compute the number of chunks of the given size that would fit onto a page. + * + * This is not quite a trivial calculation because we have to take into account the size of the bit field at + * the end which itself depends on the number of chunks. Due to the quantisation into fixed-size bit masks we + * are in the realm of integer divisions and remainders here. + * + * This is a static version of the algorithm because there's no reference to the data at all. Convenience + * version of that uses the chunk size of an instance is provided below. + * + * @param chunkSize The chunk size to use for the calculation. + * @return Number of chunks that would fit on a page. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto numChunks(uint32_t const chunkSize) -> uint32_t + { + constexpr auto b = static_cast>(sizeof(BitMask)); + auto const numFull = T_pageSize / (BitMaskSize * chunkSize + b); + auto const leftOverSpace = T_pageSize - numFull * (BitMaskSize * chunkSize + b); + auto const numInRemainder = leftOverSpace > b ? (leftOverSpace - b) / chunkSize : 0U; + return numFull * BitMaskSize + numInRemainder; + } + + /** + * @brief Convenience method calling numChunks(chunkSize) with the currently set chunkSize. See there for + * details. + * + * @return Number of chunks that fit on this page. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto numChunks() const -> uint32_t + { + return numChunks(chunkSize); + } + + /** + * @brief Convert a chunk index into a pointer to that piece of memory. + * + * @param index Chunk index < numChunks(). + * @return Pointer to that chunk. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto chunkPointer(uint32_t index) const -> void* + { + return reinterpret_cast(&data.data[index * chunkSize]); + } + + /** + * @brief Lightweight mangling of the hash into a start point for searching in the bit field. + * + * It is important to stress that this returns an index of a bit mask, not an individual bit's index. 
So, if + * the BitMaskSize is 32 and I have 64 chunks on the page, there are two bit masks and the return value is + * either 0 or 1, i.e. the search would start at the 0th or 32nd bit. + * + * @param hashValue Number providing some entropy for scattering memory accesses. + * @return Index of a bit mask to start searching at. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto startBitMaskIndex(uint32_t const hashValue) const + { + return (hashValue >> 16); + } + + /** + * @brief Main allocation algorithm searching a free bit in the bit mask and returning the corresponding + * pointer to a chunk. + * + * @param hashValue Number providing some entropy for scattering memory accesses. + * @return Pointer to a valid piece of memory or nullptr if none was found. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto create(TAcc const& acc, uint32_t const hashValue = 0U) -> void* + { + auto field = bitField(); + auto const index = field.firstFreeBit(acc, numChunks(), startBitMaskIndex(hashValue)); + return (index < field.noFreeBitFound()) ? chunkPointer(index) : nullptr; + } + + /** + * @brief Counterpart to create, freeing an allocated pointer's memory. + * + * In production, this does not check the validity of the pointer and providing an invalid pointer is undefined + * behaviour. This includes valid pointers to outside the range of this page, obviously. + * + * @param pointer Pointer to a piece of memory created from the create method. + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto destroy(TAcc const& acc, void* pointer) -> void + { + if(chunkSize == 0) + { +#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) + throw std::runtime_error{ + "Attempted to destroy a pointer with chunkSize==0. Likely this page was recently " + "(and potentially pre-maturely) freed."}; +#endif // NDEBUG + return; + } + auto chunkIndex = chunkNumberOf(pointer); +#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) + if(not isValid(acc, chunkIndex)) + { + throw std::runtime_error{"Attempted to destroy an invalid pointer! Either the pointer does not point " + "to a valid chunk or it is not marked as allocated."}; + } +#endif // NDEBUG + bitField().unset(acc, chunkIndex); + } + + /** + * @brief Convenience method to retrieve the configured minimal chunk size. + * + * @return Minimal possible chunk size of the page. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto minimalChunkSize() -> uint32_t + { + return T_minimalChunkSize; + } + + /** + * @brief Clean up the full bit field region. + * + * This method is supposed to be used on raw memory and cleans up the maximal possible bit field region without + * assuming anything about its previous content. It is supposed to be used during initialisation of raw memory + * and after leaving a page in multi-page mode when arbitrary data is potentially found in that region. There + * is a further optimised version of clean-up for cases where this page was in use in chunked mode before. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto cleanupFull() -> void + { + PageInterpretation(data, minimalChunkSize()).resetBitField(); + } + + /** + * @brief Clean up previously unused parts of the bit field region. + * + * This method is supposed to have the same effect as cleanupFull but only on pages that are already in use in + * chunked mode. Due to this additional assumption we can conclude that the part that currently acts as bit + * field is already nulled (because we're the last ones on the page about to clean up, so all bits are unset). 
+ * This significantly reduces the size of the region that needs cleaning if a small chunk size was set + * previously. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto cleanupUnused() -> void + { + auto worstCasePage = PageInterpretation(data, minimalChunkSize()); + memset( + static_cast(worstCasePage.bitFieldStart()), + 0U, + worstCasePage.bitFieldSize() - bitFieldSize()); + } + + /** + * @brief Reset the currently used bit field to 0. + * + * This was introduced to be called on pages interpreted with the minimal chunk size to fully clean up the bit + * field region. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto resetBitField() -> void + { + // This method is not thread-safe by itself. But it is supposed to be called after acquiring a "lock" in + // the form of setting the filling level, so that's fine. + + memset(static_cast(bitFieldStart()), 0U, bitFieldSize()); + } + + /** + * @brief Checks if a pointer points to an allocated chunk of memory on this page. + * + * This is not used in production and is not thread-safe in the sense that the information is stale as soon as + * it's returned. It is used in debug mode and can be used for (single-threaded) tests. + * + * @param pointer The pointer in question. + * @return true if the pointer points to an allocated chunk of memory, false otherwise + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isValid(TAcc const& acc, void* const pointer) const -> bool + { + // This function is neither thread-safe nor particularly performant. It is supposed to be used in tests and + // debug mode. + return isValid(acc, chunkNumberOf(pointer)); + } + + private: + /** + * @brief Helper method for isValid(pointer) that acts on the level of the chunk's index which translates to + * the bit field position easier than the pointer. + * + * @param chunkIndex Index to a chunk to check. + * @return true if the chunk with this index is allocated, false otherwise + */ + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isValid(TAcc const& acc, int32_t const chunkIndex) const -> bool + { + return chunkIndex >= 0 and chunkIndex < static_cast(numChunks()) and isAllocated(acc, chunkIndex); + } + + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto isAllocated(TAcc const& acc, uint32_t const chunkIndex) const -> bool + { + return bitField().get(acc, chunkIndex); + } + + public: + /** + * @brief Return the bit field of this page. + * + * @return Bit field of this page. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto bitField() const -> BitFieldFlat + { + return BitFieldFlat{{bitFieldStart(), ceilingDivision(numChunks(), BitMaskSize)}}; + } + + /** + * @brief Return a pointer to the first bit mask. + * + * @return Pointer to the first bit mask. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto bitFieldStart() const -> BitMask* + { + return reinterpret_cast(&data.data[T_pageSize - bitFieldSize()]); + } + + /** + * @brief Convenience method to compute the bit field size of the current page. Forwards to its static version. + * See there for details. + * + * @return Size of this pages bit field in number of bytes. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto bitFieldSize() const -> uint32_t + { + return bitFieldSize(chunkSize); + } + + /** + * @brief Compute the size of the bit field region in number of bytes for a page with the given chunk size. + * + * There is an instance method using the instance's chunk size for convenience. + * + * @param chunkSize Chunk size of the would-be page. + * @return Size of this pages bit field in number of bytes. 
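+         *
+         * As a concrete illustration (assuming BitMaskSize == 32 and sizeof(BitMask) == 4, which may differ in an
+         * actual configuration): for T_pageSize == 1024 and chunkSize == 32, numChunks(chunkSize) yields 31, so a
+         * single bit mask suffices and bitFieldSize(chunkSize) == 4 bytes.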
+ */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto bitFieldSize(uint32_t const chunkSize) -> uint32_t + { + return sizeof(BitMask) * ceilingDivision(numChunks(chunkSize), BitMaskSize); + } + + /** + * @brief Commpute the maximal possible size of the bit field in number of bytes. + * + * This is practically the bit field size of an instance with the minimaalChunkSize(). + * + * @return Maximal possible size of the bit field in number of bytes. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto maxBitFieldSize() -> uint32_t + { + return PageInterpretation::bitFieldSize(minimalChunkSize()); + } + + /** + * @brief Compute a chunk index given a pointer. + * + * Please note that this will return invalid indices for invalid input pointers. Be sure to guard against this + * if you don't want to risk messing up your memory. + * + * @param pointer A pointer interpreted to be pointing to a chunk of the current page. + * @return A valid index to a chunk on this page if the pointer was valid. A potentially negative number + * outside the valid range of chunk indices otherwise. + */ + ALPAKA_FN_INLINE ALPAKA_FN_ACC auto chunkNumberOf(void* pointer) const -> int32_t + { + return indexOf(pointer, &data, chunkSize); + } + + // these are supposed to be temporary objects, don't start messing around with them: + PageInterpretation(PageInterpretation const&) = delete; + PageInterpretation(PageInterpretation&&) = delete; + auto operator=(PageInterpretation const&) -> PageInterpretation& = delete; + auto operator=(PageInterpretation&&) -> PageInterpretation& = delete; + ~PageInterpretation() = default; + }; +} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc diff --git a/src/include/mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp b/src/include/mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp new file mode 100644 index 00000000..d040bc12 --- /dev/null +++ b/src/include/mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp @@ -0,0 +1,73 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz, Rene Widera + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + + +#include + +#include + +/** + * @brief Abstraction of a short-circuiting loop that wraps around from an arbitrary starting point within the range. 
+ * + * This implements a re-occuring pattern in the code: Due to the scattering approach taken, we're often in a position + * where we want to run a simple loop except for the fact that we start in an arbitrary position within the range and + * complete it by wrapping around to the start of the range continuing from there. Furthermore, these loops are all + * searches, so it's advantageous to implement short-circuiting by early exit in case of finding another value than the + * provided failureValue. + * + * @tparam T_size Type of size-like arguments. This function is used in various contexts where this can either be + * size_t or uint32_t. + * @tparam TFunctor Type of the function representing the loop body (typically a lambda function). + * @tparam TArgs Types of additional arguments provided to the function. + * @param startIndex Index to start the loop at. + * @param size Size of the range which equals the number of iterations to be performed in total. + * @param failureValue Return value of the function indicating a failure of the current iteration and triggering the + * next iteration. + * @param func Function of type TFunctor representing the loop body. It is supposed to return a value of + * decltype(failureValue) and indicate failure by returning the latter. Any other value is interpreted as success + * triggering early exit of the loop. + * @param args Additional arguments to be provided to the function on each iteration. + * @return The return value of func which might be failureValue in case all iterations failed. + */ +template +ALPAKA_FN_INLINE ALPAKA_FN_ACC auto wrappingLoop( + TAcc const& acc, + T_size const startIndex, + T_size const size, + auto failureValue, + TFunctor func, + TArgs... args) +{ + for(uint32_t i = 0; i < size; ++i) + { + auto result = func(acc, (i + startIndex) % size, args...); + if(result != failureValue) + { + return result; + } + } + return failureValue; +} diff --git a/src/include/mallocMC/mallocMC.hpp b/src/include/mallocMC/mallocMC.hpp index e511f848..92469394 100644 --- a/src/include/mallocMC/mallocMC.hpp +++ b/src/include/mallocMC/mallocMC.hpp @@ -45,6 +45,7 @@ // all the policies #include "alignmentPolicies/Noop.hpp" #include "alignmentPolicies/Shrink.hpp" +#include "creationPolicies/FlatterScatter.hpp" #include "creationPolicies/OldMalloc.hpp" #include "creationPolicies/Scatter.hpp" #include "distributionPolicies/Noop.hpp" From 4d0b90b6fc610bdc26521fcea722d3acdc0c616d Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Tue, 12 Nov 2024 14:47:59 +0100 Subject: [PATCH 08/16] Update LICENSE, README and Usage --- LICENSE | 63 +++++++++++++++++++++++++++++-------------------------- README.md | 44 +++++++++++++++++++------------------- Usage.md | 26 ++++++++++++++++------- 3 files changed, 72 insertions(+), 61 deletions(-) diff --git a/LICENSE b/LICENSE index 400d0261..7c7870ae 100644 --- a/LICENSE +++ b/LICENSE @@ -1,37 +1,40 @@ -mallocMC: Memory Allocation for Many Core Architectures +/* + mallocMC: Memory Allocation for Many Core Architectures - based on the work of ScatterAlloc: - Massively Parallel Dynamic Memory Allocation for the GPU + based on the work of ScatterAlloc: + Massively Parallel Dynamic Memory Allocation for the GPU -http://www.icg.tugraz.at/project/mvp -https://www.hzdr.de/crp + http://www.icg.tugraz.at/project/mvp + https://www.hzdr.de/crp -Copyright (C) 2012 Institute for Computer Graphics and Vision, - Graz University of Technology -Copyright (C) 2014-2015 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf + 
Copyright (C) 2012 Institute for Computer Graphics and Vision, + Graz University of Technology + Copyright (C) 2014-2024 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf -Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at - Bernhard Kainz - kainz ( at ) icg.tugraz.at - Michael Kenzel - kenzel ( at ) icg.tugraz.at - Rene Widera - r.widera ( at ) hzdr.de - Axel Huebl - a.huebl ( at ) hzdr.de - Carlchristian Eckert - c.eckert ( at ) hzdr.de + Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at + Bernhard Kainz - kainz ( at ) icg.tugraz.at + Michael Kenzel - kenzel ( at ) icg.tugraz.at + Rene Widera - r.widera ( at ) hzdr.de + Axel Huebl - a.huebl ( at ) hzdr.de + Carlchristian Eckert - c.eckert ( at ) hzdr.de + Julian Lenz - j.lenz ( at ) hzdr.de -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ diff --git a/README.md b/README.md index b53b87f5..b99fa52e 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,9 @@ mallocMC: *Memory Allocator for Many Core Architectures* This project provides a framework for **fast memory managers** on **many core accelerators**. It is based on [alpaka](https://github.com/alpaka-group/alpaka) -to run on many different accelerators and implements the *ScatterAlloc* algorithm. - +to run on many different accelerators and comes with multiple allocation +algorithms out-of-the-box. Custom ones can be added easily due to the +policy-based design. 
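+
+A minimal sketch of such a composition (the policy names are the ones
+documented in [Usage.md](Usage.md); the concrete choice below is purely
+illustrative):
+
+```c++
+using Allocator = mallocMC::Allocator<
+    Acc,                                           // an alpaka accelerator type, e.g. alpaka::ExampleDefaultAcc<Dim, Idx>
+    mallocMC::CreationPolicies::FlatterScatter<>,  // how the heap is organised into pages and chunks
+    mallocMC::DistributionPolicies::Noop,          // how concurrent requests are grouped (e.g. warp-wide)
+    mallocMC::OOMPolicies::ReturnNull,             // what happens when the heap is exhausted
+    mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>, // where the heap memory comes from
+    mallocMC::AlignmentPolicies::Shrink<>>;        // how allocations are aligned and padded
+```
+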
Usage ------- @@ -14,30 +15,31 @@ Usage Follow the step-by-step instructions in [Usage.md](Usage.md) to replace your `new`/`malloc` calls with a *blacingly fast* mallocMC heap! :rocket: - Install ------- mallocMC is header-only, but requires a few other C++ libraries to be available. Our installation notes can be found in [INSTALL.md](INSTALL.md). - Contributing ------------ -Rules for contributions are found in [CONTRIBUTING.md](CONTRIBUTING.md). +Rules for contributions are found in [CONTRIBUTING.md](./CONTRIBUTING.md). -On the ScatterAlloc Algorithm +On the Algorithms ----------------------------- -This library implements the *ScatterAlloc* algorithm, originally +This library was originally inspired by the *ScatterAlloc* algorithm, [forked](https://en.wikipedia.org/wiki/Fork_%28software_development%29) from the **ScatterAlloc** project, developed by the [Managed Volume Processing](http://www.icg.tugraz.at/project/mvp) group at [Institute for Computer Graphics and Vision](http://www.icg.tugraz.at), -TU Graz (kudos!). +TU Graz (kudos!). The currently shipped algorithms are using similar ideas but +differ from the original one significantly. + +From the original project page (which is no longer existent to the best of our +knowledge): -From http://www.icg.tugraz.at/project/mvp/downloads : ```quote ScatterAlloc is a dynamic memory allocator for the GPU. It is designed concerning the requirements of massively parallel @@ -51,21 +53,18 @@ execution time is almost independent of the thread count. ScatterAlloc is open source and easy to use in your CUDA projects. ``` -Original Homepage: http://www.icg.tugraz.at/project/mvp - -Our Homepage: https://www.hzdr.de/crp - - -Branches --------- - -| *branch* | *state* | *description* | -| ----------- | ------- | ----------------------- | -| **master** | [![Build Status Master](https://travis-ci.org/alpaka-group/mallocMC.png?branch=master)](https://travis-ci.org/alpaka-group/mallocMC "master") | our latest stable release | -| **dev** | [![Build Status Development](https://travis-ci.org/alpaka-group/mallocMC.png?branch=dev)](https://travis-ci.org/alpaka-group/mallocMC "dev") | our development branch - start and merge new branches here | -| **tugraz** | n/a | *ScatterAlloc* "upstream" branch: not backwards compatible mirror for algorithmic changes | +Our Homepage: +Versions and Releases +--------------------- +Official releases can be found in the +[Github releases](https://github.com/alpaka-group/mallocMC/releases). +We try to stick to [semantic versioning](https://semver.org/) but we'll bump +the major version number for major features. +Development happens on the `dev` branch. +Changes there have passed the CI and a code review but we make no guarantees +about API or feature stability in this branch. Literature ---------- @@ -81,7 +80,6 @@ Just an incomplete link collection for now: - Junior Thesis [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.34461.svg)](http://dx.doi.org/10.5281/zenodo.34461) by Carlchristian Eckert (2014) - License ------- diff --git a/Usage.md b/Usage.md index 3f8049ea..45963ee0 100644 --- a/Usage.md +++ b/Usage.md @@ -13,21 +13,23 @@ There is one header file that will include *all* necessary files: Step 2a: choose policies ----------------------- -Each instance of a policy based allocator is composed through 5 **policies**. Each policy is expressed as a **policy class**. +Each instance of a policy based allocator is composed through 5 **policies**. +Each policy is expressed as a **policy class**. 
Currently, there are the following policy classes available: |Policy | Policy Classes (implementations) | description | |------- |----------------------------------| ----------- | -|**CreationPolicy** | Scatter`` | A scattered allocation to tradeoff fragmentation for allocation time, as proposed in [ScatterAlloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). `conf1` configures the heap layout, `conf2` determines the hashing parameters| -| | OldMalloc | device-side malloc/new and free/delete syscalls as implemented on NVidia CUDA graphics cards with compute capability sm_20 and higher | -|**DistributionPolicy** | XMallocSIMD`` | SIMD optimization for warp-wide allocation on NVIDIA CUDA accelerators, as proposed by [XMalloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=5577907). `conf` is used to determine the pagesize. If used in combination with *Scatter*, the pagesizes must match | +|**CreationPolicy** | Scatter`` | A scattered allocation to tradeoff fragmentation for allocation time, as proposed in [ScatterAlloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). `conf1` configures the heap layout, `conf2` determines the hashing parameters| +| | FlatterScatter`` | Another scattered allocation algorithm similar in spirit to `Scatter` but with a flatter hierarchy and stronger concurrency invariants. `conf1` and `conf2` act as before. +| | OldMalloc | Device-side malloc/new and free/delete syscalls as implemented on the given device. +|**DistributionPolicy** | XMallocSIMD`` | SIMD optimization for warp-wide allocation on NVIDIA CUDA accelerators, as proposed by [XMalloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=5577907). `conf` is used to determine the pagesize. If used in combination with *Scatter*, the pagesizes must match | | | Noop | no workload distribution at all | |**OOMPolicy** | ReturnNull | pointers will be *nullptr*, if the request could not be fulfilled | | | ~~BadAllocException~~ | will throw a `std::bad_alloc` exception. The accelerator has to support exceptions | -|**ReservePoolPolicy** | SimpleCudaMalloc | allocate a fixed heap with `CudaMalloc` | +|**ReservePoolPolicy** | AlpakaBuf | Allocate a fixed-size buffer in an `alpaka`-provided container. | | | CudaSetLimits | call to `CudaSetLimits` to increase the available Heap (e.g. when using *OldMalloc*) | -|**AlignmentPolicy** | Shrink`` | shrinks the pool so that the starting pointer is well aligned, applies padding to requested memory chunks. `conf` is used to determine the alignment| +|**AlignmentPolicy** | Shrink`` | shrinks the pool so that the starting pointer is well aligned, applies padding to requested memory chunks. `conf` is used to determine the alignment| | | Noop | no alignment at all | The user has to choose one of each policy that will form a useful allocator @@ -51,6 +53,7 @@ struct ShrinkConfig : mallocMC::AlignmentPolicies::Shrink<>::Properties { Step 2c: combine policies ------------------------- + After configuring the chosen policies, they can be used as template parameters to create the desired allocator type: @@ -86,7 +89,6 @@ Notice, how the policy classes `Scatter` and `XMallocSIMD` are instantiated with template arguments to use the default configuration. `Shrink` however uses the configuration struct defined above. 
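+
+The new `FlatterScatter` creation policy plugs into the very same scheme. A
+minimal sketch (`FlatterScatterHeapConfig` stands for a user-defined
+configuration struct analogous to the ones above; the remaining policies are
+unchanged):
+
+```c++
+using FlatterScatterAllocator = mallocMC::Allocator<
+    Acc,
+    mallocMC::CreationPolicies::FlatterScatter<FlatterScatterHeapConfig>,
+    mallocMC::DistributionPolicies::Noop,
+    mallocMC::OOMPolicies::ReturnNull,
+    mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
+    mallocMC::AlignmentPolicies::Shrink<ShrinkConfig>>;
+```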
- Step 3: instantiate allocator ----------------------------- @@ -100,8 +102,14 @@ The allocator object offers the following methods | Name | description | |---------------------- |-------------------------| +| getAllocatorHandle() | Acquire a handle from the allocator that can be used in kernels to allocate memory on device. | getAvailableSlots(size_t) | Determines number of allocatable slots of a certain size. This only works, if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits::providesAvailableSlots`) | +One should note that on a running system with multiple threads manipulating +memory the information provided by `getAvailableSlots` is stale the moment it's +acquired and so relying on this information to be accurate is not recommended. +It is supposed to be used in initialisation/finalisation phases without dynamic +memory allocations or in tests. Step 4: use dynamic memory allocation in a kernel ------------------------------------------------- @@ -114,9 +122,11 @@ The handle offers the following methods: |---------------------- |-------------------------| | malloc(size_t) | Allocates memory on the accelerator | | free(size_t) | Frees memory on the accelerator | -| getAvailableSlots() | Determines number of allocatable slots of a certain size. This only works, if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits::providesAvailableSlots`) | +| getAvailableSlots() | Determines number of allocatable slots of a certain size. This only works, if the chosen CreationPolicy supports it (can be found through `mallocMC::Traits::providesAvailableSlots`).| +The comments on `getAvailableSlots` from above hold all the same. A simplistic example would look like this: + ```c++ #include From 8485629cf21f29e4f3320cebf316da5bac401f1f Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Tue, 12 Nov 2024 14:49:18 +0100 Subject: [PATCH 09/16] Update examples --- examples/mallocMC_example01.cpp | 118 ++++++++++++++++++-------------- examples/mallocMC_example03.cpp | 87 +++++++++++++---------- 2 files changed, 119 insertions(+), 86 deletions(-) diff --git a/examples/mallocMC_example01.cpp b/examples/mallocMC_example01.cpp index c991f357..f49c1696 100644 --- a/examples/mallocMC_example01.cpp +++ b/examples/mallocMC_example01.cpp @@ -26,6 +26,9 @@ THE SOFTWARE. */ +#include "mallocMC/creationPolicies/FlatterScatter.hpp" +#include "mallocMC/creationPolicies/OldMalloc.hpp" + #include #include @@ -33,35 +36,38 @@ #include #include +#include #include #include +using mallocMC::CreationPolicies::FlatterScatter; +using mallocMC::CreationPolicies::OldMalloc; +using mallocMC::CreationPolicies::Scatter; + using Dim = alpaka::DimInt<1>; using Idx = std::size_t; // Define the device accelerator using Acc = alpaka::ExampleDefaultAcc; -struct ScatterHeapConfig -{ - static constexpr auto pagesize = 4096; - static constexpr auto accessblocksize = 512u * 1024u * 1024u; - static constexpr auto regionsize = 16; - static constexpr auto wastefactor = 2; - static constexpr auto resetfreedpages = false; -}; +constexpr uint32_t const blocksize = 2U * 1024U * 1024U; +constexpr uint32_t const pagesize = 4U * 1024U; +constexpr uint32_t const wasteFactor = 1U; -struct ScatterHashConfig +// This happens to also work for the original Scatter algorithm, so we only define one. 
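+// Deriving from FlatterScatter<>::Properties::HeapConfig means that any member not redefined below keeps its
+// default value; the static constexpr members below shadow the corresponding defaults.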
+struct FlatterScatterHeapConfig : FlatterScatter<>::Properties::HeapConfig { - static constexpr auto hashingK = 38183; - static constexpr auto hashingDistMP = 17497; - static constexpr auto hashingDistWP = 1; - static constexpr auto hashingDistWPRel = 1; + static constexpr auto accessblocksize = blocksize; + static constexpr auto pagesize = ::pagesize; + static constexpr auto heapsize = 2U * 1024U * 1024U * 1024U; + // Only used by original Scatter (but it doesn't hurt FlatterScatter to keep): + static constexpr auto regionsize = 16; + static constexpr auto wastefactor = wasteFactor; }; struct XMallocConfig { - static constexpr auto pagesize = ScatterHeapConfig::pagesize; + static constexpr auto pagesize = FlatterScatterHeapConfig::pagesize; }; struct ShrinkConfig @@ -69,20 +75,21 @@ struct ShrinkConfig static constexpr auto dataAlignment = 16; }; -using ScatterAllocator = mallocMC::Allocator< - Acc, - mallocMC::CreationPolicies::Scatter, - mallocMC::DistributionPolicies::Noop, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::AlpakaBuf, - mallocMC::AlignmentPolicies::Shrink>; - ALPAKA_STATIC_ACC_MEM_GLOBAL int** arA; ALPAKA_STATIC_ACC_MEM_GLOBAL int** arB; ALPAKA_STATIC_ACC_MEM_GLOBAL int** arC; -auto main() -> int +template +auto example01() -> int { + using Allocator = mallocMC::Allocator< + Acc, + T_CreationPolicy, + mallocMC::DistributionPolicies::Noop, + mallocMC::OOMPolicies::ReturnNull, + mallocMC::ReservePoolPolicies::AlpakaBuf, + mallocMC::AlignmentPolicies::Shrink>; + constexpr auto length = 100; auto const platform = alpaka::Platform{}; @@ -90,26 +97,27 @@ auto main() -> int auto queue = alpaka::Queue{dev}; auto const devProps = alpaka::getAccDevProps(dev); - unsigned const block = std::min(static_cast(32u), static_cast(devProps.m_blockThreadCountMax)); + unsigned const block = std::min(static_cast(32U), static_cast(devProps.m_blockThreadCountMax)); // round up - auto grid = (length + block - 1u) / block; + auto grid = (length + block - 1U) / block; assert(length <= block * grid); // necessary for used algorithm // init the heap std::cerr << "initHeap..."; - ScatterAllocator scatterAlloc(dev, queue, 1U * 1024U * 1024U * 1024U); // 1GB for device-side malloc + auto const heapSize = 2U * 1024U * 1024U * 1024U; + Allocator scatterAlloc(dev, queue, heapSize); // 1GB for device-side malloc std::cerr << "done\n"; - std::cout << ScatterAllocator::info("\n") << '\n'; + std::cout << Allocator::info("\n") << '\n'; // create arrays of arrays on the device { auto createArrayPointers - = [] ALPAKA_FN_ACC(Acc const& acc, int x, int y, ScatterAllocator::AllocatorHandle allocHandle) + = [] ALPAKA_FN_ACC(Acc const& acc, int x, int y, Allocator::AllocatorHandle allocHandle) { - arA = (int**) allocHandle.malloc(acc, sizeof(int*) * x * y); - arB = (int**) allocHandle.malloc(acc, sizeof(int*) * x * y); - arC = (int**) allocHandle.malloc(acc, sizeof(int*) * x * y); + arA = static_cast(allocHandle.malloc(acc, sizeof(int*) * x * y)); + arB = static_cast(allocHandle.malloc(acc, sizeof(int*) * x * y)); + arC = static_cast(allocHandle.malloc(acc, sizeof(int*) * x * y)); }; auto const workDiv = alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}; alpaka::enqueue( @@ -124,18 +132,18 @@ auto main() -> int // fill 2 of them all with ascending values { - auto fillArrays = [] ALPAKA_FN_ACC(Acc const& acc, int length, ScatterAllocator::AllocatorHandle allocHandle) + auto fillArrays = [] ALPAKA_FN_ACC(Acc const& acc, int localLength, Allocator::AllocatorHandle allocHandle) { auto const id = 
alpaka::getIdx(acc)[0]; - arA[id] = (int*) allocHandle.malloc(acc, length * sizeof(int)); - arB[id] = (int*) allocHandle.malloc(acc, length * sizeof(int)); - arC[id] = (int*) allocHandle.malloc(acc, length * sizeof(int)); + arA[id] = static_cast(allocHandle.malloc(acc, localLength * sizeof(int))); + arB[id] = static_cast(allocHandle.malloc(acc, localLength * sizeof(int))); + arC[id] = static_cast(allocHandle.malloc(acc, localLength * sizeof(int))); - for(int i = 0; i < length; ++i) + for(int i = 0; i < localLength; ++i) { - arA[id][i] = static_cast(id * length + i); - arB[id][i] = static_cast(id * length + i); + arA[id][i] = static_cast(id * localLength + i); + arB[id][i] = static_cast(id * localLength + i); } }; auto const workDiv = alpaka::WorkDivMembers{Idx{grid}, Idx{block}, Idx{1}}; @@ -149,15 +157,15 @@ auto main() -> int { auto sumsBufferAcc = alpaka::allocBuf(dev, Idx{block * grid}); - auto addArrays = [] ALPAKA_FN_ACC(Acc const& acc, int length, int* sums) + auto addArrays = [] ALPAKA_FN_ACC(Acc const& acc, int localLength, int* sums) { auto const id = alpaka::getIdx(acc)[0]; sums[id] = 0; - for(int i = 0; i < length; ++i) + for(int i = 0; i < localLength; ++i) { - arC[id][i] = arA[id][i] + arB[id][i]; - sums[id] += arC[id][i]; + arC[id][i] = arA[id][i] + arB[id][i]; + sums[id] += arC[id][i]; } }; auto const workDiv = alpaka::WorkDivMembers{Idx{grid}, Idx{block}, Idx{1}}; @@ -181,7 +189,7 @@ auto main() -> int auto const gaussian = n * (n - 1); std::cout << "The gaussian sum as comparison: " << gaussian << '\n'; - /*constexpr*/ if(mallocMC::Traits::providesAvailableSlots) + /*constexpr*/ if(mallocMC::Traits::providesAvailableSlots) { std::cout << "there are "; std::cout << scatterAlloc.getAvailableSlots(dev, queue, 1024U * 1024U); @@ -189,23 +197,23 @@ auto main() -> int } { - auto freeArrays = [] ALPAKA_FN_ACC(Acc const& acc, ScatterAllocator::AllocatorHandle allocHandle) + auto freeArrays = [] ALPAKA_FN_ACC(Acc const& acc, Allocator::AllocatorHandle allocHandle) { auto const id = alpaka::getIdx(acc)[0]; - allocHandle.free(acc, arA[id]); - allocHandle.free(acc, arB[id]); - allocHandle.free(acc, arC[id]); + allocHandle.free(acc, arA[id]); + allocHandle.free(acc, arB[id]); + allocHandle.free(acc, arC[id]); }; auto const workDiv = alpaka::WorkDivMembers{Idx{grid}, Idx{block}, Idx{1}}; alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, freeArrays, scatterAlloc.getAllocatorHandle())); } { - auto freeArrayPointers = [] ALPAKA_FN_ACC(Acc const& acc, ScatterAllocator::AllocatorHandle allocHandle) + auto freeArrayPointers = [] ALPAKA_FN_ACC(Acc const& acc, Allocator::AllocatorHandle allocHandle) { - allocHandle.free(acc, arA); - allocHandle.free(acc, arB); - allocHandle.free(acc, arC); + allocHandle.free(acc, arA); + allocHandle.free(acc, arB); + allocHandle.free(acc, arC); }; auto const workDiv = alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}; alpaka::enqueue( @@ -215,3 +223,11 @@ auto main() -> int return 0; } + +auto main(int /*argc*/, char* /*argv*/[]) -> int +{ + example01>(); + example01>(); + example01(); + return 0; +} diff --git a/examples/mallocMC_example03.cpp b/examples/mallocMC_example03.cpp index 6551e0d7..e9ce2d4b 100644 --- a/examples/mallocMC_example03.cpp +++ b/examples/mallocMC_example03.cpp @@ -26,16 +26,19 @@ THE SOFTWARE. 
*/ +#include "mallocMC/creationPolicies/OldMalloc.hpp" + #include #include #include #include -#include #include -#include -#include + +using mallocMC::CreationPolicies::FlatterScatter; +using mallocMC::CreationPolicies::OldMalloc; +using mallocMC::CreationPolicies::Scatter; using Dim = alpaka::DimInt<1>; using Idx = std::size_t; @@ -43,21 +46,19 @@ using Idx = std::size_t; // Define the device accelerator using Acc = alpaka::ExampleDefaultAcc; -struct ScatterConfig -{ - static constexpr auto pagesize = 4096; - static constexpr auto accessblocksize = 512u * 1024u * 1024u; - static constexpr auto regionsize = 16; - static constexpr auto wastefactor = 2; - static constexpr auto resetfreedpages = false; -}; +constexpr uint32_t const blocksize = 2U * 1024U * 1024U; +constexpr uint32_t const pagesize = 4U * 1024U; +constexpr uint32_t const wasteFactor = 1U; -struct ScatterHashParams +// This happens to also work for the original Scatter algorithm, so we only define one. +struct FlatterScatterHeapConfig : FlatterScatter<>::Properties::HeapConfig { - static constexpr auto hashingK = 38183; - static constexpr auto hashingDistMP = 17497; - static constexpr auto hashingDistWP = 1; - static constexpr auto hashingDistWPRel = 1; + static constexpr auto accessblocksize = blocksize; + static constexpr auto pagesize = ::pagesize; + static constexpr auto heapsize = 2U * 1024U * 1024U * 1024U; + // Only used by original Scatter (but it doesn't hurt FlatterScatter to keep): + static constexpr auto regionsize = 16; + static constexpr auto wastefactor = wasteFactor; }; struct AlignmentConfig @@ -65,55 +66,71 @@ struct AlignmentConfig static constexpr auto dataAlignment = 16; }; -using ScatterAllocator = mallocMC::Allocator< - Acc, - mallocMC::CreationPolicies::Scatter, - mallocMC::DistributionPolicies::Noop, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::AlpakaBuf, - mallocMC::AlignmentPolicies::Shrink>; - ALPAKA_STATIC_ACC_MEM_GLOBAL int* arA = nullptr; +template struct ExampleKernel { - ALPAKA_FN_ACC void operator()(Acc const& acc, ScatterAllocator::AllocatorHandle allocHandle) const + ALPAKA_FN_ACC void operator()(Acc const& acc, T_Allocator::AllocatorHandle allocHandle) const { auto const id = static_cast(alpaka::getIdx(acc)[0]); if(id == 0) - arA = (int*) allocHandle.malloc(acc, sizeof(int) * 32); + { + arA = static_cast(allocHandle.malloc(acc, sizeof(int) * 32U)); + } // wait the the malloc from thread zero is not changing the result for some threads alpaka::syncBlockThreads(acc); auto const slots = allocHandle.getAvailableSlots(acc, 1); - if(arA != nullptr) + if(arA != nullptr) { - arA[id] = id; - printf("id: %u array: %d slots %u\n", id, arA[id], slots); + arA[id] = id; + printf("id: %u array: %d slots %u\n", id, arA[id], slots); } else printf("error: device size allocation failed"); - // wait that all thread read from `arA` + // wait that all thread read from `arA` alpaka::syncBlockThreads(acc); if(id == 0) - allocHandle.free(acc, arA); + { + allocHandle.free(acc, arA); + } } }; -auto main() -> int +template +auto example03() -> int { + using Allocator = mallocMC::Allocator< + Acc, + T_CreationPolicy, + mallocMC::DistributionPolicies::Noop, + mallocMC::OOMPolicies::ReturnNull, + mallocMC::ReservePoolPolicies::AlpakaBuf, + mallocMC::AlignmentPolicies::Shrink>; + auto const platform = alpaka::Platform{}; auto const dev = alpaka::getDevByIdx(platform, 0); auto queue = alpaka::Queue{dev}; auto const devProps = alpaka::getAccDevProps(dev); - unsigned const block = 
std::min(static_cast(32u), static_cast(devProps.m_blockThreadCountMax)); + unsigned const block = std::min(static_cast(32U), static_cast(devProps.m_blockThreadCountMax)); - ScatterAllocator scatterAlloc(dev, queue, 1U * 1024U * 1024U * 1024U); // 1GB for device-side malloc + Allocator scatterAlloc(dev, queue, 2U * 1024U * 1024U * 1024U); // 2GB for device-side malloc auto const workDiv = alpaka::WorkDivMembers{Idx{1}, Idx{block}, Idx{1}}; - alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, ExampleKernel{}, scatterAlloc.getAllocatorHandle())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workDiv, ExampleKernel{}, scatterAlloc.getAllocatorHandle())); std::cout << "Slots from Host: " << scatterAlloc.getAvailableSlots(dev, queue, 1) << '\n'; return 0; } + +auto main(int /*argc*/, char* /*argv*/[]) -> int +{ + example03>(); + example03>(); + example03(); + return 0; +} From 8115fc93c68c2923fb1f3dfcd95a6fccc1ea53ca Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Tue, 12 Nov 2024 14:50:18 +0100 Subject: [PATCH 10/16] Update tests --- .github/workflows/ci.yml | 34 +- CMakeLists.txt | 39 +- tests/dimensions.cpp | 413 ------------- tests/main.cpp | 2 - tests/policies.cpp | 189 ------ tests/thread-safety/AccessBlock.cpp | 927 ++++++++++++++++++++++++++++ tests/thread-safety/BitField.cpp | 92 +++ tests/thread-safety/Scatter.cpp | 859 ++++++++++++++++++++++++++ tests/unit/AccessBlock.cpp | 532 ++++++++++++++++ tests/unit/BitField.cpp | 247 ++++++++ tests/unit/PageInterpretation.cpp | 316 ++++++++++ tests/unit/PageTable.cpp | 54 ++ tests/unit/mocks.hpp | 76 +++ tests/verify_heap.cpp | 734 ---------------------- tests/verify_heap_config.hpp | 80 --- 15 files changed, 3157 insertions(+), 1437 deletions(-) delete mode 100644 tests/dimensions.cpp delete mode 100644 tests/main.cpp delete mode 100644 tests/policies.cpp create mode 100644 tests/thread-safety/AccessBlock.cpp create mode 100644 tests/thread-safety/BitField.cpp create mode 100644 tests/thread-safety/Scatter.cpp create mode 100644 tests/unit/AccessBlock.cpp create mode 100644 tests/unit/BitField.cpp create mode 100644 tests/unit/PageInterpretation.cpp create mode 100644 tests/unit/PageTable.cpp create mode 100644 tests/unit/mocks.hpp delete mode 100644 tests/verify_heap.cpp delete mode 100644 tests/verify_heap_config.hpp diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c7a277f5..57046bed 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,10 +1,7 @@ -name: pre-commit -on: - pull_request: - push: - branches: [main, test-me-*] +name: Continuous Integration +on: [push, pull_request] jobs: - main: + pre-commit: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -14,3 +11,28 @@ jobs: - uses: pre-commit/action@v3.0.1 - uses: pre-commit-ci/lite-action@v1.0.2 if: always() + cpu-tests: + # This action only runs on various CPU backends. + # As such, this is not a fully-fletched production-like test. + # Hopefully, it will still save us from a few stupid mistakes. + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: sudo apt update && sudo apt install libboost-all-dev + - run: mkdir build_dir + - working-directory: build_dir + run: | + cmake .. 
\ + -DCMAKE_CXX_FLAGS="-std=c++20 -g" \ + -Dalpaka_CXX_STANDARD=20 \ + -DmallocMC_CATCH2_PROVIDER=extern \ + -Dalpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE:BOOL=ON \ + -Dalpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE:BOOL=ON \ + -Dalpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE:BOOL=ON \ + -Dalpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE:BOOL=ON + - working-directory: build_dir + run: make -j tests examples + - working-directory: build_dir + run: ./tests + - working-directory: build_dir + run: ./mallocMC_Example01 && ./mallocMC_Example03 diff --git a/CMakeLists.txt b/CMakeLists.txt index 592772f4..34d2926e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,12 +28,6 @@ endif() set(mallocMC_CATCH2_PROVIDER "intern" CACHE STRING "Select which Catch2 is used") set_property(CACHE mallocMC_CATCH2_PROVIDER PROPERTY STRINGS "intern;extern") mark_as_advanced(mallocMC_CATCH2_PROVIDER) -if(${mallocMC_CATCH2_PROVIDER} STREQUAL "intern") - add_library(Catch2::Catch2 INTERFACE IMPORTED) - target_include_directories(Catch2::Catch2 INTERFACE ${CMAKE_CURRENT_LIST_DIR}/thirdParty/catch2/include) -else() - find_package(Catch2 CONFIG REQUIRED) -endif() # for installation, just copy include folder to install folder install( @@ -64,13 +58,32 @@ alpaka_add_executable(mallocMC_Example03 EXCLUDE_FROM_ALL examples/mallocMC_exam target_include_directories(mallocMC_Example03 PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include) target_link_libraries(mallocMC_Example03 PUBLIC alpaka::alpaka warnings) -alpaka_add_executable(VerifyHeap EXCLUDE_FROM_ALL tests/verify_heap.cpp tests/verify_heap_config.hpp) -target_include_directories(VerifyHeap PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include) -target_link_libraries(VerifyHeap PUBLIC alpaka::alpaka warnings) +add_custom_target(examples DEPENDS mallocMC_Example01 mallocMC_Example03) -alpaka_add_executable(tests EXCLUDE_FROM_ALL tests/main.cpp tests/dimensions.cpp tests/policies.cpp) -target_include_directories(tests PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include) -target_link_libraries(tests PUBLIC alpaka::alpaka Catch2::Catch2 warnings) +if(${mallocMC_CATCH2_PROVIDER} STREQUAL "intern") + find_package(Catch2 3.6.0 REQUIRED) + include(Catch) +else() + # get Catch2 v3 and build it from source with the same C++ standard as the tests + Include(FetchContent) + FetchContent_Declare(Catch2 GIT_REPOSITORY https://github.com/catchorg/Catch2.git GIT_TAG v3.6.0) + FetchContent_MakeAvailable(Catch2) + target_compile_features(Catch2 PUBLIC cxx_std_17) + include(Catch) + # hide Catch2 cmake variables by default in cmake gui + get_cmake_property(variables VARIABLES) + foreach (var ${variables}) + if (var MATCHES "^CATCH_") + mark_as_advanced(${var}) + endif() + endforeach() +endif() -add_custom_target(examples DEPENDS mallocMC_Example01 mallocMC_Example03 VerifyHeap) +file(GLOB_RECURSE testSources "${CMAKE_CURRENT_SOURCE_DIR}/tests/*/*.cpp") +alpaka_add_executable(tests EXCLUDE_FROM_ALL ${testSources}) +catch_discover_tests(tests) +source_group(TREE "${CMAKE_CURRENT_LIST_DIR}/tests" FILES ${testSources}) +target_compile_features(tests PRIVATE cxx_std_17) +target_include_directories(tests PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include) +target_link_libraries(tests PRIVATE alpaka::alpaka Catch2::Catch2WithMain) diff --git a/tests/dimensions.cpp b/tests/dimensions.cpp deleted file mode 100644 index a39ff448..00000000 --- a/tests/dimensions.cpp +++ /dev/null @@ -1,413 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. 
- - Copyright 2020 Helmholtz-Zentrum Dresden - Rossendorf, - CERN - - Author(s): Bernhard Manfred Gruber - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#include - -#include -#include - -using Idx = std::size_t; - -struct ScatterConfig -{ - static constexpr auto pagesize = 4096; - static constexpr auto accessblocksize = 256u * 1024u; - static constexpr auto regionsize = 16; - static constexpr auto wastefactor = 2; - static constexpr auto resetfreedpages = false; -}; - -struct ScatterHashParams -{ - static constexpr auto hashingK = 38183; - static constexpr auto hashingDistMP = 17497; - static constexpr auto hashingDistWP = 1; - static constexpr auto hashingDistWPRel = 1; -}; - -struct DistributionConfig -{ - static constexpr auto pagesize = ScatterConfig::pagesize; -}; - -struct AlignmentConfig -{ - static constexpr auto dataAlignment = 16; -}; - -ALPAKA_STATIC_ACC_MEM_GLOBAL int** deviceArray; - -template typename AccTemplate> -void test1D() -{ - using Dim = alpaka::DimInt<1>; - using Acc = AccTemplate; - - using ScatterAllocator = mallocMC::Allocator< - Acc, - mallocMC::CreationPolicies::Scatter, - // mallocMC::CreationPolicies::OldMalloc, - mallocMC::DistributionPolicies::Noop, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::AlpakaBuf, - // mallocMC::ReservePoolPolicies::CudaSetLimits, - mallocMC::AlignmentPolicies::Shrink>; - - auto const platform = alpaka::Platform{}; - auto const dev = alpaka::getDevByIdx(platform, 0); - auto queue = alpaka::Queue{dev}; - - constexpr auto N = 16; - static_assert(N <= mallocMC::maxThreadsPerBlock, ""); - - ScatterAllocator scatterAlloc(dev, queue, 1024U * 1024U); // 1 MiB - - // make 1 allocation from 1 thread for N * N pointers - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}, - [] ALPAKA_FN_ACC(Acc const& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) - { deviceArray = (int**) allocHandle.malloc(acc, sizeof(int*) * dim * dim); }, - N, - scatterAlloc.getAllocatorHandle())); - - // make N * N allocations from N block of N threads for ints - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - alpaka::WorkDivMembers{Idx{N}, Idx{N}, Idx{1}}, - [] ALPAKA_FN_ACC(Acc const& acc, typename ScatterAllocator::AllocatorHandle allocHandle) - { - auto const i = alpaka::getIdx(acc)[0]; - deviceArray[i] = (int*) allocHandle.malloc(acc, sizeof(int)); - }, - scatterAlloc.getAllocatorHandle())); - - auto const slots = scatterAlloc.getAvailableSlots(dev, queue, 
sizeof(int)); - auto const heapInfo = scatterAlloc.getHeapLocations().at(0); - std::cout << alpaka::trait::GetAccName::getAccName() << " slots: " << slots << " heap size: " << heapInfo.size - << '\n'; - - // free N * N allocations from N block of N threads for ints - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - alpaka::WorkDivMembers{Idx{N}, Idx{N}, Idx{1}}, - [] ALPAKA_FN_ACC(Acc const& acc, typename ScatterAllocator::AllocatorHandle allocHandle) - { - auto const i = alpaka::getIdx(acc)[0]; - allocHandle.free(acc, deviceArray[i]); - }, - scatterAlloc.getAllocatorHandle())); - - // free 1 allocation from 1 thread for N * N pointers - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}, - [] ALPAKA_FN_ACC(Acc const& acc, typename ScatterAllocator::AllocatorHandle allocHandle) - { allocHandle.free(acc, deviceArray); }, - scatterAlloc.getAllocatorHandle())); -} - -template typename AccTemplate> -void test2D() -{ - using Dim = alpaka::DimInt<2>; - using Acc = AccTemplate; - - using ScatterAllocator = mallocMC::Allocator< - Acc, - mallocMC::CreationPolicies::Scatter, - mallocMC::DistributionPolicies::XMallocSIMD, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::AlpakaBuf, - mallocMC::AlignmentPolicies::Shrink>; - - auto const platform = alpaka::Platform{}; - auto const dev = alpaka::getDevByIdx(platform, 0); - auto queue = alpaka::Queue{dev}; - - constexpr auto N = 8; - static_assert(N * N <= mallocMC::maxThreadsPerBlock, ""); - - ScatterAllocator scatterAlloc(dev, queue, 1024U * 1024U); // 1 MiB - - // make 1 allocation from 1 thread for N*N * N*N pointers - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - alpaka::WorkDivMembers{ - alpaka::Vec::all(1), - alpaka::Vec::all(1), - alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(Acc const& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) - { deviceArray = (int**) allocHandle.malloc(acc, sizeof(int*) * dim * dim * dim * dim); }, - N, - scatterAlloc.getAllocatorHandle())); - - // make N*N * N*N allocations from N*N block of N*N threads for ints - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - alpaka::WorkDivMembers{ - alpaka::Vec::all(N), - alpaka::Vec::all(N), - alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(Acc const& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) - { - auto const idx = alpaka::getIdx(acc); - deviceArray[idx[0] * dim * dim + idx[1]] = (int*) allocHandle.malloc(acc, sizeof(int)); - }, - N, - scatterAlloc.getAllocatorHandle())); - - auto const slots = scatterAlloc.getAvailableSlots(dev, queue, sizeof(int)); - auto const heapInfo = scatterAlloc.getHeapLocations().at(0); - std::cout << alpaka::trait::GetAccName::getAccName() << " slots: " << slots << " heap size: " << heapInfo.size - << '\n'; - - // free N*N * N*N allocations from N*N block of N*N threads for ints - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - alpaka::WorkDivMembers{ - alpaka::Vec::all(N), - alpaka::Vec::all(N), - alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(Acc const& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) - { - auto const idx = alpaka::getIdx(acc); - allocHandle.free(acc, deviceArray[idx[0] * dim * dim + idx[1]]); - }, - N, - scatterAlloc.getAllocatorHandle())); - - // free 1 allocation from 1 thread for N*N * N*N pointers - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - alpaka::WorkDivMembers{ - alpaka::Vec::all(1), - alpaka::Vec::all(1), - alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(Acc const& 
acc, typename ScatterAllocator::AllocatorHandle allocHandle) - { allocHandle.free(acc, deviceArray); }, - scatterAlloc.getAllocatorHandle())); -} - -template typename AccTemplate> -void test3D() -{ - using Dim = alpaka::DimInt<3>; - using Acc = AccTemplate; - - using ScatterAllocator = mallocMC::Allocator< - Acc, - mallocMC::CreationPolicies::Scatter, - mallocMC::DistributionPolicies::XMallocSIMD, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::AlpakaBuf, - mallocMC::AlignmentPolicies::Shrink>; - - auto const platform = alpaka::Platform{}; - auto const dev = alpaka::getDevByIdx(platform, 0); - auto queue = alpaka::Queue{dev}; - - constexpr auto N = 4; - static_assert(N * N * N <= mallocMC::maxThreadsPerBlock, ""); - - ScatterAllocator scatterAlloc(dev, queue, 1024U * 1024U); // 1 MiB - - // make 1 allocation from 1 thread for N*N*N * N*N*N pointers - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - alpaka::WorkDivMembers{ - alpaka::Vec::all(1), - alpaka::Vec::all(1), - alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(Acc const& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) - { deviceArray = (int**) allocHandle.malloc(acc, sizeof(int*) * dim * dim * dim * dim * dim * dim); }, - N, - scatterAlloc.getAllocatorHandle())); - - // make N*N*N * N*N*N allocations from N*N*N blocks of N*N*N threads for - // ints - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - alpaka::WorkDivMembers{ - alpaka::Vec::all(N), - alpaka::Vec::all(N), - alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(Acc const& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) - { - auto const idx = alpaka::getIdx(acc); - deviceArray[idx[0] * dim * dim * dim * dim + idx[1] * dim * dim + idx[0]] - = (int*) allocHandle.malloc(acc, sizeof(int)); - }, - N, - scatterAlloc.getAllocatorHandle())); - - auto const slots = scatterAlloc.getAvailableSlots(dev, queue, sizeof(int)); - auto const heapInfo = scatterAlloc.getHeapLocations().at(0); - std::cout << alpaka::trait::GetAccName::getAccName() << " slots: " << slots << " heap size: " << heapInfo.size - << '\n'; - - // free N*N*N * N*N*N allocations from N*N*N blocks of N*N*N threads for - // ints - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - alpaka::WorkDivMembers{ - alpaka::Vec::all(N), - alpaka::Vec::all(N), - alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(Acc const& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) - { - auto const idx = alpaka::getIdx(acc); - allocHandle.free(acc, deviceArray[idx[0] * dim * dim * dim * dim + idx[1] * dim * dim + idx[0]]); - }, - N, - scatterAlloc.getAllocatorHandle())); - - // free 1 allocation from 1 thread for N*N*N * N*N*N pointers - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - alpaka::WorkDivMembers{ - alpaka::Vec::all(1), - alpaka::Vec::all(1), - alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(Acc const& acc, typename ScatterAllocator::AllocatorHandle allocHandle) - { allocHandle.free(acc, deviceArray); }, - scatterAlloc.getAllocatorHandle())); -} - -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) -TEST_CASE("1D AccGpuCudaRt") -{ - test1D(); -} - -TEST_CASE("2D AccGpuCudaRt") -{ - test2D(); -} - -TEST_CASE("3D AccGpuCudaRt") -{ - test3D(); -} -#endif - -#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) -TEST_CASE("1D AccGpuHipRt") -{ - test1D(); -} - -TEST_CASE("2D AccGpuHipRt") -{ - test2D(); -} - -TEST_CASE("3D AccGpuHipRt") -{ - test3D(); -} -#endif - -#if defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) -TEST_CASE("1D AccCpuThreads") -{ - test1D(); -} - -TEST_CASE("2D 
AccCpuThreads") -{ - test2D(); -} - -TEST_CASE("3D AccCpuThreads") -{ - test3D(); -} -#endif - -#if defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED) -TEST_CASE("1D AccCpuOmp2Threads") -{ - test1D(); -} - -TEST_CASE("2D AccCpuOmp2Threads") -{ - test2D(); -} - -TEST_CASE("3D AccCpuOmp2Threads") -{ - test3D(); -} -#endif - -#if defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED) -TEST_CASE("1D AccCpuOmp2Blocks") -{ - test1D(); -} - -TEST_CASE("2D AccCpuOmp2Blocks") -{ - test2D(); -} - -TEST_CASE("3D AccCpuOmp2Blocks") -{ - test3D(); -} -#endif - -#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) -TEST_CASE("1D AccCpuSerial") -{ - test1D(); -} - -TEST_CASE("2D AccCpuSerial") -{ - test2D(); -} - -TEST_CASE("3D AccCpuSerial") -{ - test3D(); -} -#endif diff --git a/tests/main.cpp b/tests/main.cpp deleted file mode 100644 index 4ed06df1..00000000 --- a/tests/main.cpp +++ /dev/null @@ -1,2 +0,0 @@ -#define CATCH_CONFIG_MAIN -#include diff --git a/tests/policies.cpp b/tests/policies.cpp deleted file mode 100644 index 5e9c9cb3..00000000 --- a/tests/policies.cpp +++ /dev/null @@ -1,189 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - - Copyright 2020 Helmholtz-Zentrum Dresden - Rossendorf, - CERN - - Author(s): Bernhard Manfred Gruber - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
-*/ - -#include - -#include -#include - -using Idx = std::size_t; -using Dim = alpaka::DimInt<1>; -using Acc = alpaka::AccGpuCudaRt; - -struct ScatterConfig -{ - static constexpr auto pagesize = 4096; - static constexpr auto accessblocksize = 256U * 1024U; - static constexpr auto regionsize = 16; - static constexpr auto wastefactor = 2; - static constexpr auto resetfreedpages = false; -}; - -struct ScatterHashParams -{ - static constexpr auto hashingK = 38183; - static constexpr auto hashingDistMP = 17497; - static constexpr auto hashingDistWP = 1; - static constexpr auto hashingDistWPRel = 1; -}; - -struct DistributionConfig -{ - static constexpr auto pagesize = ScatterConfig::pagesize; -}; - -struct AlignmentConfig -{ - static constexpr auto dataAlignment = 16; -}; - -template -void run() -{ - auto const platform = alpaka::Platform{}; - auto const dev = alpaka::getDevByIdx(platform, 0); - - auto queue = alpaka::Queue{dev}; - - ScatterAllocator scatterAlloc(dev, queue, 1024U * 1024U); // 1 MiB - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}, - [] ALPAKA_FN_ACC(Acc const& acc, typename ScatterAllocator::AllocatorHandle allocHandle) - { - auto* ptr = allocHandle.malloc(acc, sizeof(int) * 1000); - allocHandle.free(acc, ptr); - }, - scatterAlloc.getAllocatorHandle())); -} - -TEST_CASE("Scatter XMallocSIMD ReturnNull AlpakaBuf Shrink") -{ - using ScatterAllocator = mallocMC::Allocator< - Acc, - mallocMC::CreationPolicies::Scatter, - mallocMC::DistributionPolicies::XMallocSIMD, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::AlpakaBuf, - mallocMC::AlignmentPolicies::Shrink>; - run(); -} - -TEST_CASE("Scatter XMallocSIMD ReturnNull AlpakaBuf Noop") -{ - using ScatterAllocator = mallocMC::Allocator< - Acc, - mallocMC::CreationPolicies::Scatter, - mallocMC::DistributionPolicies::XMallocSIMD, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::AlpakaBuf, - mallocMC::AlignmentPolicies::Noop>; - run(); -} - -TEST_CASE("Scatter Noop ReturnNull AlpakaBuf Shrink") -{ - using ScatterAllocator = mallocMC::Allocator< - Acc, - mallocMC::CreationPolicies::Scatter, - mallocMC::DistributionPolicies::Noop, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::AlpakaBuf, - mallocMC::AlignmentPolicies::Shrink>; - run(); -} - -TEST_CASE("Scatter Noop ReturnNull AlpakaBuf Noop") -{ - using ScatterAllocator = mallocMC::Allocator< - Acc, - mallocMC::CreationPolicies::Scatter, - mallocMC::DistributionPolicies::Noop, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::AlpakaBuf, - mallocMC::AlignmentPolicies::Noop>; - run(); -} - -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED -TEST_CASE("OldMalloc XMallocSIMD ReturnNull CudaSetLimits Shrink") -{ - using ScatterAllocator = mallocMC::Allocator< - Acc, - mallocMC::CreationPolicies::OldMalloc, - mallocMC::DistributionPolicies::XMallocSIMD, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::CudaSetLimits, - mallocMC::AlignmentPolicies::Shrink>; - run(); - - cudaDeviceReset(); -} - -TEST_CASE("OldMalloc XMallocSIMD ReturnNull CudaSetLimits Noop") -{ - using ScatterAllocator = mallocMC::Allocator< - Acc, - mallocMC::CreationPolicies::OldMalloc, - mallocMC::DistributionPolicies::XMallocSIMD, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::CudaSetLimits, - mallocMC::AlignmentPolicies::Noop>; - run(); - - cudaDeviceReset(); -} - -TEST_CASE("OldMalloc Noop ReturnNull CudaSetLimits Shrink") -{ - using ScatterAllocator = 
mallocMC::Allocator< - Acc, - mallocMC::CreationPolicies::OldMalloc, - mallocMC::DistributionPolicies::Noop, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::CudaSetLimits, - mallocMC::AlignmentPolicies::Shrink>; - run(); - - cudaDeviceReset(); -} - -TEST_CASE("OldMalloc Noop ReturnNull CudaSetLimits Noop") -{ - using ScatterAllocator = mallocMC::Allocator< - Acc, - mallocMC::CreationPolicies::OldMalloc, - mallocMC::DistributionPolicies::Noop, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::CudaSetLimits, - mallocMC::AlignmentPolicies::Noop>; - run(); - - cudaDeviceReset(); -} -#endif diff --git a/tests/thread-safety/AccessBlock.cpp b/tests/thread-safety/AccessBlock.cpp new file mode 100644 index 00000000..62811a44 --- /dev/null +++ b/tests/thread-safety/AccessBlock.cpp @@ -0,0 +1,927 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + + +#include "mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp" + +#include "../unit/mocks.hpp" +#include "mallocMC/mallocMC_utils.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock; + +using Dim = alpaka::DimInt<1>; +using Idx = std::uint32_t; + + +constexpr uint32_t pageSize = 1024; +constexpr uint32_t numPages = 4; +// Page table entry size = sizeof(chunkSize) + sizeof(fillingLevel): +constexpr uint32_t pteSize = 4 + 4; +constexpr uint32_t blockSize = numPages * (pageSize + pteSize); + +using MyAccessBlock = AccessBlock, AlignmentPolicy>; +using std::span; + +// Fill all pages of the given access block with occupied chunks of the given size. This is useful to test the +// behaviour near full filling but also to have a deterministic page and chunk where an allocation must happen +// regardless of the underlying access optimisations etc. 
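+// The helper functors below (FillWith, Create, CreateUntilSuccess, Destroy, IsValid) are alpaka kernels. A typical
+// launch (see fillWith() and customExec() further down) looks roughly like
+//   alpaka::exec<Acc>(queue, workDiv, FillWith{}, accessBlock, chunkSize, devPointers, numPointers);
+// where devPointers/numPointers stand for the device-side result array and its length.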
+ +struct FillWith +{ + template + ALPAKA_FN_ACC auto operator()( + TAcc const& acc, + AccessBlock, AlignmentPolicy>* accessBlock, + uint32_t const chunkSize, + void** result, + uint32_t const size) const -> void + { + std::generate( + result, + result + size, + [&acc, accessBlock, chunkSize]() + { + void* pointer{nullptr}; + while(pointer == nullptr) + { + pointer = accessBlock->create(acc, chunkSize); + } + return pointer; + }); + } +}; + +struct ContentGenerator +{ + uint32_t counter{0U}; + + ALPAKA_FN_ACC auto operator()() -> uint32_t + { + return counter++; + } +}; + +ALPAKA_FN_ACC auto forAll(auto const& acc, auto size, auto functor) +{ + auto const idx0 = alpaka::getIdx(acc)[0]; + auto const numElements = alpaka::getWorkDiv(acc)[0]; + for(uint32_t i = 0; i < numElements; ++i) + { + auto idx = idx0 + i; + if(idx < size) + { + functor(idx); + } + } +} + +struct Create +{ + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll(acc, pointers.size(), [&](auto idx) { pointers[idx] = accessBlock->create(acc, chunkSize); }); + } + + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto* chunkSizes) const + { + forAll(acc, pointers.size(), [&](auto idx) { pointers[idx] = accessBlock->create(acc, chunkSizes[idx]); }); + } +}; + +struct CreateUntilSuccess +{ + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + }); + } +}; + +struct Destroy +{ + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers) const + { + forAll(acc, pointers.size(), [&](auto idx) { accessBlock->destroy(acc, pointers[idx]); }); + } +}; + +struct IsValid +{ + template + ALPAKA_FN_ACC auto operator()( + TAcc const& acc, + auto* accessBlock, + void** pointers, + bool* results, + uint32_t const size) const + { + std::span tmpPointers(pointers, size); + std::span tmpResults(results, size); + std::transform( + std::begin(tmpPointers), + std::end(tmpPointers), + std::begin(tmpResults), + [&acc, accessBlock](auto pointer) { return accessBlock->isValid(acc, pointer); }); + } +}; + +using Host = alpaka::AccCpuSerial; + +template +struct Buffer +{ + TDevAcc m_devAcc; + TDevHost m_devHost; + + alpaka::Vec m_extents; + + alpaka::Buf m_onDevice; + alpaka::Buf m_onHost; + + Buffer(TDevHost const& devHost, TDevAcc const& devAcc, auto extents) + : m_devAcc{devAcc} + , m_devHost{devHost} + , m_extents{extents} + , m_onDevice(alpaka::allocBuf(devAcc, m_extents)) + , m_onHost(alpaka::allocBuf(devHost, m_extents)) + { + } +}; + +template +auto makeBuffer(TDevHost const& devHost, TDevAcc const& devAcc, auto extents) +{ + return Buffer{devHost, devAcc, extents}; +} + +auto createChunkSizes(auto const& devHost, auto const& devAcc, auto& queue) +{ + auto chunkSizes = makeBuffer(devHost, devAcc, 2U); + chunkSizes.m_onHost[0] = 32U; + chunkSizes.m_onHost[1] = 512U; + alpaka::memcpy(queue, chunkSizes.m_onDevice, chunkSizes.m_onHost); + return chunkSizes; +} + +auto createPointers(auto const& devHost, auto const& devAcc, auto& queue, uint32_t const size) +{ + auto pointers = makeBuffer(devHost, devAcc, size); + std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + std::fill(std::begin(tmp), std::end(tmp), reinterpret_cast(1U)); + alpaka::memcpy(queue, 
pointers.m_onDevice, pointers.m_onHost); + return pointers; +} + +template +auto setup() +{ + alpaka::Platform const platformAcc = {}; + alpaka::Platform> const platformHost = {}; + alpaka::Dev> const devAcc(alpaka::getDevByIdx(platformAcc, 0)); + alpaka::Dev> const devHost(alpaka::getDevByIdx(platformHost, 0)); + alpaka::Queue queue{devAcc}; + return std::make_tuple(platformAcc, platformHost, devAcc, devHost, queue); +} + +template +auto createWorkDiv(auto const& devAcc, auto const numElements, auto... args) -> alpaka::WorkDivMembers +{ + if constexpr(std::is_same_v, alpaka::TagCpuSerial>) + { + return {{1U}, {1U}, {numElements}}; + } + else + { + alpaka::KernelCfg const kernelCfg + = {numElements, 1, false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted}; + return alpaka::getValidWorkDiv(kernelCfg, devAcc, args...); + } +} + +template +auto fillWith(auto& queue, auto* accessBlock, auto const& chunkSize, auto& pointers) +{ + alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; + alpaka::exec( + queue, + workDivSingleThread, + FillWith{}, + accessBlock, + chunkSize, + alpaka::getPtrNative(pointers.m_onDevice), + pointers.m_extents[0]); + alpaka::wait(queue); + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); +} + +template +auto fillAllButOne(auto& queue, auto* accessBlock, auto const& chunkSize, auto& pointers) +{ + fillWith(queue, accessBlock, chunkSize, pointers); + auto* pointer1 = pointers.m_onHost[0]; + + // Destroy exactly one pointer (i.e. the first). This is non-destructive on the actual values in + // devPointers, so we don't need to wait for the copy before to finish. + alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; + alpaka::exec( + queue, + workDivSingleThread, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 1U)); + alpaka::wait(queue); + return pointer1; +} + +template +auto freeAllButOneOnFirstPage( + auto& queue, + AccessBlock, AlignmentPolicy>* accessBlock, + auto& pointers) +{ + std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + std::sort(std::begin(tmp), std::end(tmp)); + // This points to the first chunk of page 0. + auto* pointer1 = tmp[0]; + alpaka::wait(queue); + alpaka::memcpy(queue, pointers.m_onDevice, pointers.m_onHost); + alpaka::wait(queue); + auto size + = pointers.m_extents[0] / AccessBlock, AlignmentPolicy>::numPages() - 1; + // Delete all other chunks on page 0. 
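+ // Assuming the block was filled completely beforehand (see fillWith above), the sorted pointers' first
+ // extents[0] / numPages() entries all belong to page 0, so destroying `size` of them starting at index 1
+ // leaves pointer1 as the only remaining allocation on that page.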
+ customExec( + queue, + pointers.m_devAcc, + size, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice) + 1U, size)); + alpaka::wait(queue); + return pointer1; +} + +struct CheckContent +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* content, span pointers, auto* results, auto chunkSize) + const + { + auto const idx0 = alpaka::getIdx(acc)[0]; + auto const numElements = alpaka::getWorkDiv(acc)[0]; + for(uint32_t i = 0; i < numElements; ++i) + { + auto idx = idx0 + i; + if(idx < pointers.size()) + { + auto* begin = reinterpret_cast(pointers[idx]); + auto* end = begin + chunkSize / sizeof(uint32_t); + results[idx] = std::all_of(begin, end, [idx, content](auto val) { return val == content[idx]; }); + } + } + } +}; + +template +auto checkContent( + auto& devHost, + auto& devAcc, + auto& queue, + auto& pointers, + auto& content, + auto& workDiv, + auto const chunkSize) +{ + auto results = makeBuffer(devHost, devAcc, pointers.m_extents[0]); + alpaka::exec( + queue, + workDiv, + CheckContent{}, + alpaka::getPtrNative(content.m_onDevice), + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]), + alpaka::getPtrNative(results.m_onDevice), + chunkSize); + alpaka::wait(queue); + alpaka::memcpy(queue, results.m_onHost, results.m_onDevice); + alpaka::wait(queue); + + + std::span tmpResults(alpaka::getPtrNative(results.m_onHost), results.m_extents[0]); + auto writtenCorrectly = std::reduce(std::cbegin(tmpResults), std::cend(tmpResults), true, std::multiplies{}); + + return writtenCorrectly; +} + +struct GetAvailableSlots +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, auto chunkSize, auto* result) const + { + *result = accessBlock->getAvailableSlots(acc, chunkSize); + }; +}; + +template +auto getAvailableSlots(auto* accessBlock, auto& queue, auto const& devHost, auto const& devAcc, auto chunkSize) +{ + alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; + alpaka::wait(queue); + auto result = makeBuffer(devHost, devAcc, 1U); + alpaka::wait(queue); + alpaka::exec( + queue, + workDivSingleThread, + GetAvailableSlots{}, + accessBlock, + chunkSize, + alpaka::getPtrNative(result.m_onDevice)); + alpaka::wait(queue); + alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); + alpaka::wait(queue); + auto tmp = result.m_onHost[0]; + alpaka::wait(queue); + return tmp; +} + +template +auto pageIndex(AccessBlock, AlignmentPolicy>* accessBlock, auto* pointer) +{ + // This is a bit dirty: What we should do here is enqueue a kernel that calls accessBlock->pageIndex(). + // But we assume that the access block starts with the first page, so the pointer to the first page equals the + // pointer to the access block. Not sure if this is reliable if the pointers are device pointers. 
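+ // Under that assumption, the call below should boil down to roughly
+ //   (reinterpret_cast<char const*>(pointer) - reinterpret_cast<char const*>(accessBlock)) / T_pageSize
+ // i.e. the page-size-strided offset of the pointer from the start of the access block.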
+ return mallocMC::indexOf(pointer, accessBlock, T_pageSize); +} + +struct FillAllUpAndWriteToThem +{ + ALPAKA_FN_ACC auto operator()( + auto const& acc, + auto* accessBlock, + auto* content, + span pointers, + auto chunkSize) const + { + auto const idx0 = alpaka::getIdx(acc)[0]; + auto const numElements = alpaka::getWorkDiv(acc)[0]; + for(uint32_t i = 0; i < numElements; ++i) + { + auto idx = idx0 + i; + if(idx < pointers.size()) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + auto* begin = reinterpret_cast(pointers[idx]); + auto* end = begin + chunkSize / sizeof(uint32_t); + std::fill(begin, end, content[idx]); + } + } + } +}; + +struct CreateAndDestroMultipleTimes +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + pointers[idx] = nullptr; + for(uint32_t j = 0; j < idx; ++j) + { + // `.isValid()` is not thread-safe, so we use this direct assessment: + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + accessBlock->destroy(acc, pointers[idx]); + pointers[idx] = nullptr; + } + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + }); + } +}; + +struct OversubscribedCreation +{ + uint32_t oversubscriptionFactor{}; + uint32_t availableSlots{}; + + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + pointers[idx] = nullptr; + for(uint32_t j = 0; j < idx + 1; ++j) + { + // `.isValid()` is not thread-safe, so we use this direct assessment: + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + + // CAUTION: The following lines have cost us more than a working day of debugging! + // If the hardware you're running on has a single program counter for the whole warp, the whole + // warp can't exit the while loop in case of even a single thread requesting another round. + // This implies that if we move the `.destroy()` out of the while loop, all the slots get + // filled up but the owning threads run idle instead of freeing them up again because they are + // waiting for their last companions to give their okay for exiting the loop. This is, of + // course, a hopeless endeavour because all slots are filled (we are vastly oversubscribed in + // this scenario). So, this loop deadlocks and no thread ever exits. + // + // ... at least that's what we believe. If you're reading this comment, we might have been + // wrong about this. + if(pointers[idx] != nullptr) + { + accessBlock->destroy(acc, pointers[idx]); + } + } + pointers[idx] = nullptr; + } + + // We only keep some of the memory. In particular, we keep one chunk less than is available, + // such that threads looking for memory after we've finished can still find some. 
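+ // Only threads with an index above (oversubscriptionFactor - 1) * availableSlots + 1, i.e. a small
+ // tail of the oversubscribed index range, try to hold on to an allocation here; everyone else leaves
+ // pointers[idx] as nullptr.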
+ while(pointers[idx] == nullptr and idx > (oversubscriptionFactor - 1) * availableSlots + 1) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + }); + } +}; + +struct CreateAllChunkSizes +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, span chunkSizes) + const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + pointers[idx] = accessBlock->create(acc, 1U); + + for(auto chunkSize : chunkSizes) + { + accessBlock->destroy(acc, pointers[idx]); + pointers[idx] = nullptr; + + // `.isValid()` is not thread-safe, so we use this direct assessment: + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + } + }); + } +}; + +template +auto customExec(auto& queue, auto const& devAcc, auto const numElements, auto... args) +{ + auto workDiv = createWorkDiv(devAcc, numElements, args...); + alpaka::exec(queue, workDiv, args...); + return workDiv; +} + +TEMPLATE_LIST_TEST_CASE("Threaded AccessBlock", "", alpaka::EnabledAccTags) +{ + using Acc = alpaka::TagToAcc; + auto [platformAcc, platformHost, devAcc, devHost, queue] = setup(); + auto accessBlockBuf = alpaka::allocBuf(devAcc, alpaka::Vec{1U}); + alpaka::memset(queue, accessBlockBuf, 0x00); + alpaka::wait(queue); + auto* accessBlock = alpaka::getPtrNative(accessBlockBuf); + auto const chunkSizes = createChunkSizes(devHost, devAcc, queue); + auto pointers = createPointers( + devHost, + devAcc, + queue, + getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0])); + alpaka::wait(queue); + + SECTION("creates second memory somewhere else.") + { + uint32_t const size = 2U; + customExec( + queue, + devAcc, + size, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), size), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(pointers.m_onHost[0] != pointers.m_onHost[1]); + } + + SECTION("creates memory of different chunk size in different pages.") + { + customExec( + queue, + devAcc, + 2U, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U), + alpaka::getPtrNative(chunkSizes.m_onDevice)); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(pageIndex(accessBlock, pointers.m_onHost[0]) != pageIndex(accessBlock, pointers.m_onHost[1])); + } + + SECTION("creates partly for insufficient memory with same chunk size.") + { + uint32_t const size = 2U; + auto* lastFreeChunk = fillAllButOne(queue, accessBlock, chunkSizes.m_onHost[0], pointers); + + // Okay, so here we start the actual test. The situation is the following: + // There is a single chunk available. + // We try to do two allocations. + // So, we expect one to succeed and one to fail. 
+ customExec( + queue, + devAcc, + size, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), size), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK( + ((pointers.m_onHost[0] == lastFreeChunk and pointers.m_onHost[1] == nullptr) + or (pointers.m_onHost[1] == lastFreeChunk and pointers.m_onHost[0] == nullptr))); + } + + SECTION("does not race between clean up and create.") + { + fillWith(queue, accessBlock, chunkSizes.m_onHost[0], pointers); + auto freePage = pageIndex(accessBlock, freeAllButOneOnFirstPage(queue, accessBlock, pointers)); + + // Now, pointer1 is the last valid pointer to page 0. Destroying it will clean up the page. + alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; + + alpaka::exec( + queue, + workDivSingleThread, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0])); + + alpaka::exec( + queue, + workDivSingleThread, + CreateUntilSuccess{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 1U), + chunkSizes.m_onHost[0]); + + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(pageIndex(accessBlock, pointers.m_onHost[0]) == freePage); + } + + SECTION("destroys two pointers of different size.") + { + customExec( + queue, + devAcc, + 2U, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U), + alpaka::getPtrNative(chunkSizes.m_onDevice)); + alpaka::wait(queue); + + customExec( + queue, + devAcc, + 2U, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U)); + alpaka::wait(queue); + + auto result = makeBuffer(devHost, devAcc, 2U); + customExec( + queue, + devAcc, + 1U, + IsValid{}, + accessBlock, + alpaka::getPtrNative(pointers.m_onDevice), + alpaka::getPtrNative(result.m_onDevice), + result.m_extents[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); + alpaka::wait(queue); + + CHECK(not result.m_onHost[0]); + CHECK(not result.m_onHost[1]); + } + + SECTION("destroys two pointers of same size.") + { + customExec( + queue, + devAcc, + 2U, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + + customExec( + queue, + devAcc, + 2U, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U)); + alpaka::wait(queue); + + auto result = makeBuffer(devHost, devAcc, 2U); + result.m_onHost[0] = true; + result.m_onHost[1] = true; + alpaka::memcpy(queue, result.m_onDevice, result.m_onHost); + alpaka::wait(queue); + customExec( + queue, + devAcc, + 1U, + IsValid{}, + accessBlock, + alpaka::getPtrNative(pointers.m_onDevice), + alpaka::getPtrNative(result.m_onDevice), + result.m_extents[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); + alpaka::wait(queue); + + CHECK(not result.m_onHost[0]); + CHECK(not result.m_onHost[1]); + } + + SECTION("fills up all chunks in parallel and writes to them.") + { + auto content = makeBuffer( + devHost, + devAcc, + getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0])); + std::span tmp(alpaka::getPtrNative(content.m_onHost), content.m_extents[0]); + std::generate(std::begin(tmp), std::end(tmp), ContentGenerator{}); + alpaka::memcpy(queue, content.m_onDevice, content.m_onHost); + alpaka::wait(queue); + + auto workDiv = 
customExec( + queue, + devAcc, + pointers.m_extents[0], + FillAllUpAndWriteToThem{}, + accessBlock, + alpaka::getPtrNative(content.m_onDevice), + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]), + chunkSizes.m_onHost[0]); + + alpaka::wait(queue); + + auto writtenCorrectly + = checkContent(devHost, devAcc, queue, pointers, content, workDiv, chunkSizes.m_onHost[0]); + CHECK(writtenCorrectly); + } + + SECTION("destroys all pointers simultaneously.") + { + auto const allSlots = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); + auto const allSlotsOfDifferentSize + = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]); + fillWith(queue, accessBlock, chunkSizes.m_onHost[0], pointers); + + customExec( + queue, + devAcc, + pointers.m_extents[0], + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0])); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + auto result = makeBuffer(devHost, devAcc, pointers.m_extents[0]); + customExec( + queue, + devAcc, + 1U, + IsValid{}, + accessBlock, + alpaka::getPtrNative(pointers.m_onDevice), + alpaka::getPtrNative(result.m_onDevice), + result.m_extents[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); + alpaka::wait(queue); + + std::span tmpResults(alpaka::getPtrNative(result.m_onHost), result.m_extents[0]); + CHECK(std::none_of(std::cbegin(tmpResults), std::cend(tmpResults), [](auto const val) { return val; })); + + CHECK(getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]) == allSlots); + CHECK( + getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]) + == allSlotsOfDifferentSize); + } + + SECTION("creates and destroys multiple times.") + { + customExec( + queue, + devAcc, + pointers.m_extents[0], + CreateAndDestroMultipleTimes{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + std::span tmpPointers(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + std::sort(std::begin(tmpPointers), std::end(tmpPointers)); + CHECK(std::unique(std::begin(tmpPointers), std::end(tmpPointers)) == std::end(tmpPointers)); + } + + SECTION("can handle oversubscription.") + { + uint32_t oversubscriptionFactor = 2U; + auto availableSlots = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); + + // This is oversubscribed but we will only hold keep less than 1/oversubscriptionFactor of the memory in the + // end. + auto manyPointers = makeBuffer(devHost, devAcc, oversubscriptionFactor * availableSlots); + customExec( + queue, + devAcc, + manyPointers.m_extents[0], + OversubscribedCreation{oversubscriptionFactor, availableSlots}, + accessBlock, + span(alpaka::getPtrNative(manyPointers.m_onDevice), manyPointers.m_extents[0]), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, manyPointers.m_onHost, manyPointers.m_onDevice); + alpaka::wait(queue); + + // We only let the last (availableSlots-1) keep their memory. So, the rest at the beginning should have a + // nullptr. 
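+ // beginNonNull mirrors the keep-condition in OversubscribedCreation: no thread with an index below
+ // (oversubscriptionFactor - 1) * availableSlots + 1 ever kept an allocation, so everything in front of
+ // it must still be nullptr.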
+ std::span tmpManyPointers(alpaka::getPtrNative(manyPointers.m_onHost), manyPointers.m_extents[0]); + auto beginNonNull = std::begin(tmpManyPointers) + (oversubscriptionFactor - 1) * availableSlots + 1; + + CHECK(std::all_of( + std::begin(tmpManyPointers), + beginNonNull, + [](auto const pointer) { return pointer == nullptr; })); + + std::sort(beginNonNull, std::end(tmpManyPointers)); + CHECK(std::unique(beginNonNull, std::end(tmpManyPointers)) == std::end(tmpManyPointers)); + } + + SECTION("can handle many different chunk sizes.") + { + auto chunkSizes = makeBuffer(devHost, devAcc, pageSize); + std::span chunkSizesSpan(alpaka::getPtrNative(chunkSizes.m_onHost), chunkSizes.m_extents[0]); + std::iota(std::begin(chunkSizesSpan), std::end(chunkSizesSpan), 1U); + alpaka::memcpy(queue, chunkSizes.m_onDevice, chunkSizes.m_onHost); + alpaka::wait(queue); + + customExec( + queue, + devAcc, + MyAccessBlock::numPages(), + CreateAllChunkSizes{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), MyAccessBlock::numPages()), + std::span(alpaka::getPtrNative(chunkSizes.m_onDevice), chunkSizes.m_extents[0])); + + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + std::span tmpPointers(alpaka::getPtrNative(pointers.m_onHost), MyAccessBlock::numPages()); + std::sort(std::begin(tmpPointers), std::end(tmpPointers)); + CHECK(std::unique(std::begin(tmpPointers), std::end(tmpPointers)) == std::end(tmpPointers)); + } + + SECTION("creates second memory somewhere in multi-page mode.") + { + uint32_t const size = 2U; + customExec( + queue, + devAcc, + size, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), size), + pageSize); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(pointers.m_onHost[0] != pointers.m_onHost[1]); + } + + alpaka::wait(queue); +} diff --git a/tests/thread-safety/BitField.cpp b/tests/thread-safety/BitField.cpp new file mode 100644 index 00000000..f31decc5 --- /dev/null +++ b/tests/thread-safety/BitField.cpp @@ -0,0 +1,92 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +#include "../unit/mocks.hpp" + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMask; +using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMaskSize; +using namespace std::chrono_literals; + +// The following test is a particular regression test which (in its current form) requires to be able to stop a +// thread from the outside. This is not possible through the alpaka interface. Thus, we resort to running this with +// `std::jthread` but we have to ensure that the alpaka atomics work. Thus, the ifdef. +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED + +TEST_CASE("Threaded BitMask") +{ + BitMask mask{}; + + SECTION("finds first free bit despite noise.") + { + // This is a regression test. An earlier version of this algorithm used to fail when other parts of the bit + // mask experienced frequent change during the search. We simulate this by letting a "noise thread" toggle + // unimportant bits while a "search thread" tries to find the first free bit. While the noise does not affect + // the result, a previous version of the algorithm does fail under these conditions (as verified by + // experiment). + + uint32_t const firstFreeIndex = GENERATE(0U, 1U, 10U); + for(uint32_t i = 0; i < firstFreeIndex; ++i) + { + mask.set(accSerial, i); + } + + uint32_t result = BitMaskSize; + auto noiseThread = std::jthread( + [&mask, firstFreeIndex](std::stop_token const& stopToken) + { + while(not stopToken.stop_requested()) + { + for(uint32_t i = firstFreeIndex + 1; i < BitMaskSize; ++i) + { + mask.flip(accSerial, i); + } + } + }); + std::thread([&mask, &result]() { result = mask.firstFreeBit(accSerial); }).join(); + std::this_thread::sleep_for(20ms); + CHECK(result == firstFreeIndex); + noiseThread.request_stop(); + } +} +#else +TEST_CASE("Threaded BitMask", "[!shouldfail]") +{ + FAIL("The Threaded BitMask regression test could not run because it is only available with the std::threads " + "backend enabled."); +} +#endif // ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED diff --git a/tests/thread-safety/Scatter.cpp b/tests/thread-safety/Scatter.cpp new file mode 100644 index 00000000..95fc5a7a --- /dev/null +++ b/tests/thread-safety/Scatter.cpp @@ -0,0 +1,859 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + + +#include "mallocMC/creationPolicies/Scatter.hpp" + +#include "../unit/mocks.hpp" +#include "mallocMC/alignmentPolicies/Shrink.hpp" +#include "mallocMC/creationPolicies/FlatterScatter/DataPage.hpp" +#include "mallocMC/device_allocator.hpp" +#include "mallocMC/distributionPolicies/Noop.hpp" +#include "mallocMC/mallocMC_utils.hpp" +#include "mallocMC/oOMPolicies/ReturnNull.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using Dim = alpaka::DimInt<1>; +using Idx = std::uint32_t; + + +constexpr uint32_t pageSize = 1024; +constexpr uint32_t numPages = 4; +// Page table entry size = sizeof(chunkSize) + sizeof(fillingLevel): +constexpr uint32_t pteSize = 8 + 4 + 4; +constexpr uint32_t blockSize = numPages * (pageSize + pteSize); + +template +struct ScatterHeapConfig +{ + static constexpr uint32_t const accessblocksize = T_blockSize; + static constexpr uint32_t const pagesize = T_pageSize; + static constexpr uint32_t const wastefactor = T_wasteFactor; + static constexpr uint32_t const regionsize = 1U; + static constexpr bool const resetfreedpages = true; +}; + +using MyScatter = mallocMC::CreationPolicies::Scatter< + ScatterHeapConfig>::AlignmentAwarePolicy>; +using MyDeviceAllocator = mallocMC::DeviceAllocator< + MyScatter, + mallocMC::DistributionPolicies::Noop, + mallocMC::OOMPolicies::ReturnNull, + mallocMC::AlignmentPolicies::Shrink<>>; + +using std::span; + +// Fill all pages of the given access block with occupied chunks of the given size. This is useful to test the +// behaviour near full filling but also to have a deterministic page and chunk where an allocation must happen +// regardless of the underlying access optimisations etc. 
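+// The helper functors below (FillWith, Create, CreateUntilSuccess, Destroy) are alpaka kernels. As in the
+// AccessBlock thread-safety test, a typical launch (see fillWith() and customExec() further down) looks roughly like
+//   alpaka::exec<Acc>(queue, workDiv, FillWith{}, accessBlock, chunkSize, devPointers, numPointers);
+// where devPointers/numPointers stand for the device-side result array and its length.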
+ +struct FillWith +{ + template + ALPAKA_FN_ACC auto operator()( + TAcc const& acc, + auto* accessBlock, + uint32_t const chunkSize, + void** result, + uint32_t const size) const -> void + { + std::generate( + result, + result + size, + [&acc, accessBlock, chunkSize]() + { + void* pointer{nullptr}; + while(pointer == nullptr) + { + pointer = accessBlock->create(acc, chunkSize); + } + return pointer; + }); + } +}; + +struct ContentGenerator +{ + uint32_t counter{0U}; + + ALPAKA_FN_ACC auto operator()() -> uint32_t + { + return counter++; + } +}; + +ALPAKA_FN_ACC auto forAll(auto const& acc, auto size, auto functor) +{ + auto const idx0 = alpaka::getIdx(acc)[0]; + auto const numElements = alpaka::getWorkDiv(acc)[0]; + for(uint32_t i = 0; i < numElements; ++i) + { + auto idx = idx0 + i; + if(idx < size) + { + functor(idx); + } + } +} + +struct Create +{ + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll(acc, pointers.size(), [&](auto idx) { pointers[idx] = accessBlock->create(acc, chunkSize); }); + } + + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto* chunkSizes) const + { + forAll(acc, pointers.size(), [&](auto idx) { pointers[idx] = accessBlock->create(acc, chunkSizes[idx]); }); + } +}; + +struct CreateUntilSuccess +{ + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + }); + } +}; + +struct Destroy +{ + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, auto* accessBlock, span pointers) const + { + forAll(acc, pointers.size(), [&](auto idx) { accessBlock->destroy(acc, pointers[idx]); }); + } +}; + +using Host = alpaka::AccCpuSerial; + +template +struct Buffer +{ + TDevAcc m_devAcc; + TDevHost m_devHost; + + alpaka::Vec m_extents; + + alpaka::Buf m_onDevice; + alpaka::Buf m_onHost; + + Buffer(TDevHost const& devHost, TDevAcc const& devAcc, auto extents) + : m_devAcc{devAcc} + , m_devHost{devHost} + , m_extents{extents} + , m_onDevice(alpaka::allocBuf(devAcc, m_extents)) + , m_onHost(alpaka::allocBuf(devHost, m_extents)) + { + } +}; + +template +auto makeBuffer(TDevHost const& devHost, TDevAcc const& devAcc, auto extents) +{ + return Buffer{devHost, devAcc, extents}; +} + +auto createChunkSizes(auto const& devHost, auto const& devAcc, auto& queue) +{ + auto chunkSizes = makeBuffer(devHost, devAcc, 2U); + chunkSizes.m_onHost[0] = 32U; + chunkSizes.m_onHost[1] = 512U; + alpaka::memcpy(queue, chunkSizes.m_onDevice, chunkSizes.m_onHost); + return chunkSizes; +} + +auto createPointers(auto const& devHost, auto const& devAcc, auto& queue, uint32_t const size) +{ + auto pointers = makeBuffer(devHost, devAcc, size); + std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + std::fill(std::begin(tmp), std::end(tmp), reinterpret_cast(1U)); + alpaka::memcpy(queue, pointers.m_onDevice, pointers.m_onHost); + return pointers; +} + +template +auto setup() +{ + alpaka::Platform const platformAcc = {}; + alpaka::Platform> const platformHost = {}; + alpaka::Dev> const devAcc(alpaka::getDevByIdx(platformAcc, 0)); + alpaka::Dev> const devHost(alpaka::getDevByIdx(platformHost, 0)); + alpaka::Queue queue{devAcc}; + return std::make_tuple(platformAcc, platformHost, devAcc, devHost, queue); +} + +template +auto createWorkDiv(auto const& 
devAcc, auto const numElements, auto... args) -> alpaka::WorkDivMembers +{ + if constexpr(std::is_same_v, alpaka::TagCpuSerial>) + { + return {{1U}, {1U}, {numElements}}; + } + else + { + alpaka::KernelCfg const kernelCfg + = {numElements, 1, false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted}; + return alpaka::getValidWorkDiv(kernelCfg, devAcc, args...); + } +} + +template +auto fillWith(auto& queue, auto* accessBlock, auto const& chunkSize, auto& pointers) +{ + alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; + alpaka::exec( + queue, + workDivSingleThread, + FillWith{}, + accessBlock, + chunkSize, + alpaka::getPtrNative(pointers.m_onDevice), + pointers.m_extents[0]); + alpaka::wait(queue); + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); +} + +template +auto fillAllButOne(auto& queue, auto* accessBlock, auto const& chunkSize, auto& pointers) +{ + fillWith(queue, accessBlock, chunkSize, pointers); + auto* pointer1 = pointers.m_onHost[0]; + + // Destroy exactly one pointer (i.e. the first). This is non-destructive on the actual values in + // devPointers, so we don't need to wait for the copy before to finish. + alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; + alpaka::exec( + queue, + workDivSingleThread, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 1U)); + alpaka::wait(queue); + return pointer1; +} + +template +auto freeAllButOneOnFirstPage(auto& queue, auto* accessBlock, auto& pointers) +{ + std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + std::sort(std::begin(tmp), std::end(tmp)); + // This points to the first chunk of page 0. + auto* pointer1 = tmp[0]; + alpaka::wait(queue); + alpaka::memcpy(queue, pointers.m_onDevice, pointers.m_onHost); + alpaka::wait(queue); + auto size = pointers.m_extents[0] / numPages - 1; + // Delete all other chunks on page 0. 
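+ // Assuming the heap was filled completely beforehand (see fillWith above), the sorted pointers' first
+ // extents[0] / numPages entries all belong to page 0, so destroying `size` of them starting at index 1
+ // leaves pointer1 as the only remaining allocation on that page.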
+ customExec( + queue, + pointers.m_devAcc, + size, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice) + 1U, size)); + alpaka::wait(queue); + return pointer1; +} + +struct CheckContent +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* content, span pointers, auto* results, auto chunkSize) + const + { + auto const idx0 = alpaka::getIdx(acc)[0]; + auto const numElements = alpaka::getWorkDiv(acc)[0]; + for(uint32_t i = 0; i < numElements; ++i) + { + auto idx = idx0 + i; + if(idx < pointers.size()) + { + auto* begin = reinterpret_cast(pointers[idx]); + auto* end = begin + chunkSize / sizeof(uint32_t); + results[idx] = std::all_of(begin, end, [idx, content](auto val) { return val == content[idx]; }); + } + } + } +}; + +template +auto checkContent( + auto& devHost, + auto& devAcc, + auto& queue, + auto& pointers, + auto& content, + auto& workDiv, + auto const chunkSize) +{ + auto results = makeBuffer(devHost, devAcc, pointers.m_extents[0]); + alpaka::exec( + queue, + workDiv, + CheckContent{}, + alpaka::getPtrNative(content.m_onDevice), + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]), + alpaka::getPtrNative(results.m_onDevice), + chunkSize); + alpaka::wait(queue); + alpaka::memcpy(queue, results.m_onHost, results.m_onDevice); + alpaka::wait(queue); + + + std::span tmpResults(alpaka::getPtrNative(results.m_onHost), results.m_extents[0]); + auto writtenCorrectly = std::reduce(std::cbegin(tmpResults), std::cend(tmpResults), true, std::multiplies{}); + + return writtenCorrectly; +} + +struct GetAvailableSlots +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, auto chunkSize, auto* result) const + { + *result = accessBlock->getAvailableSlots(acc, chunkSize); + }; +}; + +template +auto getAvailableSlots(auto* accessBlock, auto& queue, auto const& devHost, auto const& devAcc, auto chunkSize) +{ + alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; + alpaka::wait(queue); + auto result = makeBuffer(devHost, devAcc, 1U); + alpaka::wait(queue); + alpaka::exec( + queue, + workDivSingleThread, + GetAvailableSlots{}, + accessBlock, + chunkSize, + alpaka::getPtrNative(result.m_onDevice)); + alpaka::wait(queue); + alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); + alpaka::wait(queue); + auto tmp = result.m_onHost[0]; + alpaka::wait(queue); + return tmp; +} + +auto pageIndex(auto accessBlock, auto* pointer) +{ + // This is a bit dirty: What we should do here is enqueue a kernel that calls accessBlock->pageIndex(). + // But we assume that the access block starts with the first page, so the pointer to the first page equals the + // pointer to the access block. Not sure if this is reliable if the pointers are device pointers. 
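+ // Under that assumption, the call below should boil down to roughly
+ //   (reinterpret_cast<char const*>(pointer) - reinterpret_cast<char const*>(base)) / pageSize
+ // with `base` being the device pointer of the data buffer.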
+ return mallocMC::indexOf(pointer, alpaka::getPtrNative(accessBlock), pageSize); +} + +struct FillAllUpAndWriteToThem +{ + ALPAKA_FN_ACC auto operator()( + auto const& acc, + auto* accessBlock, + auto* content, + span pointers, + auto chunkSize) const + { + auto const idx0 = alpaka::getIdx(acc)[0]; + auto const numElements = alpaka::getWorkDiv(acc)[0]; + for(uint32_t i = 0; i < numElements; ++i) + { + auto idx = idx0 + i; + if(idx < pointers.size()) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + auto* begin = reinterpret_cast(pointers[idx]); + auto* end = begin + chunkSize / sizeof(uint32_t); + std::fill(begin, end, content[idx]); + } + } + } +}; + +struct CreateAndDestroMultipleTimes +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + pointers[idx] = nullptr; + for(uint32_t j = 0; j < idx; ++j) + { + // `.isValid()` is not thread-safe, so we use this direct assessment: + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + accessBlock->destroy(acc, pointers[idx]); + pointers[idx] = nullptr; + } + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + }); + } +}; + +struct OversubscribedCreation +{ + uint32_t oversubscriptionFactor{}; + uint32_t availableSlots{}; + + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, auto chunkSize) const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + pointers[idx] = nullptr; + for(uint32_t j = 0; j < idx + 1; ++j) + { + // `.isValid()` is not thread-safe, so we use this direct assessment: + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + + // CAUTION: The following lines have cost us more than a working day of debugging! + // If the hardware you're running on has a single program counter for the whole warp, the whole + // warp can't exit the while loop in case of even a single thread requesting another round. + // This implies that if we move the `.destroy()` out of the while loop, all the slots get + // filled up but the owning threads run idle instead of freeing them up again because they are + // waiting for their last companions to give their okay for exiting the loop. This is, of + // course, a hopeless endeavour because all slots are filled (we are vastly oversubscribed in + // this scenario). So, this loop deadlocks and no thread ever exits. + // + // ... at least that's what we believe. If you're reading this comment, we might have been + // wrong about this. + if(pointers[idx] != nullptr) + { + accessBlock->destroy(acc, pointers[idx]); + } + } + pointers[idx] = nullptr; + } + + // We only keep some of the memory. In particular, we keep one chunk less than is available, + // such that threads looking for memory after we've finished can still find some. 
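+ // Only threads with an index above (oversubscriptionFactor - 1) * availableSlots + 1, i.e. a small
+ // tail of the oversubscribed index range, try to hold on to an allocation here; everyone else leaves
+ // pointers[idx] as nullptr.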
+ while(pointers[idx] == nullptr and idx > (oversubscriptionFactor - 1) * availableSlots + 1) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + }); + } +}; + +struct CreateAllChunkSizes +{ + ALPAKA_FN_ACC auto operator()(auto const& acc, auto* accessBlock, span pointers, span chunkSizes) + const + { + forAll( + acc, + pointers.size(), + [&](auto idx) + { + pointers[idx] = accessBlock->create(acc, 1U); + + for(auto chunkSize : chunkSizes) + { + accessBlock->destroy(acc, pointers[idx]); + pointers[idx] = nullptr; + + // `.isValid()` is not thread-safe, so we use this direct assessment: + while(pointers[idx] == nullptr) + { + pointers[idx] = accessBlock->create(acc, chunkSize); + } + } + }); + } +}; + +template +auto customExec(auto& queue, auto const& devAcc, auto const numElements, auto... args) +{ + auto workDiv = createWorkDiv(devAcc, numElements, args...); + alpaka::exec(queue, workDiv, args...); + return workDiv; +} + +TEMPLATE_LIST_TEST_CASE("Threaded Scatter", "", alpaka::EnabledAccTags) +{ + using Acc = alpaka::TagToAcc; + auto [platformAcc, platformHost, devAcc, devHost, queue] = setup(); + auto accessBlockBuf = alpaka::allocBuf(devAcc, alpaka::Vec{1U}); + auto dataBuf = alpaka::allocBuf, Idx>( + devAcc, + alpaka::Vec{1U}); + MyScatter::initHeap( + devAcc, + queue, + alpaka::getPtrNative(accessBlockBuf), + static_cast(alpaka::getPtrNative(dataBuf)), + blockSize); + alpaka::wait(queue); + auto* accessBlock = alpaka::getPtrNative(accessBlockBuf); + auto const chunkSizes = createChunkSizes(devHost, devAcc, queue); + auto pointers = createPointers( + devHost, + devAcc, + queue, + getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0])); + alpaka::wait(queue); + + SECTION("creates second memory somewhere else.") + { + uint32_t const size = 2U; + customExec( + queue, + devAcc, + size, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), size), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(pointers.m_onHost[0] != pointers.m_onHost[1]); + } + + SECTION("creates memory of different chunk size in different pages.") + { + customExec( + queue, + devAcc, + 2U, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U), + alpaka::getPtrNative(chunkSizes.m_onDevice)); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(pageIndex(dataBuf, pointers.m_onHost[0]) != pageIndex(dataBuf, pointers.m_onHost[1])); + } + + SECTION("creates partly for insufficient memory with same chunk size.") + { + uint32_t const size = 2U; + auto* lastFreeChunk = fillAllButOne(queue, accessBlock, chunkSizes.m_onHost[0], pointers); + + // Okay, so here we start the actual test. The situation is the following: + // There is a single chunk available. + // We try to do two allocations. + // So, we expect one to succeed and one to fail. 
+ customExec( + queue, + devAcc, + size, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), size), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK( + ((pointers.m_onHost[0] == lastFreeChunk and pointers.m_onHost[1] == nullptr) + or (pointers.m_onHost[1] == lastFreeChunk and pointers.m_onHost[0] == nullptr))); + } + + SECTION("does not race between clean up and create.") + { + fillWith(queue, accessBlock, chunkSizes.m_onHost[0], pointers); + auto freePage = pageIndex(dataBuf, freeAllButOneOnFirstPage(queue, accessBlock, pointers)); + + // Now, pointer1 is the last valid pointer to page 0. Destroying it will clean up the page. + alpaka::WorkDivMembers const workDivSingleThread{Idx{1}, Idx{1}, Idx{1}}; + + alpaka::exec( + queue, + workDivSingleThread, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0])); + + alpaka::exec( + queue, + workDivSingleThread, + CreateUntilSuccess{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 1U), + chunkSizes.m_onHost[0]); + + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(pageIndex(dataBuf, pointers.m_onHost[0]) == freePage); + } + + SECTION("destroys two pointers of different size.") + { + auto workDiv = customExec( + queue, + devAcc, + 2U, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U), + alpaka::getPtrNative(chunkSizes.m_onDevice)); + alpaka::wait(queue); + + auto const beforeDestroy0 + = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); + auto const beforeDestroy1 + = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]); + + alpaka::exec( + queue, + workDiv, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U)); + alpaka::wait(queue); + + auto const afterDestroy0 = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); + auto const afterDestroy1 = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]); + + CHECK(beforeDestroy0 < afterDestroy0); + CHECK(beforeDestroy1 < afterDestroy1); + } + + SECTION("destroys two pointers of same size.") + { + auto workDiv = customExec( + queue, + devAcc, + 2U, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + + auto const beforeDestroy = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); + + alpaka::exec( + queue, + workDiv, + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), 2U)); + alpaka::wait(queue); + + auto const afterDestroy = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); + CHECK(beforeDestroy == afterDestroy - 2U); + } + + SECTION("fills up all chunks in parallel and writes to them.") + { + auto content = makeBuffer( + devHost, + devAcc, + getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0])); + std::span tmp(alpaka::getPtrNative(content.m_onHost), content.m_extents[0]); + std::generate(std::begin(tmp), std::end(tmp), ContentGenerator{}); + alpaka::memcpy(queue, content.m_onDevice, content.m_onHost); + alpaka::wait(queue); + + auto workDiv = customExec( + queue, + devAcc, + pointers.m_extents[0], + FillAllUpAndWriteToThem{}, + accessBlock, + alpaka::getPtrNative(content.m_onDevice), + 
span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]), + chunkSizes.m_onHost[0]); + + alpaka::wait(queue); + + auto writtenCorrectly + = checkContent(devHost, devAcc, queue, pointers, content, workDiv, chunkSizes.m_onHost[0]); + CHECK(writtenCorrectly); + } + + SECTION("destroys all pointers simultaneously.") + { + auto const allSlots = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); + auto const allSlotsOfDifferentSize + = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]); + fillWith(queue, accessBlock, chunkSizes.m_onHost[0], pointers); + + customExec( + queue, + devAcc, + pointers.m_extents[0], + Destroy{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0])); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]) == allSlots); + CHECK( + getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[1]) + == allSlotsOfDifferentSize); + } + + SECTION("creates and destroys multiple times.") + { + customExec( + queue, + devAcc, + pointers.m_extents[0], + CreateAndDestroMultipleTimes{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), pointers.m_extents[0]), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + std::span tmpPointers(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + std::sort(std::begin(tmpPointers), std::end(tmpPointers)); + CHECK(std::unique(std::begin(tmpPointers), std::end(tmpPointers)) == std::end(tmpPointers)); + } + + SECTION("can handle oversubscription.") + { + uint32_t oversubscriptionFactor = 2U; + auto availableSlots = getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]); + + // This is oversubscribed but we will only hold keep less than 1/oversubscriptionFactor of the memory in the + // end. + auto manyPointers = makeBuffer(devHost, devAcc, oversubscriptionFactor * availableSlots); + customExec( + queue, + devAcc, + manyPointers.m_extents[0], + OversubscribedCreation{oversubscriptionFactor, availableSlots}, + accessBlock, + span(alpaka::getPtrNative(manyPointers.m_onDevice), manyPointers.m_extents[0]), + chunkSizes.m_onHost[0]); + alpaka::wait(queue); + + alpaka::memcpy(queue, manyPointers.m_onHost, manyPointers.m_onDevice); + alpaka::wait(queue); + + // We only let the last (availableSlots-1) keep their memory. So, the rest at the beginning should have a + // nullptr. 
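+ // beginNonNull reflects the keep-condition in OversubscribedCreation: threads with an index below
+ // (oversubscriptionFactor - 1) * availableSlots + 1 never kept an allocation, so every entry before
+ // beginNonNull is expected to still be nullptr.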
+ std::span tmpManyPointers(alpaka::getPtrNative(manyPointers.m_onHost), manyPointers.m_extents[0]); + auto beginNonNull = std::begin(tmpManyPointers) + (oversubscriptionFactor - 1) * availableSlots + 1; + + CHECK(std::all_of( + std::begin(tmpManyPointers), + beginNonNull, + [](auto const pointer) { return pointer == nullptr; })); + + std::sort(beginNonNull, std::end(tmpManyPointers)); + CHECK(std::unique(beginNonNull, std::end(tmpManyPointers)) == std::end(tmpManyPointers)); + } + + SECTION("creates second memory somewhere in multi-page mode.") + { + uint32_t const size = 2U; + customExec( + queue, + devAcc, + size, + Create{}, + accessBlock, + span(alpaka::getPtrNative(pointers.m_onDevice), size), + pageSize); + alpaka::wait(queue); + + alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); + alpaka::wait(queue); + + CHECK(pointers.m_onHost[0] != pointers.m_onHost[1]); + } + + alpaka::wait(queue); +} diff --git a/tests/unit/AccessBlock.cpp b/tests/unit/AccessBlock.cpp new file mode 100644 index 00000000..d44a5fb4 --- /dev/null +++ b/tests/unit/AccessBlock.cpp @@ -0,0 +1,532 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz, Rene Widera + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +#include "mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp" + +#include "mallocMC/creationPolicies/FlatterScatter/BitField.hpp" +#include "mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp" +#include "mallocMC/mallocMC_utils.hpp" +#include "mocks.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +template +struct TestableAccessBlock + : mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock +{ +public: + TestableAccessBlock() = default; + using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock::blockSize; + using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock::pageSize; + using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock::wasteFactor; + using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock:: + resetfreedpages; +}; + +using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMaskStorageType; +using mallocMC::CreationPolicies::FlatterScatterAlloc::PageInterpretation; + +constexpr uint32_t const pageTableEntrySize = 8U; +constexpr uint32_t const pageSize1 = 1024U; +constexpr uint32_t const pageSize2 = 4096U; + +using AccessBlocks = std::tuple< + TestableAccessBlock, AlignmentPolicy>, + TestableAccessBlock, AlignmentPolicy>, + TestableAccessBlock, AlignmentPolicy>, + TestableAccessBlock, AlignmentPolicy>>; + +template +auto fillWith(TestableAccessBlock& accessBlock, uint32_t const chunkSize) + -> std::vector +{ + std::vector pointers(accessBlock.getAvailableSlots(accSerial, chunkSize)); + std::generate( + std::begin(pointers), + std::end(pointers), + [&accessBlock, chunkSize]() + { + void* pointer = accessBlock.create(accSerial, chunkSize); + REQUIRE(pointer != nullptr); + return pointer; + }); + return pointers; +} + +template +struct SelectivelyWastedHeapConfig : HeapConfig +{ + ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto isInAllowedRange( + auto const& /*acc*/, + uint32_t const chunkSize, + uint32_t const numBytes) + { + auto currentWasteFactor = (numBytes == T_allowedToWasteNumBytes) ? T_wasteFactor : 1U; + return (chunkSize >= numBytes && chunkSize <= currentWasteFactor * numBytes); + } +}; + +TEMPLATE_LIST_TEST_CASE("AccessBlock", "", AccessBlocks) +{ + using AccessBlock = TestType; + constexpr auto const blockSize = AccessBlock::blockSize; + constexpr auto const pageSize = AccessBlock::pageSize; + + AccessBlock accessBlock{}; + + SECTION("knows its number of pages.") + { + // The overhead from the metadata is small enough that this just happens to round down to the correct values. + // If you choose weird numbers, it might no longer. + CHECK(accessBlock.numPages() == blockSize / pageSize); + } + + SECTION("knows its available slots.") + { + uint32_t const chunkSize = GENERATE(1U, 2U, 32U, 57U, 1024U); + // This is not exactly true. It is only true because the largest chunk size we chose above is exactly the size + // of one page. In general, this number would be fractional for larger than page size chunks but I don't want + // to bother right now: + uint32_t slotsPerPage = chunkSize < pageSize ? 
PageInterpretation::numChunks(chunkSize) : 1U; + + uint32_t numOccupied = GENERATE(0U, 1U, 10U); + uint32_t actualNumOccupied = numOccupied; + for(uint32_t i = 0; i < numOccupied; ++i) + { + if(accessBlock.create(accSerial, chunkSize) == nullptr) + { + actualNumOccupied--; + } + } + + auto totalSlots = accessBlock.numPages() * slotsPerPage; + if(totalSlots > actualNumOccupied) + { + CHECK(accessBlock.getAvailableSlots(accSerial, chunkSize) == totalSlots - actualNumOccupied); + } + else + { + CHECK(accessBlock.getAvailableSlots(accSerial, chunkSize) == 0U); + } + } + + constexpr uint32_t const chunkSize = 32U; + + SECTION("creates") + { + SECTION("no nullptr if memory is available.") + { + // This is not a particularly hard thing to do because any uninitialised pointer that could be returned is + // most likely not exactly the nullptr. We just leave this in as it currently doesn't hurt anybody to keep + // it. + CHECK(accessBlock.create(accSerial, chunkSize) != nullptr); + } + + SECTION("memory that can be written to and read from.") + { + uint32_t const arbitraryValue = 42; + auto* ptr = static_cast(accessBlock.create(accSerial, chunkSize)); + REQUIRE(ptr != nullptr); + *ptr = arbitraryValue; + CHECK(*ptr == arbitraryValue); + } + + SECTION("second memory somewhere else.") + { + CHECK(accessBlock.create(accSerial, chunkSize) != accessBlock.create(accSerial, chunkSize)); + } + + SECTION("memory of different chunk size in different pages.") + { + constexpr uint32_t const chunkSize2 = 512U; + REQUIRE(chunkSize != chunkSize2); + // To be precise, the second call will actually return a nullptr if there is only a single page (which is + // one of the test cases at the time of writing). But that technically passes this test, too. + + CHECK( + accessBlock.pageIndex(accessBlock.create(accSerial, chunkSize)) + != accessBlock.pageIndex(accessBlock.create(accSerial, chunkSize2))); + } + + SECTION("nullptr if there's no page with fitting chunk size") + { + // This requests one chunk of a different chunk size for each page. As a new page is required each time, + // all pages have a chunk size set at the end. And none of those is `chunkSize`. 
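+            // A worked example as a sketch (assuming accessBlock.numPages() == 4, with chunkSize == 32 from
+            // above): the loop requests chunk sizes 33, 34, 35 and 36, so every page ends up bound to a chunk
+            // size other than 32 and the final create(accSerial, chunkSize) has no page left to serve it.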
+ for(uint32_t index = 0; index < accessBlock.numPages(); ++index) + { + auto const differentChunkSize = chunkSize + 1U + index; + REQUIRE(chunkSize != differentChunkSize); + accessBlock.create(accSerial, differentChunkSize); + } + + CHECK(accessBlock.create(accSerial, chunkSize) == nullptr); + } + + SECTION("nullptr if all pages have full filling level.") + { + fillWith(accessBlock, chunkSize); + CHECK(accessBlock.create(accSerial, chunkSize) == nullptr); + } + + SECTION("last remaining chunk.") + { + auto pointers = fillWith(accessBlock, chunkSize); + uint32_t const index = GENERATE(0U, 1U, 42U); + void* pointer = pointers[std::min(index, static_cast(pointers.size()) - 1)]; + accessBlock.destroy(accSerial, pointer); + CHECK(accessBlock.create(accSerial, chunkSize) == pointer); + } + + SECTION("memory larger than page size.") + { + if(accessBlock.numPages() >= 2U) + { + CHECK(accessBlock.isValid(accSerial, accessBlock.create(accSerial, 2U * pageSize))); + } + } + + SECTION("nullptr if chunkSize is larger than total available memory in pages.") + { + // larger than the available memory but in some cases smaller than the block size even after subtracting + // the space for the page table: + uint32_t const excessiveChunkSize = accessBlock.numPages() * pageSize + 1U; + CHECK(accessBlock.create(accSerial, excessiveChunkSize) == nullptr); + } + + SECTION("in the correct place for larger than page size.") + { + // we want to allocate 2 pages: + if(accessBlock.numPages() > 1U) + { + auto pointers = fillWith(accessBlock, pageSize); + std::sort(std::begin(pointers), std::end(pointers)); + + // Now, we free two contiguous chunks such that there is one deterministic spot wherefrom our request + // can be served. + uint32_t index = GENERATE(0U, 1U, 5U); + index = std::min(index, static_cast(pointers.size()) - 2U); + accessBlock.destroy(accSerial, pointers[index]); + accessBlock.destroy(accSerial, pointers[index + 1]); + + // Must be exactly where we free'd the pages: + CHECK( + accessBlock.pageIndex(accessBlock.create(accSerial, 2U * pageSize)) + == static_cast(index)); + } + } + + SECTION("a pointer and knows it's valid afterwards.") + { + void* pointer = accessBlock.create(accSerial, chunkSize); + CHECK(accessBlock.isValid(accSerial, pointer)); + } + + SECTION("the last pointer in page and its allocation does not reach into the bit field.") + { + auto slots = accessBlock.getAvailableSlots(accSerial, chunkSize); + // Find the last allocation on the first page: + auto pointers = fillWith(accessBlock, chunkSize); + std::sort(std::begin(pointers), std::end(pointers)); + auto lastOfPage0 = pointers[slots / accessBlock.numPages() - 1]; + + // Free the first bit of the bit field by destroying the first allocation in the first page: + accessBlock.destroy(accSerial, pointers[0]); + REQUIRE(not accessBlock.isValid(accSerial, pointers[0])); + + // Write all ones to the last of the first page: If there is an overlap between the region of the last + // chunk and the bit field, our recently free'd first chunk will have its bit set by this operation. + char* begin = reinterpret_cast(lastOfPage0); + auto* end = begin + chunkSize; + std::fill(begin, end, 255U); + + // Now, we try to allocate one more chunk. It must be the one we free'd before. + CHECK(accessBlock.create(accSerial, chunkSize) == pointers[0]); + REQUIRE(accessBlock.isValid(accSerial, pointers[0])); + } + + SECTION("and writes something very close to page size.") + { + // This is a regression test. 
The original version of the code started to use multi-page mode when numBytes + // >= pageSize. That is too late because if we're not in multi-page mode, we need to leave some space for + // the bit mask. Thus, the following test would corrupt the bit mask, if we were to allocate this in + // chunked mode. + +#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) + REQUIRE(sizeof(BitMaskStorageType<>) > 1U); + auto localChunkSize = pageSize - 1U; + auto slots = accessBlock.getAvailableSlots(accSerial, localChunkSize); + auto pointer = accessBlock.create(accSerial, localChunkSize); + REQUIRE(slots == accessBlock.getAvailableSlots(accSerial, localChunkSize) + 1); + memset(pointer, 0, localChunkSize); + CHECK_NOTHROW(accessBlock.destroy(accSerial, pointer)); +#else + SUCCEED("This bug actually never had any observable behaviour in NDEBUG mode because the corrupted bit " + "mask is never read again."); +#endif // NDEBUG + } + + SECTION("with waste factor") + { + constexpr uint32_t const wastefactor = 3U; + TestableAccessBlock, AlignmentPolicy> wastedAccessBlock{}; + auto pointers = fillWith(wastedAccessBlock, chunkSize); + + auto smallerChunkSize = chunkSize / (wastefactor - 1U); + REQUIRE(smallerChunkSize < chunkSize); + + wastedAccessBlock.destroy(accSerial, pointers[0]); + + // Some consistency checks: Interpreting as an access block without waste factor, we'll surely have no + // available memory for this chunk size. + REQUIRE( + reinterpret_cast(&wastedAccessBlock)->getAvailableSlots(accSerial, smallerChunkSize) + == 0U); + REQUIRE( + reinterpret_cast(&wastedAccessBlock)->create(accSerial, smallerChunkSize) == nullptr); + + SECTION("knows its available slots.") + { + CHECK(wastedAccessBlock.getAvailableSlots(accSerial, smallerChunkSize) == 1U); + } + + SECTION("creates a smaller chunk size.") + { + CHECK(wastedAccessBlock.create(accSerial, smallerChunkSize) == pointers[0]); + } + + SECTION("fails to create too many smaller chunks.") + { + CHECK(wastedAccessBlock.create(accSerial, smallerChunkSize) == pointers[0]); + CHECK(wastedAccessBlock.create(accSerial, smallerChunkSize) == nullptr); + } + + SECTION("is not misled by mixing above and below multi-page threshold.") + { + auto const aboveMultiPageThreshold = pageSize - 2 * sizeof(BitMaskStorageType<>); + auto const belowMultiPageThreshold = aboveMultiPageThreshold / (wastefactor - 1U); + for(auto const pointer : pointers) + { + // free one page we want to operate on + if(wastedAccessBlock.isValid(accSerial, pointer) and wastedAccessBlock.pageIndex(pointer) == 0U) + { + wastedAccessBlock.destroy(accSerial, pointer); + } + } + REQUIRE(wastedAccessBlock.getAvailableSlots(accSerial, belowMultiPageThreshold) == 2U); + REQUIRE(wastedAccessBlock.getAvailableSlots(accSerial, aboveMultiPageThreshold) == 1U); + + // This allocates in multi-page mode. + CHECK(wastedAccessBlock.pageIndex(wastedAccessBlock.create(accSerial, aboveMultiPageThreshold)) == 0U); + // This tries to allocate in chunked mode but the waste factor would allow to create on the just + // allocated page. This is, of course, forbidden. 
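+            // Rough arithmetic as a sketch (assuming pageSize == 1024 and sizeof(BitMaskStorageType<>) == 4):
+            // aboveMultiPageThreshold == 1016 and belowMultiPageThreshold == 508. With wastefactor == 3 the
+            // second 1016-byte request would fit size-wise, but the page claimed by the multi-page allocation
+            // above is off-limits, hence the nullptr below.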
+ CHECK(wastedAccessBlock.create(accSerial, aboveMultiPageThreshold) == nullptr); + } + } + + SECTION("with waste function") + { + constexpr uint32_t const wastefactor = 3U; + constexpr uint32_t const selectedNumBytes = mallocMC::ceilingDivision(chunkSize, wastefactor); + TestableAccessBlock< + SelectivelyWastedHeapConfig, + AlignmentPolicy> + wastedAccessBlock{}; + auto pointers = fillWith(wastedAccessBlock, chunkSize); + + auto notSelectedNumBytes = chunkSize / (wastefactor - 1U); + + // Okay, so we want a scenario where both selectedNumBytes and notSelectedNumBytes are within the range of + // the waste factor but only for selectedNumBytes we'll actually get a waste-factor-like behaviour. + REQUIRE(selectedNumBytes < chunkSize); + REQUIRE(selectedNumBytes * wastefactor >= chunkSize); + REQUIRE(selectedNumBytes < notSelectedNumBytes); + REQUIRE(notSelectedNumBytes < chunkSize); + + wastedAccessBlock.destroy(accSerial, pointers[0]); + + // Some consistency checks: Interpreting as an access block without waste factor, we'll surely have no + // available memory for these chunk sizes. + REQUIRE( + reinterpret_cast(&wastedAccessBlock)->getAvailableSlots(accSerial, notSelectedNumBytes) + == 0U); + REQUIRE( + reinterpret_cast(&wastedAccessBlock)->getAvailableSlots(accSerial, selectedNumBytes) + == 0U); + REQUIRE( + reinterpret_cast(&wastedAccessBlock)->create(accSerial, selectedNumBytes) == nullptr); + REQUIRE( + reinterpret_cast(&wastedAccessBlock)->create(accSerial, notSelectedNumBytes) == nullptr); + + SECTION("knows its available slots.") + { + CHECK(wastedAccessBlock.getAvailableSlots(accSerial, selectedNumBytes) == 1U); + CHECK(wastedAccessBlock.getAvailableSlots(accSerial, notSelectedNumBytes) == 0U); + } + + SECTION("creates a smaller chunk size.") + { + CHECK(wastedAccessBlock.create(accSerial, notSelectedNumBytes) == nullptr); + CHECK(wastedAccessBlock.create(accSerial, selectedNumBytes) == pointers[0]); + } + + SECTION("fails to create too many smaller chunks.") + { + CHECK(wastedAccessBlock.create(accSerial, notSelectedNumBytes) == nullptr); + CHECK(wastedAccessBlock.create(accSerial, notSelectedNumBytes) == nullptr); + CHECK(wastedAccessBlock.create(accSerial, selectedNumBytes) == pointers[0]); + CHECK(wastedAccessBlock.create(accSerial, selectedNumBytes) == nullptr); + } + } + } + + SECTION("destroys") + { + void* pointer = accessBlock.create(accSerial, chunkSize); + REQUIRE(accessBlock.isValid(accSerial, pointer)); + + SECTION("a pointer thereby invalidating it.") + { + accessBlock.destroy(accSerial, pointer); + CHECK(not accessBlock.isValid(accSerial, pointer)); + } + + SECTION("the whole page if last pointer is destroyed.") + { + REQUIRE(chunkSize != pageSize); + REQUIRE(accessBlock.getAvailableSlots(accSerial, pageSize) == accessBlock.numPages() - 1); + accessBlock.destroy(accSerial, pointer); + CHECK(accessBlock.getAvailableSlots(accSerial, pageSize) == accessBlock.numPages()); + } + + SECTION("not the whole page if there still exists a valid pointer.") + { + REQUIRE(chunkSize != pageSize); + auto unOccupiedPages = accessBlock.numPages(); + void* newPointer{nullptr}; + // We can't be sure which page is used for any allocation, so we allocate again and again until we have hit + // a page that already has an allocation: + while(accessBlock.getAvailableSlots(accSerial, pageSize) != unOccupiedPages) + { + unOccupiedPages = accessBlock.getAvailableSlots(accSerial, pageSize); + newPointer = accessBlock.create(accSerial, chunkSize); + } + accessBlock.destroy(accSerial, newPointer); 
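+            // At this point, newPointer shares its page with at least one other live allocation: the loop above
+            // only terminates once a create() no longer consumed a previously untouched page. Destroying it must
+            // therefore not release the page, so the number of completely free pages stays unchanged: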
+ CHECK(accessBlock.getAvailableSlots(accSerial, pageSize) == unOccupiedPages); + } + + SECTION("one slot without touching the others.") + { + // this won't be touched: + accessBlock.create(accSerial, chunkSize); + auto originalSlots = accessBlock.getAvailableSlots(accSerial, chunkSize); + accessBlock.destroy(accSerial, pointer); + CHECK(accessBlock.getAvailableSlots(accSerial, chunkSize) == originalSlots + 1U); + } + + SECTION("no invalid pointer but throws instead.") + { +#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) + pointer = nullptr; + CHECK_THROWS( + accessBlock.destroy(accSerial, pointer), + std::runtime_error{"Attempted to destroy an invalid pointer!"}); +#endif // NDEBUG + } + + SECTION("pointer for larger than page size") + { + if(accessBlock.numPages() > 1U) + { + accessBlock.destroy(accSerial, pointer); + REQUIRE(accessBlock.getAvailableSlots(accSerial, pageSize) == accessBlock.numPages()); + + pointer = accessBlock.create(accSerial, 2U * pageSize); + REQUIRE(accessBlock.getAvailableSlots(accSerial, pageSize) == accessBlock.numPages() - 2); + REQUIRE(accessBlock.isValid(accSerial, pointer)); + + accessBlock.destroy(accSerial, pointer); + + SECTION("thereby invalidating it.") + { + CHECK(not accessBlock.isValid(accSerial, pointer)); + } + + SECTION("thereby freeing up their pages.") + { + CHECK(accessBlock.getAvailableSlots(accSerial, pageSize) == accessBlock.numPages()); + } + } + } + + SECTION("and doesn't reset the page.") + { + auto& unresettingAccessBlock = *reinterpret_cast< + TestableAccessBlock, AlignmentPolicy>*>( + &accessBlock); + auto const differentChunkSize = GENERATE(17, 2048); + REQUIRE(differentChunkSize != chunkSize); + auto const slots = unresettingAccessBlock.getAvailableSlots(accSerial, differentChunkSize); + + unresettingAccessBlock.destroy(accSerial, pointer); + CHECK(unresettingAccessBlock.getAvailableSlots(accSerial, differentChunkSize) == slots); + } + + SECTION("and always resets the page for larger than page size.") + { + auto& unresettingAccessBlock = *reinterpret_cast< + TestableAccessBlock, AlignmentPolicy>*>( + &accessBlock); + auto const differentChunkSize = GENERATE(17, 2048); + auto const slots = unresettingAccessBlock.getAvailableSlots(accSerial, differentChunkSize); + auto* largePointer = accessBlock.create(accSerial, pageSize); + if(largePointer != nullptr) + { + REQUIRE(differentChunkSize != chunkSize); + + unresettingAccessBlock.destroy(accSerial, largePointer); + CHECK(unresettingAccessBlock.getAvailableSlots(accSerial, differentChunkSize) == slots); + } + } + } +} diff --git a/tests/unit/BitField.cpp b/tests/unit/BitField.cpp new file mode 100644 index 00000000..e7912891 --- /dev/null +++ b/tests/unit/BitField.cpp @@ -0,0 +1,247 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "mallocMC/mallocMC_utils.hpp" +#include "mocks.hpp" + +#include +#include +#include + +#include +#include +#include +#include + +#include + +using mallocMC::CreationPolicies::FlatterScatterAlloc::BitFieldFlatImpl; +using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMaskImpl; + +using BitMaskSizes = std::tuple< + std::integral_constant, // NOLINT(*magic-number*) + std::integral_constant, // NOLINT(*magic-number*) + std::integral_constant>; // NOLINT(*magic-number*) + +TEMPLATE_LIST_TEST_CASE("BitMask", "", BitMaskSizes) +{ + constexpr uint32_t const BitMaskSize = TestType::value; + using BitMask = BitMaskImpl; + BitMask mask{}; + + SECTION("is initialised to 0.") + { + CHECK(mask == 0U); + } + + SECTION("can have individual bits read.") + { + for(uint32_t i = 0; i < BitMaskSize; ++i) + { + CHECK(mask(accSerial, i) == false); + } + } + + SECTION("allows to write individual bits.") + { + for(uint32_t i = 0; i < BitMaskSize; ++i) + { + mask.set(accSerial, i); + CHECK(mask(accSerial, i)); + } + } + + SECTION("allows to unset individual bits afterwards.") + { + for(uint32_t i = 0; i < BitMaskSize; ++i) + { + mask.set(accSerial, i); + for(uint32_t j = 0; j < BitMaskSize; ++j) + { + CHECK(mask(accSerial, j) == (i == j)); + } + mask.unset(accSerial, i); + } + } + + + SECTION("knows the first free bit.") + { + mask.flip(accSerial); + uint32_t const index = GENERATE(0, 3); + mask.flip(accSerial, index); + CHECK(mask.firstFreeBit(accSerial, BitMaskSize) == index); + } + + SECTION("returns BitMaskSize as first free bit if there is none.") + { + mask.flip(accSerial); + CHECK(mask.firstFreeBit(accSerial, BitMaskSize) == BitMaskSize); + } + + SECTION("knows the first free bit with startIndex.") + { + mask.set(accSerial); + uint32_t index1 = GENERATE(0, 5); + uint32_t index2 = GENERATE(0, 11); + if(index1 > index2) + { + std::swap(index1, index2); + } + uint32_t const startIndex = GENERATE(0, 4, 5, 6); + mask.unset(accSerial, index1); + mask.unset(accSerial, index2); + // This is the currently implemented algorithm and could be considered overspecifying the result. + // The minimal requirement we should have is that firstFreeBit is an element of {index1, index2}. + CHECK(mask.firstFreeBit(accSerial, BitMaskSize, startIndex) == ((startIndex == index2) ? index2 : index1)); + } +} + +TEMPLATE_LIST_TEST_CASE("BitFieldFlat", "", BitMaskSizes) +{ + constexpr uint32_t const BitMaskSize = TestType::value; + using BitMask = BitMaskImpl; + using BitFieldFlat = BitFieldFlatImpl; + + // This is potentially larger than we actually need but that's okay: + constexpr uint32_t const numChunks = 256U; + constexpr uint32_t const numMasks = mallocMC::ceilingDivision(numChunks, BitMaskSize); + BitMask data[numMasks]; + + SECTION("knows its only free bit.") + { + uint32_t const index = GENERATE(0, 1, numChunks / 2, numChunks - 1); + for(auto& mask : data) + { + mask.set(accSerial); + } + data[index / BitMaskSize].unset(accSerial, index % BitMaskSize); + + // Just to be sure: The masks look as expected. 
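+        // As a concrete sketch (taking BitMaskSize == 16 from the template list and index == numChunks / 2 == 128):
+        // mask 128 / 16 == 8 has bit 128 % 16 == 0 cleared while all other bits in all masks are set, so the
+        // flat index of the only free bit is 8 * 16 + 0 == 128.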
+ for(uint32_t j = 0; j < numMasks; ++j) + { + for(uint32_t i = 0; i < BitMaskSize; ++i) + { + REQUIRE(data[j](accSerial, i) == (j * BitMaskSize + i != index)); + } + } + + BitFieldFlat field{data}; + + CHECK(field.firstFreeBit(accSerial, numChunks) == index); + } + + SECTION("knows a free bit if later ones are free, too.") + { + uint32_t const index = GENERATE(0, 1, numChunks / 2, numChunks - 1); + for(auto& mask : std::span{static_cast(data), index / BitMaskSize}) + { + mask.set(accSerial); + } + for(uint32_t i = 0; i < index % BitMaskSize; ++i) + { + data[index / BitMaskSize].set(accSerial, i); + } + + BitFieldFlat field{data}; + + CHECK(field.firstFreeBit(accSerial, numChunks) >= index); + } + + SECTION("knows its first free bit for different numChunks.") + { + auto localNumChunks = numChunks / GENERATE(1, 2, 3); + std::span localData{static_cast(data), mallocMC::ceilingDivision(localNumChunks, BitMaskSize)}; + uint32_t const index = GENERATE(0, 1, 10, 12); + for(auto& mask : localData) + { + mask.set(accSerial); + } + localData[index / BitMaskSize].unset(accSerial, index % BitMaskSize); + + BitFieldFlat field{localData}; + + CHECK(field.firstFreeBit(accSerial, numChunks) == index); + } + + SECTION("sets a bit.") + { + BitFieldFlat field{data}; + uint32_t const index = GENERATE(0, 1, numChunks / 2, numChunks - 1); + field.set(accSerial, index); + for(uint32_t i = 0; i < numChunks; ++i) + { + CHECK(field.get(accSerial, i) == (i == index)); + } + } + + SECTION("sets two bits.") + { + BitFieldFlat field{data}; + uint32_t const firstIndex = GENERATE(0, 1, numChunks / 2, numChunks - 1); + uint32_t const secondIndex = GENERATE(2, numChunks / 3, numChunks / 2, numChunks - 1); + field.set(accSerial, firstIndex); + field.set(accSerial, secondIndex); + for(uint32_t i = 0; i < numChunks; ++i) + { + CHECK(field.get(accSerial, i) == (i == firstIndex || i == secondIndex)); + } + } + + SECTION("returns numChunks if no free bit is found.") + { + BitFieldFlat field{data}; + for(uint32_t i = 0; i < numChunks; ++i) + { + field.set(accSerial, i); + } + CHECK(field.firstFreeBit(accSerial, numChunks) == numChunks); + } + + SECTION("returns numChunks if free bit is not valid.") + { + BitFieldFlat field{data}; + uint32_t const numValidBits = GENERATE(1, numChunks / 2, numChunks - 1); + for(uint32_t i = 0; i < numValidBits; ++i) + { + // We are filling up all valid bits. + field.set(accSerial, i); + } + CHECK(field.firstFreeBit(accSerial, numValidBits) == numChunks); + } + + SECTION("returns numChunks if free bit is not valid.") + { + BitFieldFlat field{data}; + uint32_t const numValidBits = GENERATE(1, numChunks / 2, numChunks - 1); + for(uint32_t i = 0; i < numValidBits; ++i) + { + // We are filling up all valid bits. + field.set(accSerial, i); + } + CHECK(field.firstFreeBit(accSerial, numValidBits) == numChunks); + } +} diff --git a/tests/unit/PageInterpretation.cpp b/tests/unit/PageInterpretation.cpp new file mode 100644 index 00000000..21d6c598 --- /dev/null +++ b/tests/unit/PageInterpretation.cpp @@ -0,0 +1,316 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. 
+ + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp" + +#include "mallocMC/creationPolicies/FlatterScatter/BitField.hpp" +#include "mallocMC/creationPolicies/FlatterScatter/DataPage.hpp" +#include "mallocMC/mallocMC_utils.hpp" +#include "mocks.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMask; +using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMaskSize; +using mallocMC::CreationPolicies::FlatterScatterAlloc::DataPage; +using mallocMC::CreationPolicies::FlatterScatterAlloc::PageInterpretation; +using std::distance; + +template +constexpr std::array const + chunkSizesForReportingTests{1, 2, 4, 5, 10, 11, 31, 32, 512}; // NOLINT(*magic-number*) + +template +constexpr std::array const expectedNumChunksForReportingTests{}; + +template<> +constexpr std::array const + expectedNumChunksForReportingTests<32U>{908, 480, 248, 199, 100, 92, 32, 31, 1}; // NOLINT(*magic-number*) + +template<> +constexpr std::array const + expectedNumChunksForReportingTests<64U>{904, 480, 248, 198, 100, 91, 32, 31, 1}; // NOLINT(*magic-number*) + +TEST_CASE("PageInterpretation") +{ + constexpr uint32_t const pageSize = 1024U; + constexpr uint32_t const chunkSize = 32U; + DataPage data{}; + PageInterpretation page{data, chunkSize}; + + SECTION("refers to the same data it was created with.") + { + CHECK(&data == page.chunkPointer(0)); + } + + SECTION("returns start of data as first chunk.") + { + CHECK(page.chunkPointer(0) == &data); + } + + SECTION("computes correct number of chunks.") + { + for(uint32_t i = 0U; i < chunkSizesForReportingTests.size(); ++i) + { + CHECK( + PageInterpretation::numChunks(chunkSizesForReportingTests[i]) + == expectedNumChunksForReportingTests[i]); + } + } + + SECTION("jumps by chunkSize between indices.") + { + for(auto i = 0U; i < (pageSize / chunkSize) - 1; ++i) + { + CHECK( + distance( + reinterpret_cast(page.chunkPointer(i)), + reinterpret_cast(page.chunkPointer(i + 1))) + == chunkSize); + } + } + + SECTION("knows the maximal bit field size.") + { + CHECK( + page.maxBitFieldSize() + == mallocMC::ceilingDivision(PageInterpretation::numChunks(1U), BitMaskSize) + * sizeof(BitMask)); + CHECK( + PageInterpretation::maxBitFieldSize() + 
== mallocMC::ceilingDivision(PageInterpretation::numChunks(32U), BitMaskSize) + * sizeof(BitMask)); + CHECK( + PageInterpretation::maxBitFieldSize() + == mallocMC::ceilingDivision(PageInterpretation::numChunks(16U), BitMaskSize) + * sizeof(BitMask)); + CHECK( + PageInterpretation::maxBitFieldSize() + == mallocMC::ceilingDivision(PageInterpretation::numChunks(17U), BitMaskSize) + * sizeof(BitMask)); + } + + SECTION("reports numChunks that fit the page.") + { + CHECK( + page.numChunks() * chunkSize + + static_cast(mallocMC::ceilingDivision(page.numChunks(), BitMaskSize) * sizeof(BitMask)) + <= pageSize); + } + + SECTION("knows correct bit field size.") + { + uint32_t const numChunks = GENERATE(2, BitMaskSize - 1, BitMaskSize, 2 * BitMaskSize); + uint32_t localChunkSize = pageSize / numChunks; + PageInterpretation localPage{data, localChunkSize}; + CHECK(localPage.bitFieldSize() == sizeof(BitMask) * mallocMC::ceilingDivision(numChunks, BitMaskSize)); + } +} + +TEST_CASE("PageInterpretation.create") +{ + // Such that we can fit up to four levels of hierarchy in there: + constexpr uint32_t const pageSize + = BitMaskSize * BitMaskSize * BitMaskSize + static_cast(BitMaskSize * sizeof(BitMask)); + // This might be a lot of memory up to a typical stack's size. Let's save us some trouble and create it on the + // heap. + auto actualData = std::make_unique>(); + DataPage& data{*actualData}; + + uint32_t numChunks = GENERATE(BitMaskSize, BitMaskSize * BitMaskSize); + // CAUTION: Only works for full bit masks: + uint32_t chunkSize = (pageSize - (numChunks / BitMaskSize) * sizeof(BitMask)) / numChunks; + PageInterpretation page{data, chunkSize}; + + SECTION("returns a pointer to within the data.") + { + auto* pointer = page.create(accSerial); + CHECK( + std::distance(reinterpret_cast(page.chunkPointer(0)), reinterpret_cast(pointer)) + < std::distance( + reinterpret_cast(page.chunkPointer(0)), + reinterpret_cast(page.bitFieldStart()))); + } + + SECTION("returns a pointer to the start of a chunk.") + { + auto* pointer = page.create(accSerial); + CHECK( + std::distance(reinterpret_cast(page.chunkPointer(0)), reinterpret_cast(pointer)) % chunkSize + == 0U); + } + + SECTION("returns nullptr if everything is full.") + { + for(auto& mask : page.bitField()) + { + mask.set(accSerial); + } + auto* pointer = page.create(accSerial); + CHECK(pointer == nullptr); + } + + SECTION("can provide numChunks pieces of memory and returns nullptr afterwards.") + { + for(uint32_t i = 0; i < page.numChunks(); ++i) + { + auto* pointer = page.create(accSerial); + CHECK(pointer != nullptr); + } + auto* pointer = page.create(accSerial); + CHECK(pointer == nullptr); + } + + SECTION("updates bit field.") + { + BitMask& mask{page.bitField().getMask(0)}; + REQUIRE(mask.none()); + auto* pointer = page.create(accSerial); + auto const index = page.chunkNumberOf(pointer); + CHECK(mask(accSerial, index)); + } +} + +TEST_CASE("PageInterpretation.destroy") +{ + // Such that we can fit up to four levels of hierarchy in there: + constexpr uint32_t const pageSize + = BitMaskSize * BitMaskSize * BitMaskSize * BitMaskSize + + BitMaskSize * BitMaskSize * BitMaskSize * static_cast(sizeof(BitMask)); + // This is more than 8MB which is a typical stack's size. Let's save us some trouble and create it on the heap. 
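+    // Back-of-the-envelope (assuming BitMaskSize == 64 and sizeof(BitMask) == 8): 64^4 + 64^3 * 8 bytes is
+    // 16 MiB + 2 MiB == 18 MiB, comfortably above a typical 8 MiB stack limit.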
+ std::unique_ptr> actualData{new DataPage}; + DataPage& data{*actualData}; + + uint32_t numChunks = GENERATE(BitMaskSize * BitMaskSize, BitMaskSize); + uint32_t chunkSize = pageSize / numChunks; + PageInterpretation page{data, chunkSize}; + auto* pointer = page.create(accSerial); + +#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP) + SECTION("throws if given an invalid pointer.") + { + pointer = nullptr; + CHECK_THROWS( + page.destroy(accSerial, pointer), + throw std::runtime_error{"Attempted to destroy an invalid pointer! Either the pointer does not point " + "to a valid chunk or it is not marked as allocated."}); + } + + SECTION("allows pointers to anywhere in the chunk.") + { + // This test documents the state as is. We haven't defined this outcome as a requirement but if we change + // it, we might still want to be aware of this because users might want to be informed. + pointer = reinterpret_cast(reinterpret_cast(pointer) + chunkSize / 2); + CHECK_NOTHROW(page.destroy(accSerial, pointer)); + } +#endif // NDEBUG + + SECTION("only ever unsets (and never sets) bits in top-level bit mask.") + { + // We extract the position of the mask before destroying the pointer because technically speaking the whole + // concept of a mask doesn't apply anymore after that pointer was destroyed because that will automatically + // free the page. + auto mask = page.bitField().getMask(0); + auto value = mask; + page.destroy(accSerial, pointer); + CHECK(mask <= value); + } + + + SECTION("cleans up in bit field region of page") + { + // This is larger than any thread would be allowed to write. Threads would only write in the region up to + // `page.numChunks() * chunkSize` not up until `pageSize`. We still do that to have a better overview over + // what was actually deleted. + memset(std::begin(data.data), std::numeric_limits::max(), pageSize); + + uint32_t maxBitFieldSize = 0U; + uint32_t uncleanedSize = 0U; + SECTION("without explicit minimal chunk size") + { + maxBitFieldSize = page.maxBitFieldSize(); // NOLINT(*static*) + + SECTION("fully.") + { + uncleanedSize = 0U; + page.cleanupFull(); + } + + SECTION("only unused.") + { + uncleanedSize = page.bitFieldSize(); + page.cleanupUnused(); + } + } + + SECTION("with explicit minimal chunk size") + { + auto* localPage = reinterpret_cast*>(&page); // NOLINT(*magic-number*) + maxBitFieldSize = localPage->maxBitFieldSize(); // NOLINT(*static*) + + SECTION("fully.") + { + uncleanedSize = 0U; + localPage->cleanupFull(); + } + + SECTION("only unused.") + { + uncleanedSize = localPage->bitFieldSize(); + localPage->cleanupUnused(); + } + } + + for(uint32_t i = 0; i < pageSize; ++i) + { + CHECK( + data.data[i] + == ((i < pageSize - maxBitFieldSize) or (i >= pageSize - uncleanedSize) + ? std::numeric_limits::max() + : 0)); + } + } +} + +// NOLINTEND(*widening*) diff --git a/tests/unit/PageTable.cpp b/tests/unit/PageTable.cpp new file mode 100644 index 00000000..b0ea806c --- /dev/null +++ b/tests/unit/PageTable.cpp @@ -0,0 +1,54 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. 
+ + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp" + +#include + +using mallocMC::CreationPolicies::FlatterScatterAlloc::PageTable; + +constexpr uint32_t const numPages = 3; + +TEST_CASE("PageTable") +{ + PageTable pageTable{}; + + SECTION("initialises chunk sizes to 0.") + { + for(auto const& chunkSize : pageTable.chunkSizes) + { + CHECK(chunkSize == 0U); + } + } + + SECTION("initialises filling levels to 0.") + { + for(auto const& fillingLevel : pageTable.fillingLevels) + { + CHECK(fillingLevel == 0U); + } + } +} diff --git a/tests/unit/mocks.hpp b/tests/unit/mocks.hpp new file mode 100644 index 00000000..b1764d13 --- /dev/null +++ b/tests/unit/mocks.hpp @@ -0,0 +1,76 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Johannes Lenz + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +// This is very hacky: AccCpuSerial (and in general all Accellerators) are very reluctant to be instantiated, so we do +// it the oldschool way and simply malloc some memory pretending to be that accellerator. Let's hope that null-ing it +// is a valid initialisation. 
The final class only has one mutable data member, so that's probably not half bad but I +// didn't go through all those hundreds of base classes. Usually, we only need the time anyways. +inline auto constructAcc() +{ + using Acc = alpaka::AccCpuSerial, size_t>; + void* myPointer = malloc(sizeof(Acc)); + memset(myPointer, 0U, sizeof(Acc)); + return static_cast(myPointer); +} + +// +static inline auto const accPointer = constructAcc(); +static inline auto const& accSerial = *accPointer; + +template +struct HeapConfig +{ + static constexpr auto const accessblocksize = T_blockSize; + static constexpr auto const pagesize = T_pageSize; + static constexpr auto const wastefactor = T_wasteFactor; + static constexpr auto const resetfreedpages = T_resetfreedpages; + + ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr static auto isInAllowedRange( + auto const& /*acc*/, + uint32_t const chunkSize, + uint32_t const numBytes) + { + return (chunkSize >= numBytes && chunkSize <= T_wasteFactor * numBytes); + } +}; + +struct AlignmentPolicy +{ + struct Properties + { + static constexpr uint32_t const dataAlignment = 1U; + }; +}; diff --git a/tests/verify_heap.cpp b/tests/verify_heap.cpp deleted file mode 100644 index 7d696a10..00000000 --- a/tests/verify_heap.cpp +++ /dev/null @@ -1,734 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - https://www.hzdr.de/crp - - Copyright 2014 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
-*/ - -// each pointer in the datastructure will point to this many -// elements of type allocElem_t -constexpr auto ELEMS_PER_SLOT = 750; - -#include "verify_heap_config.hpp" - -#include - -#include - -#include -#include -#include -#include -#include -#include - -using Device = alpaka::Dev; -using Queue = alpaka::Queue; - -// global variable for verbosity, might change due to user input '--verbose' -bool verbose = false; - -// the type of the elements to allocate -using allocElem_t = unsigned long long; - -auto run_heap_verification(size_t const, unsigned const, unsigned, bool const) -> bool; -void parse_cmdline(int const, char**, size_t*, unsigned*, unsigned*, bool*); -void print_help(char**); - -// used to create an empty stream for non-verbose output -struct nullstream : std::ostream -{ - nullstream() : std::ostream(0) - { - } -}; - -// uses global verbosity to switch between std::cout and a nullptr-output -auto dout() -> std::ostream& -{ - static nullstream n; - return verbose ? std::cout : n; -} - -// define some defaults -static constexpr unsigned threads_default = 128; -static constexpr unsigned blocks_default = 64; -static constexpr size_t heapInMB_default = 1024; // 1GB - -/** - * will do a basic verification of scatterAlloc. - * - * @param argv if -q or --quiet is supplied as a - * command line argument, verbosity will be reduced - * - * @return will return 0 if the verification was successful, - * otherwise returns 1 - */ -auto main(int argc, char** argv) -> int -{ - bool machine_readable = false; - size_t heapInMB = heapInMB_default; - unsigned threads = threads_default; - unsigned blocks = blocks_default; - - parse_cmdline(argc, argv, &heapInMB, &threads, &blocks, &machine_readable); - - auto const correct = run_heap_verification(heapInMB, threads, blocks, machine_readable); - if(!machine_readable || verbose) - { - if(correct) - { - std::cout << "\033[0;32mverification successful \033[0m\n"; - return 0; - } - else - { - std::cerr << "\033[0;31mverification failed\033[0m\n"; - return 1; - } - } -} - -/** - * will parse command line arguments - * - * for more details, see print_help() - * - * @param argc argc from main() - * @param argv argv from main() - * @param heapInMP will be filled with the heapsize, if given as a parameter - * @param threads will be filled with number of threads, if given as a parameter - * @param blocks will be filled with number of blocks, if given as a parameter - */ -void parse_cmdline( - int const argc, - char** argv, - size_t* heapInMB, - unsigned* threads, - unsigned* blocks, - bool* machine_readable) -{ - std::vector> parameters; - - // Parse Commandline, tokens are shaped like ARG=PARAM or ARG - // This requires to use '=', if you want to supply a value with a parameter - for(int i = 1; i < argc; ++i) - { - char* pos = strtok(argv[i], "="); - std::pair p(std::string(pos), std::string("")); - pos = strtok(nullptr, "="); - if(pos != nullptr) - { - p.second = std::string(pos); - } - parameters.push_back(p); - } - - // go through all parameters that were found - for(unsigned i = 0; i < parameters.size(); ++i) - { - std::pair p = parameters.at(i); - - if(p.first == "-v" || p.first == "--verbose") - { - verbose = true; - } - - if(p.first == "--threads") - { - *threads = atoi(p.second.c_str()); - } - - if(p.first == "--blocks") - { - *blocks = atoi(p.second.c_str()); - } - - if(p.first == "--heapsize") - { - *heapInMB = size_t(atoi(p.second.c_str())); - } - - if(p.first == "-h" || p.first == "--help") - { - print_help(argv); - exit(0); - } - - 
if(p.first == "-m" || p.first == "--machine_readable") - { - *machine_readable = true; - } - } -} - -/** - * prints a helpful message about program use - * - * @param argv the argv-parameter from main, used to find the program name - */ -void print_help(char** argv) -{ - std::stringstream s; - - s << "SYNOPSIS:" << '\n'; - s << argv[0] << " [OPTIONS]" << '\n'; - s << "" << '\n'; - s << "OPTIONS:" << '\n'; - s << " -h, --help" << '\n'; - s << " Print this help message and exit" << '\n'; - s << "" << '\n'; - s << " -v, --verbose" << '\n'; - s << " Print information about parameters and progress" << '\n'; - s << "" << '\n'; - s << " -m, --machine_readable" << '\n'; - s << " Print all relevant parameters as CSV. This will" << '\n'; - s << " suppress all other output unless explicitly" << '\n'; - s << " requested with --verbose or -v" << '\n'; - s << "" << '\n'; - s << " --threads=N" << '\n'; - s << " Set the number of threads per block (default "; - s << threads_default << "128)" << '\n'; - s << "" << '\n'; - s << " --blocks=N" << '\n'; - s << " Set the number of blocks in the grid (default "; - s << blocks_default << ")" << '\n'; - s << "" << '\n'; - s << " --heapsize=N" << '\n'; - s << " Set the heapsize to N Megabyte (default "; - s << heapInMB_default << "1024)" << '\n'; - - std::cout << s.str() << std::flush; -} - -/** - * checks validity of memory for each single cell - * - * checks on a per thread basis, if the values written during - * allocation are still the same. Also calculates the sum over - * all allocated values for a more in-depth verification that - * could be done on the host - * - * @param data the data to verify - * @param counter should be initialized with 0 and will - * be used to count how many verifications were - * already done - * @param globalSum will be filled with the sum over all - * allocated values in the structure - * @param nSlots the size of the datastructure - * @param correct should be initialized with 1. - * Will change to 0, if there was a value that didn't match - */ -struct Check_content -{ - ALPAKA_FN_ACC void operator()( - Acc const& acc, - allocElem_t** data, - unsigned long long* counter, - unsigned long long* globalSum, - size_t const nSlots, - int* correct) const - { - unsigned long long sum = 0; - while(true) - { - size_t const pos = alpaka::atomicOp(acc, counter, 1ull); - if(pos >= nSlots) - { - break; - } - size_t const offset = pos * ELEMS_PER_SLOT; - for(size_t i = 0; i < ELEMS_PER_SLOT; ++i) - { - if(static_cast(data[pos][i]) != static_cast(offset + i)) - { - // printf("\nError in Kernel: data[%llu][%llu] is %#010x - // (should be %#010x)\n", - // pos,i,static_cast(data[pos][i]),allocElem_t(offset+i)); - alpaka::atomicOp(acc, correct, 0); - } - sum += static_cast(data[pos][i]); - } - } - alpaka::atomicOp(acc, globalSum, sum); - } -}; - -/** - * checks validity of memory for each single cell - * - * checks on a per thread basis, if the values written during - * allocation are still the same. - * - * @param data the data to verify - * @param counter should be initialized with 0 and will - * be used to count how many verifications were - * already done - * @param nSlots the size of the datastructure - * @param correct should be initialized with 1. 
- * Will change to 0, if there was a value that didn't match - */ -struct Check_content_fast -{ - ALPAKA_FN_ACC void operator()( - Acc const& acc, - allocElem_t** data, - unsigned long long* counter, - size_t const nSlots, - int* correct) const - { - int c = 1; - while(true) - { - size_t pos = alpaka::atomicOp(acc, counter, 1ull); - if(pos >= nSlots) - { - break; - } - size_t const offset = pos * ELEMS_PER_SLOT; - for(size_t i = 0; i < ELEMS_PER_SLOT; ++i) - { - if(static_cast(data[pos][i]) != static_cast(offset + i)) - { - c = 0; - } - } - } - alpaka::atomicOp(acc, correct, c); - } -}; - -/** - * allocate a lot of small arrays and fill them - * - * Each array has the size ELEMS_PER_SLOT and the type allocElem_t. - * Each element will be filled with a number that is related to its - * position in the datastructure. - * - * @param data the datastructure to allocate - * @param counter should be initialized with 0 and will - * hold, how many allocations were done - * @param globalSum will hold the sum of all values over all - * allocated structures (for verification purposes) - */ -struct AllocAll -{ - ALPAKA_FN_ACC void operator()( - Acc const& acc, - allocElem_t** data, - unsigned long long* counter, - unsigned long long* globalSum, - ScatterAllocator::AllocatorHandle mMC) const - { - unsigned long long sum = 0; - while(true) - { - allocElem_t* p = (allocElem_t*) mMC.malloc(acc, sizeof(allocElem_t) * ELEMS_PER_SLOT); - if(p == nullptr) - break; - - size_t pos = alpaka::atomicOp(acc, counter, 1ull); - size_t const offset = pos * ELEMS_PER_SLOT; - for(size_t i = 0; i < ELEMS_PER_SLOT; ++i) - { - p[i] = static_cast(offset + i); - sum += static_cast(p[i]); - } - data[pos] = p; - } - - alpaka::atomicOp(acc, globalSum, sum); - } -}; - -/** - * free all the values again - * - * @param data the datastructure to free - * @param counter should be an empty space on device memory, - * counts how many elements were freed - * @param max the maximum number of elements to free - */ -struct DeallocAll -{ - ALPAKA_FN_ACC void operator()( - Acc const& acc, - allocElem_t** data, - unsigned long long* counter, - size_t const nSlots, - ScatterAllocator::AllocatorHandle mMC) const - { - while(true) - { - size_t pos = alpaka::atomicOp(acc, counter, 1ull); - if(pos >= nSlots) - break; - mMC.free(acc, data[pos]); - } - } -}; - -/** - * damages one element in the data - * - * With help of this function, you can verify that - * the checks actually work as expected and can find - * an error, if one should exist - * - * @param data the datastructure to damage - */ -struct DamageElement -{ - ALPAKA_FN_ACC void operator()(Acc const& acc, allocElem_t** data) const - { - data[1][0] = static_cast(5 * ELEMS_PER_SLOT - 1); - } -}; - -/** - * wrapper function to allocate memory on device - * - * allocates memory with mallocMC. 
Returns the number of - * created elements as well as the sum of these elements - * - * @param d_testData the datastructure which will hold - * pointers to the created elements - * @param h_nSlots will be filled with the number of elements - * that were allocated - * @param h_sum will be filled with the sum of all elements created - * @param blocks the size of the CUDA grid - * @param threads the number of CUDA threads per block - */ -void allocate( - Device const& dev, - Queue& queue, - alpaka::Buf& d_testData, - unsigned long long* nSlots, - unsigned long long* sum, - unsigned const blocks, - unsigned const threads, - ScatterAllocator& mMC) -{ - dout() << "allocating on device..."; - - auto d_sum = alpaka::allocBuf(dev, Idx{1}); - auto d_nSlots = alpaka::allocBuf(dev, Idx{1}); - - alpaka::memset(queue, d_sum, 0, 1); - alpaka::memset(queue, d_nSlots, 0, 1); - - auto const workDiv = alpaka::WorkDivMembers{Idx{blocks}, Idx{threads}, Idx{1}}; - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - workDiv, - AllocAll{}, - alpaka::getPtrNative(d_testData), - alpaka::getPtrNative(d_nSlots), - alpaka::getPtrNative(d_sum), - mMC.getAllocatorHandle())); - - auto const platform = alpaka::Platform{}; - auto const hostDev = alpaka::getDevByIdx(platform, 0); - - auto h_sum = alpaka::allocBuf(hostDev, Idx{1}); - auto h_nSlots = alpaka::allocBuf(hostDev, Idx{1}); - - alpaka::memcpy(queue, h_sum, d_sum, Idx{1}); - alpaka::memcpy(queue, h_nSlots, d_nSlots, Idx{1}); - alpaka::wait(queue); - - *sum = *alpaka::getPtrNative(h_sum); - *nSlots = *alpaka::getPtrNative(h_nSlots); - - dout() << "done\n"; -} - -/** - * Wrapper function to verify allocation on device - * - * Generates the same number that was written into each position of - * the datastructure during allocation and compares the values. - * - * @param d_testData the datastructure which holds - * pointers to the elements you want to verify - * @param nSlots the size of d_testData - * @param blocks the size of the CUDA grid - * @param threads the number of CUDA threads per block - * @return true if the verification was successful, false otherwise - */ -auto verify( - Device const& dev, - Queue& queue, - alpaka::Buf& d_testData, - unsigned long long const nSlots, - unsigned const blocks, - unsigned const threads) -> bool -{ - dout() << "verifying on device... "; - - auto const platform = alpaka::Platform{}; - auto const hostDev = alpaka::getDevByIdx(platform, 0); - - auto h_correct = alpaka::allocBuf(hostDev, Idx{1}); - *alpaka::getPtrNative(h_correct) = 1; - - auto d_sum = alpaka::allocBuf(dev, Idx{1}); - auto d_counter = alpaka::allocBuf(dev, Idx{1}); - auto d_correct = alpaka::allocBuf(dev, Idx{1}); - - alpaka::memset(queue, d_sum, 0, 1); - alpaka::memset(queue, d_counter, 0, 1); - alpaka::memcpy(queue, d_correct, h_correct, 1); - - // can be replaced by a call to check_content_fast, - // if the gaussian sum (see below) is not used and you - // want to be a bit faster - auto const workDiv = alpaka::WorkDivMembers{Idx{blocks}, Idx{threads}, Idx{1}}; - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - workDiv, - Check_content{}, - alpaka::getPtrNative(d_testData), - alpaka::getPtrNative(d_counter), - alpaka::getPtrNative(d_sum), - static_cast(nSlots), - alpaka::getPtrNative(d_correct))); - - alpaka::memcpy(queue, h_correct, d_correct, 1); - alpaka::wait(queue); - - auto const correct = *alpaka::getPtrNative(h_correct); - dout() << (correct ? 
"done\n" : "failed\n"); - return correct != 0; -} - -/** - * prints all parameters machine readable - * - * for params, see run_heap_verification-internal parameters - */ -void print_machine_readable( - unsigned const pagesize, - unsigned const accessblocks, - unsigned const regionsize, - unsigned const wastefactor, - bool const resetfreedpages, - unsigned const blocks, - unsigned const threads, - unsigned const elemsPerSlot, - size_t const allocElemSize, - size_t const heapSize, - size_t const maxSpace, - size_t const maxSlots, - unsigned long long const usedSlots, - float const allocFrac, - size_t const wasted, - bool const correct) -{ - std::string sep = ","; - std::stringstream h; - std::stringstream v; - - h << "PagesizeByte" << sep; - v << pagesize << sep; - - h << "Accessblocks" << sep; - v << accessblocks << sep; - - h << "Regionsize" << sep; - v << regionsize << sep; - - h << "Wastefactor" << sep; - v << wasted << sep; - - h << "ResetFreedPage" << sep; - v << resetfreedpages << sep; - - h << "Gridsize" << sep; - v << blocks << sep; - - h << "Blocksize" << sep; - v << threads << sep; - - h << "ELEMS_PER_SLOT" << sep; - v << elemsPerSlot << sep; - - h << "allocElemByte" << sep; - v << allocElemSize << sep; - - h << "heapsizeByte" << sep; - v << heapSize << sep; - - h << "maxSpaceByte" << sep; - v << maxSpace << sep; - - h << "maxSlots" << sep; - v << maxSlots << sep; - - h << "usedSlots" << sep; - v << usedSlots << sep; - - h << "allocFraction" << sep; - v << allocFrac << sep; - - h << "wastedBytes" << sep; - v << wasted << sep; - - h << "correct"; - v << correct; - - std::cout << h.str() << '\n'; - std::cout << v.str() << '\n'; -} - -/** - * Verify the heap allocation of mallocMC - * - * Allocates as much memory as the heap allows. Make sure that allocated - * memory actually holds the correct values without corrupting them. Will - * fill the datastructure with values that are relative to the index and - * later evalute, if the values inside stayed the same after allocating all - * memory. - * Datastructure: Array that holds up to nPointers pointers to arrays of size - * ELEMS_PER_SLOT, each being of type allocElem_t. 
- * - * @return true if the verification was successful, - * false otherwise - */ -auto run_heap_verification(size_t const heapMB, unsigned const blocks, unsigned threads, bool const machine_readable) - -> bool -{ - auto const platform = alpaka::Platform{}; - auto const dev = alpaka::getDevByIdx(platform, 0); - auto queue = Queue{dev}; - - auto const devProps = alpaka::getAccDevProps(dev); - threads = std::min(static_cast(threads), static_cast(devProps.m_blockThreadCountMax)); - - size_t const heapSize = size_t(1024U * 1024U) * heapMB; - size_t const slotSize = sizeof(allocElem_t) * ELEMS_PER_SLOT; - size_t const nPointers = (heapSize + slotSize - 1) / slotSize; - size_t const maxSlots = heapSize / slotSize; - size_t const maxSpace = maxSlots * slotSize + nPointers * sizeof(allocElem_t*); - bool correct = true; - - dout() << "CreationPolicy Arguments:\n"; - dout() << "Pagesize: " << ScatterConfig::pagesize << '\n'; - dout() << "Accessblocksize " << ScatterConfig::accessblocksize << '\n'; - dout() << "Regionsize: " << ScatterConfig::regionsize << '\n'; - dout() << "Wastefactor: " << ScatterConfig::wastefactor << '\n'; - dout() << "ResetFreedPages " << ScatterConfig::resetfreedpages << '\n'; - dout() << "\n"; - dout() << "Gridsize: " << blocks << '\n'; - dout() << "Blocksize: " << threads << '\n'; - dout() << "Allocated elements: " << ELEMS_PER_SLOT << " x " << sizeof(allocElem_t); - dout() << " Byte (" << slotSize << " Byte)\n"; - dout() << "Heap: " << heapSize << " Byte"; - dout() << " (" << heapSize / pow(1024, 2) << " MByte)\n"; - dout() << "max space w/ pointers: " << maxSpace << " Byte"; - dout() << " (" << maxSpace / pow(1024, 2) << " MByte)\n"; - dout() << "maximum of elements: " << maxSlots << '\n'; - - unsigned long long usedSlots = 0; - unsigned long long sumAllocElems = 0; - float allocFrac = 0; - size_t wasted = 0; - - { - ScatterAllocator mMC(dev, queue, heapSize); - - // allocating with mallocMC - auto d_testData = alpaka::allocBuf(dev, Idx{nPointers}); - allocate(dev, queue, d_testData, &usedSlots, &sumAllocElems, blocks, threads, mMC); - - allocFrac = static_cast(usedSlots) * 100 / maxSlots; - wasted = heapSize - static_cast(usedSlots) * slotSize; - dout() << "allocated elements: " << usedSlots; - dout() << " (" << allocFrac << "%)\n"; - dout() << "wasted heap space: " << wasted << " Byte"; - dout() << " (" << wasted / pow(1024, 2) << " MByte)\n"; - - // verifying on device - correct = correct && verify(dev, queue, d_testData, usedSlots, blocks, threads); - - // damaging one cell - dout() << "damaging of element... "; - { - auto const workDiv = alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}; - alpaka::enqueue( - queue, - alpaka::createTaskKernel(workDiv, DamageElement{}, alpaka::getPtrNative(d_testData))); - } - dout() << "done\n"; - - // verifying on device - // THIS SHOULD FAIL (damage was done before!). Therefore, we must - // inverse the logic - correct = correct && !verify(dev, queue, d_testData, usedSlots, blocks, threads); - - // release all memory - dout() << "deallocation... 
"; - auto d_dealloc_counter = alpaka::allocBuf(dev, Idx{1}); - alpaka::memset(queue, d_dealloc_counter, 0, 1); - { - auto const workDiv = alpaka::WorkDivMembers{Idx{blocks}, Idx{threads}, Idx{1}}; - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - workDiv, - DeallocAll{}, - alpaka::getPtrNative(d_testData), - alpaka::getPtrNative(d_dealloc_counter), - static_cast(usedSlots), - mMC.getAllocatorHandle())); - } - } - - dout() << "done \n"; - - if(machine_readable) - { - print_machine_readable( - ScatterConfig::pagesize, - ScatterConfig::accessblocksize, - ScatterConfig::regionsize, - ScatterConfig::wastefactor, - ScatterConfig::resetfreedpages, - blocks, - threads, - ELEMS_PER_SLOT, - sizeof(allocElem_t), - heapSize, - maxSpace, - maxSlots, - usedSlots, - allocFrac, - wasted, - correct); - } - - return correct; -} diff --git a/tests/verify_heap_config.hpp b/tests/verify_heap_config.hpp deleted file mode 100644 index 1ca2b8bb..00000000 --- a/tests/verify_heap_config.hpp +++ /dev/null @@ -1,80 +0,0 @@ -/* - mallocMC: Memory Allocator for Many Core Architectures. - https://www.hzdr.de/crp - - Copyright 2014 Institute of Radiation Physics, - Helmholtz-Zentrum Dresden - Rossendorf - - Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
-*/ - -#pragma once - -#include -#include - -#include - -using Dim = alpaka::DimInt<1>; -using Idx = std::size_t; - -// Define the device accelerator -using Acc = alpaka::ExampleDefaultAcc; - -// configurate the CreationPolicy "Scatter" -struct ScatterConfig -{ - static constexpr auto pagesize = 4096; - static constexpr auto accessblocksize = 2u * 1024u * 1024u * 1024u; - static constexpr auto regionsize = 16; - static constexpr auto wastefactor = 2; - static constexpr auto resetfreedpages = false; -}; - -struct ScatterHashParams -{ - static constexpr auto hashingK = 38183; - static constexpr auto hashingDistMP = 17497; - static constexpr auto hashingDistWP = 1; - static constexpr auto hashingDistWPRel = 1; -}; - -// configure the DistributionPolicy "XMallocSIMD" -struct DistributionConfig -{ - static constexpr auto pagesize = ScatterConfig::pagesize; -}; - -// configure the AlignmentPolicy "Shrink" -struct AlignmentConfig -{ - static constexpr auto dataAlignment = 16; -}; - -// Define a new allocator and call it ScatterAllocator -// which resembles the behaviour of ScatterAlloc -using ScatterAllocator = mallocMC::Allocator< - Acc, - mallocMC::CreationPolicies::Scatter, - mallocMC::DistributionPolicies::Noop, - mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::AlpakaBuf, - mallocMC::AlignmentPolicies::Shrink>; From 10221ebdedbd7860f90e34620308b73b0a540c34 Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Tue, 12 Nov 2024 15:00:10 +0100 Subject: [PATCH 11/16] Add catch2 as submodule --- .gitmodules | 3 + thirdParty/catch2 | 1 + thirdParty/catch2/include/catch2/catch.hpp | 17966 ------------------- 3 files changed, 4 insertions(+), 17966 deletions(-) create mode 100644 .gitmodules create mode 160000 thirdParty/catch2 delete mode 100644 thirdParty/catch2/include/catch2/catch.hpp diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..8739c483 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "thirdParty/catch2"] + path = thirdParty/catch2 + url = https://github.com/catchorg/catch2 diff --git a/thirdParty/catch2 b/thirdParty/catch2 new file mode 160000 index 00000000..fa43b774 --- /dev/null +++ b/thirdParty/catch2 @@ -0,0 +1 @@ +Subproject commit fa43b77429ba76c462b1898d6cd2f2d7a9416b14 diff --git a/thirdParty/catch2/include/catch2/catch.hpp b/thirdParty/catch2/include/catch2/catch.hpp deleted file mode 100644 index db1fed3b..00000000 --- a/thirdParty/catch2/include/catch2/catch.hpp +++ /dev/null @@ -1,17966 +0,0 @@ -/* - * Catch v2.13.8 - * Generated: 2022-01-03 21:20:09.589503 - * ---------------------------------------------------------- - * This file has been merged from multiple headers. Please don't edit it directly - * Copyright (c) 2022 Two Blue Cubes Ltd. All rights reserved. - * - * Distributed under the Boost Software License, Version 1.0. 