From 3415bb72e31acd19d39f6170f76298caaf9e79c0 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 18 Oct 2024 17:12:53 +0200 Subject: [PATCH] full subgroups require X dimension be a multiple of subgroup size Also trying to narrow down the DXC bug #!@#!@#@!#!@#!@#~! --- 27_MPMCScheduler/app_resources/common.hlsl | 19 +-- .../app_resources/schedulers/mpmc.hlsl | 2 +- .../app_resources/shader.comp.hlsl | 134 ++++++++++++++---- 27_MPMCScheduler/main.cpp | 37 +++-- 4 files changed, 137 insertions(+), 55 deletions(-) diff --git a/27_MPMCScheduler/app_resources/common.hlsl b/27_MPMCScheduler/app_resources/common.hlsl index 259d5069d..2fb8971ad 100644 --- a/27_MPMCScheduler/app_resources/common.hlsl +++ b/27_MPMCScheduler/app_resources/common.hlsl @@ -1,22 +1,11 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" -NBL_CONSTEXPR uint32_t WorkgroupSizeX = 16; -NBL_CONSTEXPR uint32_t WorkgroupSizeY = 16; +NBL_CONSTEXPR uint32_t WorkgroupSizeX = 8; +NBL_CONSTEXPR uint32_t WorkgroupSizeY = 8; NBL_CONSTEXPR uint32_t WorkgroupSize = WorkgroupSizeX*WorkgroupSizeY; -static const uint32_t FRAMES_IN_FLIGHT = 3u; - -static const uint32_t RED_OFFSET = 0u; -static const uint32_t GREEN_OFFSET = 256u; -static const uint32_t BLUE_OFFSET = 256u * 2u; - -static const uint32_t CHANEL_CNT = 3; -static const uint32_t VAL_PER_CHANEL_CNT = 256; -static const uint32_t HISTOGRAM_SIZE = CHANEL_CNT * VAL_PER_CHANEL_CNT; -static const uint32_t HISTOGRAM_BYTE_SIZE = HISTOGRAM_SIZE * sizeof(uint32_t); -static const uint32_t COMBINED_HISTOGRAM_BUFFER_BYTE_SIZE = HISTOGRAM_BYTE_SIZE * FRAMES_IN_FLIGHT; - struct PushConstants { - uint32_t histogramBufferOffset; + uint32_t sharedAcceptableIdleCount : 10; + uint32_t globalAcceptableIdleCount : 10; }; \ No newline at end of file diff --git a/27_MPMCScheduler/app_resources/schedulers/mpmc.hlsl b/27_MPMCScheduler/app_resources/schedulers/mpmc.hlsl index 2e345bd3e..a96b0d9b3 100644 --- a/27_MPMCScheduler/app_resources/schedulers/mpmc.hlsl +++ b/27_MPMCScheduler/app_resources/schedulers/mpmc.hlsl @@ -16,7 +16,7 @@ namespace schedulers { // TODO: improve and use a Global Pool Allocator and stop moving whole payloads around in VRAM -template +template struct MPMC { // TODO: static asset that the signature of the `Task::operator()` is `void()` diff --git a/27_MPMCScheduler/app_resources/shader.comp.hlsl b/27_MPMCScheduler/app_resources/shader.comp.hlsl index a58cef874..e08542c6d 100644 --- a/27_MPMCScheduler/app_resources/shader.comp.hlsl +++ b/27_MPMCScheduler/app_resources/shader.comp.hlsl @@ -1,11 +1,19 @@ //#include "nbl/builtin/hlsl/memory_accessor.hlsl" //#include "nbl/builtin/hlsl/type_traits.hlsl" -#include "schedulers/mpmc.hlsl" +//#include "schedulers/mpmc.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" + +#include "common.hlsl" #include "nbl/builtin/hlsl/limits.hlsl" #include "nbl/builtin/hlsl/numbers.hlsl" + +using namespace nbl::hlsl; + + +#if 0 enum Material : uint32_t { Emission = 0, @@ -72,6 +80,7 @@ const static Sphere spheres[5] = { Material::Glass } }; +#endif struct WhittedTask { @@ -123,11 +132,9 @@ struct WhittedTask void operator()(); }; -NBL_REGISTER_OBJ_TYPE(WhittedTask,8); +//NBL_REGISTER_OBJ_TYPE(WhittedTask,8); -struct GlobalAccessor -{ -}; +#if 0 // something something, Nvidia can do 32 bytes of smem per invocation groupshared uint32_t sdata[512]; struct SharedAccessor @@ -148,18 +155,16 @@ struct SharedAccessor return nbl::hlsl::glsl::atomicAdd(sdata[ix],val); } - template - void set(const uint32_t ix, const in T val) + void set(const uint32_t ix, const in uint32_t val) { -// sdata[ix] = val; + sdata[ix] = val; } - template - void get(const uint32_t ix, out T val) + void get(const uint32_t ix, out uint32_t val) { -// sdata[ix] = val; + val = sdata[ix]; } }; -static nbl::hlsl::MPMCScheduler scheduler; +static nbl::hlsl::MPMCScheduler scheduler; // stolen from Nabla GLSL bool nbl_glsl_getOrientedEtas(out float orientedEta, out float rcpOrientedEta, in float NdotI, in float eta) @@ -191,11 +196,13 @@ float32_t3 nbl_glsl_refract(in float32_t3 I, in float32_t3 N, in bool backside, const float NdotT = backside ? abs_NdotT:(-abs_NdotT); return N*(NdotI*rcpOrientedEta + NdotT) - rcpOrientedEta*I; } +#endif + +[[vk::binding(0,0)]] RWTexture2D framebuffer; void WhittedTask::operator()() { - using namespace nbl::hlsl; - +#if 0 const float32_t3 rayDir = getRayDir(); const float32_t3 throughput = getThroughput(); @@ -244,7 +251,7 @@ void WhittedTask::operator()() newTask.setThroughput(isGlass ? newThroughput:(color*newThroughput)); newTask.setRayDir(reflected); - scheduler.push(newTask); +// scheduler.push(newTask); } // deal with refraction if (isGlass) @@ -253,7 +260,7 @@ void WhittedTask::operator()() newThroughput *= color; newTask.setThroughput(newThroughput); newTask.setRayDir(nbl_glsl_refract(-rayDir,normal,backside,NdotV,rcpOrientedEta)); - scheduler.push(newTask); +// scheduler.push(newTask); } } @@ -265,49 +272,122 @@ void WhittedTask::operator()() if (contribution.r+contribution.g+contribution.b<1.f/2047.f) return; +#endif // Use device traits to do CAS loops on R32_UINT view of RGB9E5 when no VK_NV_shader_atomic_float16_vector // spirv::atomicAdd(spirv::addrof(framebuffer),contribution); - framebuffer[uint32_t2(outputX,outputY)] = float32_t4(contribution,1.f); + framebuffer[uint32_t2(outputX,outputY)] = 0xffFFffFFu; } +// move to `nbl/builtin/hlsl/shared_exp_t3.hlsl` + +template +struct shared_exp_t3 +{ + using this_t = shared_exp_t3; + + UintT storage; +}; + +/* +uvec3 nbl_glsl_impl_sharedExponentEncodeCommon(in vec3 clamped, in int newExpBias, in int newMaxExp, in int mantissaBits, out int shared_exp) +{ + const float maxrgb = max(max(clamped.r, clamped.g), clamped.b); + // TODO: optimize this + const int f32_exp = int(nbl_glsl_ieee754_extract_biased_exponent(maxrgb)) - 126; + + shared_exp = clamp(f32_exp, -newExpBias, newMaxExp + 1); + + float scale = exp2(mantissaBits - shared_exp); + const uint maxm = uint(maxrgb * scale + 0.5); + const bool need = maxm == (0x1u << mantissaBits); + scale = need ? 0.5 * scale : scale; + shared_exp = need ? (shared_exp + 1) : shared_exp; + return uvec3(clamped * scale + vec3(0.5)); +} + +uvec2 nbl_glsl_encodeRGB9E5(in vec3 col) +{ + const vec3 clamped = clamp(col, vec3(0.0), vec3(nbl_glsl_MAX_RGB19E7)); + + int shared_exp; + const uvec3 mantissas = nbl_glsl_impl_sharedExponentEncodeCommon(clamped, nbl_glsl_RGB19E7_EXP_BIAS, nbl_glsl_MAX_RGB19E7_EXP, nbl_glsl_RGB19E7_MANTISSA_BITS, shared_exp); + + uvec2 encoded; + encoded.x = bitfieldInsert(mantissas.x, mantissas.y, nbl_glsl_RGB19E7_COMPONENT_BITOFFSETS[1], nbl_glsl_RGB19E7_G_COMPONENT_SPLIT); + encoded.y = bitfieldInsert( + mantissas.y >> nbl_glsl_RGB19E7_G_COMPONENT_SPLIT, + mantissas.z, + nbl_glsl_RGB19E7_COMPONENT_BITOFFSETS[2], + nbl_glsl_RGB19E7_MANTISSA_BITS) + | uint((shared_exp + nbl_glsl_RGB19E7_EXP_BIAS) << nbl_glsl_RGB19E7_COMPONENT_BITOFFSETS[3]); + + return encoded; +} +*/ + + +struct Dummy +{ + void operator()() + { + next(); + } + + WhittedTask next; + bool nextValid; +}; +static Dummy scheduler; + +[[vk::push_constant]] PushConstants pc; + +// have to do weird stuff with workgroup size because of subgroup full spec namespace nbl { namespace hlsl { namespace glsl { -uint32_t3 gl_WorkGroupSize() {return uint32_t3(8,8,1);} +uint32_t3 gl_WorkGroupSize() {return uint32_t3(WorkgroupSizeX*WorkgroupSizeY,1,1);} } } } -[numthreads(8,8,1)] -void main(uint32_t3 gl_GlobalInvocationID : SV_DispatchThreadID) +[numthreads(WorkgroupSizeX*WorkgroupSizeY,1,1)] +void main() { // manually push an explicit workload { + // reconstruct the actual XY coordinate we want + uint32_t2 GlobalInvocationID = glsl::gl_WorkGroupID().xy*glsl::gl_WorkGroupSize().xy; + // TODO: morton code + { + const uint32_t linearIx = glsl::gl_LocalInvocationIndex(); + GlobalInvocationID.x += linearIx%WorkgroupSizeX; + GlobalInvocationID.y += linearIx/WorkgroupSizeX; + } +#if 0 scheduler.next.origin = float32_t3(0,0,-5); scheduler.next.setThroughput(float32_t3(1,1,1)); - scheduler.next.outputX = gl_GlobalInvocationID.x; - scheduler.next.outputY = gl_GlobalInvocationID.y; + scheduler.next.outputX = GlobalInvocationID.x; + scheduler.next.outputY = GlobalInvocationID.y; { using namespace nbl::hlsl; float32_t3 ndc; { const float32_t2 totalInvocations = glsl::gl_NumWorkGroups().xy*8.f; - ndc.xy = (float32_t2(gl_GlobalInvocationID.xy)+float32_t2(0.5,0.5))*2.f/totalInvocations-float32_t2(1,1); + ndc.xy = (float32_t2(GlobalInvocationID.xy)+float32_t2(0.5,0.5))*2.f/totalInvocations-float32_t2(1,1); ndc.y *= totalInvocations.y/totalInvocations.x; // aspect raio } ndc.z = 1.f; // FOV of 90 degrees scheduler.next.setRayDir(normalize(ndc)); } scheduler.next.depth = 0; +#endif +// scheduler.sharedAcceptableIdleCount = 0; +// scheduler.globalAcceptableIdleCount = 0; scheduler.nextValid = true; } // excute implcit as scheduled - scheduler(); -#ifdef DEBUG - printf("Workgroup Quit"); -#endif +// scheduler(); } diff --git a/27_MPMCScheduler/main.cpp b/27_MPMCScheduler/main.cpp index f2a14b452..05ca1efd8 100644 --- a/27_MPMCScheduler/main.cpp +++ b/27_MPMCScheduler/main.cpp @@ -12,6 +12,7 @@ using namespace nbl::asset; using namespace nbl::ui; using namespace nbl::video; +#include "app_resources/common.hlsl" class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { @@ -59,7 +60,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi return false; if (!asset_base_t::onAppInitialized(std::move(system))) return false; -/* + smart_refctd_ptr shader; { IAssetLoader::SAssetLoadParams lp = {}; @@ -79,7 +80,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi if (!shader) return false; } -*/ + smart_refctd_ptr dsLayout; { const IGPUDescriptorSetLayout::SBinding bindings[1] = { { @@ -94,9 +95,14 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi if (!dsLayout) return logFail("Failed to Create Descriptor Layout"); } -/* + { - auto layout = m_device->createPipelineLayout({},smart_refctd_ptr(dsLayout)); + const asset::SPushConstantRange ranges[] = {{ + .stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, + .offset = 0, + .size = sizeof(PushConstants) + }}; + auto layout = m_device->createPipelineLayout(ranges,smart_refctd_ptr(dsLayout)); const IGPUComputePipeline::SCreationParams params[] = { { { .layout = layout.get() @@ -114,7 +120,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi if (!m_device->createComputePipelines(nullptr,params,&m_ppln)) return logFail("Failed to create Pipeline"); } -*/ + m_hdr = m_device->createImage({ { .type = IGPUImage::E_TYPE::ET_2D, @@ -134,8 +140,8 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE,{&dsLayout.get(),1}); if (!pool) return logFail("Could not create Descriptor Pool"); - auto ds = pool->createDescriptorSet(std::move(dsLayout)); - if (!ds) + m_ds = pool->createDescriptorSet(std::move(dsLayout)); + if (!m_ds) return logFail("Could not create Descriptor Set"); IGPUDescriptorSet::SDescriptorInfo info = {}; { @@ -152,7 +158,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi info.info.image.imageLayout = IGPUImage::LAYOUT::GENERAL; } const IGPUDescriptorSet::SWriteDescriptorSet writes[] = {{ - .dstSet = ds.get(), + .dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, @@ -287,9 +293,15 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi // write the image { - // - // cb->bindComputePipeline(rawPipeline); - // push constants + cb->bindComputePipeline(m_ppln.get()); + auto* layout = m_ppln->getLayout(); + cb->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_COMPUTE,layout,0,1,&m_ds.get()); + const PushConstants pc = { + .sharedAcceptableIdleCount = 0, + .globalAcceptableIdleCount = 0 + }; + cb->pushConstants(layout,IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,0,sizeof(pc),&pc); + cb->dispatch(WIN_W/WorkgroupSizeX,WIN_H/WorkgroupSizeY,1); } { @@ -409,8 +421,9 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi private: smart_refctd_ptr m_window; smart_refctd_ptr> m_surface; - smart_refctd_ptr m_hdr; smart_refctd_ptr m_ppln; + smart_refctd_ptr m_ds; + smart_refctd_ptr m_hdr; smart_refctd_ptr m_semaphore; uint64_t m_realFrameIx : 59 = 0; uint64_t m_maxFramesInFlight : 5;