From 3415bb72e31acd19d39f6170f76298caaf9e79c0 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Fri, 18 Oct 2024 17:12:53 +0200
Subject: [PATCH] full subgroups require X dimension be a multiple of subgroup
 size

Also trying to narrow down the DXC bug #!@#!@#@!#!@#!@#~!
---
 27_MPMCScheduler/app_resources/common.hlsl    |  19 +--
 .../app_resources/schedulers/mpmc.hlsl        |   2 +-
 .../app_resources/shader.comp.hlsl            | 134 ++++++++++++++----
 27_MPMCScheduler/main.cpp                     |  37 +++--
 4 files changed, 137 insertions(+), 55 deletions(-)
diff --git a/27_MPMCScheduler/app_resources/common.hlsl b/27_MPMCScheduler/app_resources/common.hlsl
index 259d5069d..2fb8971ad 100644
--- a/27_MPMCScheduler/app_resources/common.hlsl
+++ b/27_MPMCScheduler/app_resources/common.hlsl
@@ -1,22 +1,11 @@
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 
-NBL_CONSTEXPR uint32_t WorkgroupSizeX = 16;
-NBL_CONSTEXPR uint32_t WorkgroupSizeY = 16;
+NBL_CONSTEXPR uint32_t WorkgroupSizeX = 8;
+NBL_CONSTEXPR uint32_t WorkgroupSizeY = 8;
 NBL_CONSTEXPR uint32_t WorkgroupSize = WorkgroupSizeX*WorkgroupSizeY;
 
-static const uint32_t FRAMES_IN_FLIGHT = 3u;
-
-static const uint32_t RED_OFFSET = 0u;
-static const uint32_t GREEN_OFFSET = 256u;
-static const uint32_t BLUE_OFFSET = 256u * 2u;
-
-static const uint32_t CHANEL_CNT = 3;
-static const uint32_t VAL_PER_CHANEL_CNT = 256;
-static const uint32_t HISTOGRAM_SIZE = CHANEL_CNT * VAL_PER_CHANEL_CNT;
-static const uint32_t HISTOGRAM_BYTE_SIZE = HISTOGRAM_SIZE * sizeof(uint32_t);
-static const uint32_t COMBINED_HISTOGRAM_BUFFER_BYTE_SIZE = HISTOGRAM_BYTE_SIZE * FRAMES_IN_FLIGHT;
-
 struct PushConstants
 {
-    uint32_t histogramBufferOffset;
+    uint32_t sharedAcceptableIdleCount : 10; // 10-bit bitfield; the host side of this patch pushes 0
+    uint32_t globalAcceptableIdleCount : 10; // 10-bit bitfield; the host side of this patch pushes 0
 };
\ No newline at end of file
diff --git a/27_MPMCScheduler/app_resources/schedulers/mpmc.hlsl b/27_MPMCScheduler/app_resources/schedulers/mpmc.hlsl
index 2e345bd3e..a96b0d9b3 100644
--- a/27_MPMCScheduler/app_resources/schedulers/mpmc.hlsl
+++ b/27_MPMCScheduler/app_resources/schedulers/mpmc.hlsl
@@ -16,7 +16,7 @@ namespace schedulers
 {
 
 // TODO: improve and use a Global Pool Allocator and stop moving whole payloads around in VRAM
-template<typename Task, uint32_t WorkGroupSize, typename SharedAccessor, typename GlobalQueue, class device_capabilities=void>
+template<typename Task, uint32_t WorkGroupSize, typename SharedAccessor, class device_capabilities=void>
 struct MPMC
 {
     // TODO: static asset that the signature of the `Task::operator()` is `void()`
diff --git a/27_MPMCScheduler/app_resources/shader.comp.hlsl b/27_MPMCScheduler/app_resources/shader.comp.hlsl
index a58cef874..e08542c6d 100644
--- a/27_MPMCScheduler/app_resources/shader.comp.hlsl
+++ b/27_MPMCScheduler/app_resources/shader.comp.hlsl
@@ -1,11 +1,19 @@
 //#include "nbl/builtin/hlsl/memory_accessor.hlsl"
 //#include "nbl/builtin/hlsl/type_traits.hlsl"
 
-#include "schedulers/mpmc.hlsl"
+//#include "schedulers/mpmc.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+
+#include "common.hlsl"
 
 #include "nbl/builtin/hlsl/limits.hlsl"
 #include "nbl/builtin/hlsl/numbers.hlsl"
 
+
+using namespace nbl::hlsl;
+
+
+#if 0
 enum Material : uint32_t
 {
     Emission = 0,
@@ -72,6 +80,7 @@ const static Sphere spheres[5] = {
         Material::Glass
     }
 };
+#endif
 
 struct WhittedTask
 {
@@ -123,11 +132,9 @@ struct WhittedTask
 
     void operator()();
 };
-NBL_REGISTER_OBJ_TYPE(WhittedTask,8);
+//NBL_REGISTER_OBJ_TYPE(WhittedTask,8);
 
-struct GlobalAccessor
-{
-};
+#if 0
 // something something, Nvidia can do 32 bytes of smem per invocation
 groupshared uint32_t sdata[512];
 struct SharedAccessor
@@ -148,18 +155,16 @@ struct SharedAccessor
         return nbl::hlsl::glsl::atomicAdd(sdata[ix],val);
     }
     
-    template<typename T>
-    void set(const uint32_t ix, const in T val)
+    void set(const uint32_t ix, const in uint32_t val) // stores `val` into the groupshared `sdata` array at index `ix`
     {
-//        sdata[ix] = val;
+        sdata[ix] = val;
     }
-    template<typename T>
-    void get(const uint32_t ix, out T val)
+    void get(const uint32_t ix, out uint32_t val) // loads the groupshared `sdata` element at index `ix` into `val`
     {
-//        sdata[ix] = val;
+        val = sdata[ix];
     }
 };
-static nbl::hlsl::MPMCScheduler<WhittedTask,8*8,SharedAccessor,GlobalAccessor> scheduler;
+static nbl::hlsl::MPMCScheduler<WhittedTask,8*8,SharedAccessor> scheduler;
 
 // stolen from Nabla GLSL
 bool nbl_glsl_getOrientedEtas(out float orientedEta, out float rcpOrientedEta, in float NdotI, in float eta)
@@ -191,11 +196,13 @@ float32_t3 nbl_glsl_refract(in float32_t3 I, in float32_t3 N, in bool backside,
     const float NdotT = backside ? abs_NdotT:(-abs_NdotT);
     return N*(NdotI*rcpOrientedEta + NdotT) - rcpOrientedEta*I;
 }
+#endif
+
+[[vk::binding(0,0)]] RWTexture2D<uint32_t> framebuffer;
 
 void WhittedTask::operator()()
 {
-    using namespace nbl::hlsl;
-
+#if 0
     const float32_t3 rayDir = getRayDir();
     const float32_t3 throughput = getThroughput();
 
@@ -244,7 +251,7 @@ void WhittedTask::operator()()
 
                 newTask.setThroughput(isGlass ? newThroughput:(color*newThroughput));
                 newTask.setRayDir(reflected);
-                scheduler.push(newTask);
+//                scheduler.push(newTask);
             }
             // deal with refraction
             if (isGlass)
@@ -253,7 +260,7 @@ void WhittedTask::operator()()
                 newThroughput *= color;
                 newTask.setThroughput(newThroughput);
                 newTask.setRayDir(nbl_glsl_refract(-rayDir,normal,backside,NdotV,rcpOrientedEta));
-                scheduler.push(newTask);
+//                scheduler.push(newTask);
 
             }
         }
@@ -265,49 +272,122 @@ void WhittedTask::operator()()
 
     if (contribution.r+contribution.g+contribution.b<1.f/2047.f)
         return;
+#endif
 
     // Use device traits to do CAS loops on R32_UINT view of RGB9E5 when no VK_NV_shader_atomic_float16_vector
 //    spirv::atomicAdd(spirv::addrof(framebuffer),contribution);
-    framebuffer[uint32_t2(outputX,outputY)] = float32_t4(contribution,1.f);
+    framebuffer[uint32_t2(outputX,outputY)] = 0xffFFffFFu;
 }
 
+// move to `nbl/builtin/hlsl/shared_exp_t3.hlsl`
+
+template<typename UintT, uint16_t ExponentBits> // NOTE: ExponentBits is not referenced by the struct body yet
+struct shared_exp_t3
+{
+    using this_t = shared_exp_t3;
+
+    UintT storage; // the only data member; no encode/decode methods are defined yet
+};
+
+/*
+uvec3 nbl_glsl_impl_sharedExponentEncodeCommon(in vec3 clamped, in int newExpBias, in int newMaxExp, in int mantissaBits, out int shared_exp)
+{
+    const float maxrgb = max(max(clamped.r, clamped.g), clamped.b);
+    // TODO: optimize this
+    const int f32_exp = int(nbl_glsl_ieee754_extract_biased_exponent(maxrgb)) - 126;
+
+    shared_exp = clamp(f32_exp, -newExpBias, newMaxExp + 1);
+
+    float scale = exp2(mantissaBits - shared_exp);
+    const uint maxm = uint(maxrgb * scale + 0.5);
+    const bool need = maxm == (0x1u << mantissaBits);
+    scale = need ? 0.5 * scale : scale;
+    shared_exp = need ? (shared_exp + 1) : shared_exp;
+    return uvec3(clamped * scale + vec3(0.5));
+}
+
+uvec2 nbl_glsl_encodeRGB9E5(in vec3 col)
+{
+    const vec3 clamped = clamp(col, vec3(0.0), vec3(nbl_glsl_MAX_RGB19E7));
+
+    int shared_exp;
+    const uvec3 mantissas = nbl_glsl_impl_sharedExponentEncodeCommon(clamped, nbl_glsl_RGB19E7_EXP_BIAS, nbl_glsl_MAX_RGB19E7_EXP, nbl_glsl_RGB19E7_MANTISSA_BITS, shared_exp);
+
+    uvec2 encoded;
+    encoded.x = bitfieldInsert(mantissas.x, mantissas.y, nbl_glsl_RGB19E7_COMPONENT_BITOFFSETS[1], nbl_glsl_RGB19E7_G_COMPONENT_SPLIT);
+    encoded.y = bitfieldInsert(
+        mantissas.y >> nbl_glsl_RGB19E7_G_COMPONENT_SPLIT,
+        mantissas.z,
+        nbl_glsl_RGB19E7_COMPONENT_BITOFFSETS[2],
+        nbl_glsl_RGB19E7_MANTISSA_BITS)
+        | uint((shared_exp + nbl_glsl_RGB19E7_EXP_BIAS) << nbl_glsl_RGB19E7_COMPONENT_BITOFFSETS[3]);
+
+    return encoded;
+}
+*/
+
+
+struct Dummy // placeholder type for the global `scheduler` while the MPMC scheduler include is commented out
+{
+    void operator()()
+    {
+        next(); // runs WhittedTask::operator() on the stored task
+    }
+
+    WhittedTask next; // the task executed by operator()
+    bool nextValid; // set to true by main(); never read in the hunks of this patch
+};
+static Dummy scheduler;
+
+[[vk::push_constant]] PushConstants pc;
+
+// have to do weird stuff with workgroup size because of subgroup full spec
 namespace nbl
 {
 namespace hlsl
 {
 namespace glsl
 {
-uint32_t3 gl_WorkGroupSize() {return uint32_t3(8,8,1);}
+uint32_t3 gl_WorkGroupSize() {return uint32_t3(WorkgroupSizeX*WorkgroupSizeY,1,1);} // 1D, same as [numthreads]; .xy is NOT the WorkgroupSizeX x WorkgroupSizeY tile
 }
 }
 }
-[numthreads(8,8,1)]
-void main(uint32_t3 gl_GlobalInvocationID : SV_DispatchThreadID)
+[numthreads(WorkgroupSizeX*WorkgroupSizeY,1,1)] // linear workgroup: full subgroups need the X dimension to be a multiple of the subgroup size
+void main() // compute entry point; each workgroup covers one WorkgroupSizeX x WorkgroupSizeY tile of the dispatch grid
 {
     // manually push an explicit workload
     {
+        // reconstruct the XY coordinate we want: gl_WorkGroupSize() is (WorkgroupSize,1,1) now, so scale the workgroup ID by the 2D tile size instead
+        uint32_t2 GlobalInvocationID = glsl::gl_WorkGroupID().xy*uint32_t2(WorkgroupSizeX,WorkgroupSizeY);
+        // TODO: morton code
+        {
+            const uint32_t linearIx = glsl::gl_LocalInvocationIndex();
+            GlobalInvocationID.x += linearIx%WorkgroupSizeX;
+            GlobalInvocationID.y += linearIx/WorkgroupSizeX;
+        }
+#if 0
         scheduler.next.origin = float32_t3(0,0,-5);
         scheduler.next.setThroughput(float32_t3(1,1,1));
-        scheduler.next.outputX = gl_GlobalInvocationID.x;
-        scheduler.next.outputY = gl_GlobalInvocationID.y;
+        scheduler.next.outputX = GlobalInvocationID.x;
+        scheduler.next.outputY = GlobalInvocationID.y;
         {
             using namespace nbl::hlsl;
             float32_t3 ndc;
             {
                 const float32_t2 totalInvocations = glsl::gl_NumWorkGroups().xy*8.f;
-                ndc.xy = (float32_t2(gl_GlobalInvocationID.xy)+float32_t2(0.5,0.5))*2.f/totalInvocations-float32_t2(1,1);
+                ndc.xy = (float32_t2(GlobalInvocationID.xy)+float32_t2(0.5,0.5))*2.f/totalInvocations-float32_t2(1,1);
                 ndc.y *= totalInvocations.y/totalInvocations.x; // aspect raio
             }
             ndc.z = 1.f; // FOV of 90 degrees
             scheduler.next.setRayDir(normalize(ndc));
         }
         scheduler.next.depth = 0;
+#endif
+//        scheduler.sharedAcceptableIdleCount = 0;
+//        scheduler.globalAcceptableIdleCount = 0;
         scheduler.nextValid = true;
     }
 
     // excute implcit as scheduled
-    scheduler();
-#ifdef DEBUG
-    printf("Workgroup Quit");
-#endif
+//    scheduler();
 }
diff --git a/27_MPMCScheduler/main.cpp b/27_MPMCScheduler/main.cpp
index f2a14b452..05ca1efd8 100644
--- a/27_MPMCScheduler/main.cpp
+++ b/27_MPMCScheduler/main.cpp
@@ -12,6 +12,7 @@ using namespace nbl::asset;
 using namespace nbl::ui;
 using namespace nbl::video;
 
+#include "app_resources/common.hlsl"
 
 class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
@@ -59,7 +60,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
 				return false;
 			if (!asset_base_t::onAppInitialized(std::move(system)))
 				return false;
-/*
+
 			smart_refctd_ptr<IGPUShader> shader;
 			{
 				IAssetLoader::SAssetLoadParams lp = {};
@@ -79,7 +80,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
 				if (!shader)
 					return false;
 			}
-*/			
+			
 			smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
 			{
 				const IGPUDescriptorSetLayout::SBinding bindings[1] = { {
@@ -94,9 +95,14 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
 				if (!dsLayout)
 					return logFail("Failed to Create Descriptor Layout");
 			}
-/*
+
 			{
-				auto layout = m_device->createPipelineLayout({},smart_refctd_ptr(dsLayout));
+				const asset::SPushConstantRange ranges[] = {{
+					.stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,
+					.offset = 0,
+					.size = sizeof(PushConstants)
+				}};
+				auto layout = m_device->createPipelineLayout(ranges,smart_refctd_ptr(dsLayout));
 				const IGPUComputePipeline::SCreationParams params[] = { {
 					{
 						.layout = layout.get()
@@ -114,7 +120,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
 				if (!m_device->createComputePipelines(nullptr,params,&m_ppln))
 					return logFail("Failed to create Pipeline");
 			}
-*/
+
 			m_hdr = m_device->createImage({
 				{
 					.type = IGPUImage::E_TYPE::ET_2D,
@@ -134,8 +140,8 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
 				auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE,{&dsLayout.get(),1});
 				if (!pool)
 					return logFail("Could not create Descriptor Pool");
-				auto ds = pool->createDescriptorSet(std::move(dsLayout));
-				if (!ds)
+				m_ds = pool->createDescriptorSet(std::move(dsLayout));
+				if (!m_ds)
 					return logFail("Could not create Descriptor Set");
 				IGPUDescriptorSet::SDescriptorInfo info = {};
 				{
@@ -152,7 +158,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
 					info.info.image.imageLayout = IGPUImage::LAYOUT::GENERAL;
 				}
 				const IGPUDescriptorSet::SWriteDescriptorSet writes[] = {{
-					.dstSet = ds.get(),
+					.dstSet = m_ds.get(),
 					.binding = 0,
 					.arrayElement = 0,
 					.count = 1,
@@ -287,9 +293,15 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
 
 			// write the image
 			{
-				//
-	//			cb->bindComputePipeline(rawPipeline);
-	// push constants
+				cb->bindComputePipeline(m_ppln.get());
+				auto* layout = m_ppln->getLayout();
+				cb->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_COMPUTE,layout,0,1,&m_ds.get());
+				const PushConstants pc = {
+					.sharedAcceptableIdleCount = 0,
+					.globalAcceptableIdleCount = 0
+				};
+				cb->pushConstants(layout,IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,0,sizeof(pc),&pc);
+				cb->dispatch(WIN_W/WorkgroupSizeX,WIN_H/WorkgroupSizeY,1);
 			}
 
 			{
@@ -409,8 +421,9 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
 	private:
 		smart_refctd_ptr<IWindow> m_window;
 		smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
-		smart_refctd_ptr<IGPUImage> m_hdr;
 		smart_refctd_ptr<IGPUComputePipeline> m_ppln;
+		smart_refctd_ptr<IGPUDescriptorSet> m_ds;
+		smart_refctd_ptr<IGPUImage> m_hdr;
 		smart_refctd_ptr<ISemaphore> m_semaphore;
 		uint64_t m_realFrameIx : 59 = 0;
 		uint64_t m_maxFramesInFlight : 5;