Skip to content

Commit

Permalink
full subgroups require X dimension be a multiple of subgroup size
Browse files Browse the repository at this point in the history
Also trying to narrow down the DXC bug #!@#!@#@!#!@#!@#~!
  • Loading branch information
devsh committed Oct 18, 2024
1 parent c5f12f0 commit 3415bb7
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 55 deletions.
19 changes: 4 additions & 15 deletions 27_MPMCScheduler/app_resources/common.hlsl
Original file line number Diff line number Diff line change
@@ -1,22 +1,11 @@
#include "nbl/builtin/hlsl/cpp_compat.hlsl"

NBL_CONSTEXPR uint32_t WorkgroupSizeX = 16;
NBL_CONSTEXPR uint32_t WorkgroupSizeY = 16;
NBL_CONSTEXPR uint32_t WorkgroupSizeX = 8;
NBL_CONSTEXPR uint32_t WorkgroupSizeY = 8;
NBL_CONSTEXPR uint32_t WorkgroupSize = WorkgroupSizeX*WorkgroupSizeY;

static const uint32_t FRAMES_IN_FLIGHT = 3u;

static const uint32_t RED_OFFSET = 0u;
static const uint32_t GREEN_OFFSET = 256u;
static const uint32_t BLUE_OFFSET = 256u * 2u;

static const uint32_t CHANEL_CNT = 3;
static const uint32_t VAL_PER_CHANEL_CNT = 256;
static const uint32_t HISTOGRAM_SIZE = CHANEL_CNT * VAL_PER_CHANEL_CNT;
static const uint32_t HISTOGRAM_BYTE_SIZE = HISTOGRAM_SIZE * sizeof(uint32_t);
static const uint32_t COMBINED_HISTOGRAM_BUFFER_BYTE_SIZE = HISTOGRAM_BYTE_SIZE * FRAMES_IN_FLIGHT;

// Push-constant block for the compute shader. The host side (main.cpp) includes this header,
// sizes its push constant range with sizeof(PushConstants) and writes 0 to both idle counts.
struct PushConstants
{
// NOTE(review): the hunk header (-1,22 +1,11) suggests this member is a removed diff line — confirm
uint32_t histogramBufferOffset;
// 10-bit field; presumably an idle budget for the workgroup-shared queue — TODO confirm against the scheduler
uint32_t sharedAcceptableIdleCount : 10;
// 10-bit field; presumably the same kind of budget for the global queue — TODO confirm
uint32_t globalAcceptableIdleCount : 10;
};
2 changes: 1 addition & 1 deletion 27_MPMCScheduler/app_resources/schedulers/mpmc.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ namespace schedulers
{

// TODO: improve and use a Global Pool Allocator and stop moving whole payloads around in VRAM
template<typename Task, uint32_t WorkGroupSize, typename SharedAccessor, typename GlobalQueue, class device_capabilities=void>
template<typename Task, uint32_t WorkGroupSize, typename SharedAccessor, class device_capabilities=void>
struct MPMC
{
// TODO: static assert that the signature of the `Task::operator()` is `void()`
Expand Down
134 changes: 107 additions & 27 deletions 27_MPMCScheduler/app_resources/shader.comp.hlsl
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
//#include "nbl/builtin/hlsl/memory_accessor.hlsl"
//#include "nbl/builtin/hlsl/type_traits.hlsl"

#include "schedulers/mpmc.hlsl"
//#include "schedulers/mpmc.hlsl"
#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"

#include "common.hlsl"

#include "nbl/builtin/hlsl/limits.hlsl"
#include "nbl/builtin/hlsl/numbers.hlsl"


using namespace nbl::hlsl;


#if 0
enum Material : uint32_t
{
Emission = 0,
Expand Down Expand Up @@ -72,6 +80,7 @@ const static Sphere spheres[5] = {
Material::Glass
}
};
#endif

struct WhittedTask
{
Expand Down Expand Up @@ -123,11 +132,9 @@ struct WhittedTask

void operator()();
};
NBL_REGISTER_OBJ_TYPE(WhittedTask,8);
//NBL_REGISTER_OBJ_TYPE(WhittedTask,8);

struct GlobalAccessor
{
};
#if 0
// something something, Nvidia can do 32 bytes of smem per invocation
groupshared uint32_t sdata[512];
struct SharedAccessor
Expand All @@ -148,18 +155,16 @@ struct SharedAccessor
return nbl::hlsl::glsl::atomicAdd(sdata[ix],val);
}

template<typename T>
void set(const uint32_t ix, const in T val)
void set(const uint32_t ix, const in uint32_t val)
{
// sdata[ix] = val;
sdata[ix] = val;
}
template<typename T>
void get(const uint32_t ix, out T val)
void get(const uint32_t ix, out uint32_t val)
{
// sdata[ix] = val;
val = sdata[ix];
}
};
static nbl::hlsl::MPMCScheduler<WhittedTask,8*8,SharedAccessor,GlobalAccessor> scheduler;
static nbl::hlsl::MPMCScheduler<WhittedTask,8*8,SharedAccessor> scheduler;

// stolen from Nabla GLSL
bool nbl_glsl_getOrientedEtas(out float orientedEta, out float rcpOrientedEta, in float NdotI, in float eta)
Expand Down Expand Up @@ -191,11 +196,13 @@ float32_t3 nbl_glsl_refract(in float32_t3 I, in float32_t3 N, in bool backside,
const float NdotT = backside ? abs_NdotT:(-abs_NdotT);
return N*(NdotI*rcpOrientedEta + NdotT) - rcpOrientedEta*I;
}
#endif

[[vk::binding(0,0)]] RWTexture2D<uint32_t> framebuffer;

void WhittedTask::operator()()
{
using namespace nbl::hlsl;

#if 0
const float32_t3 rayDir = getRayDir();
const float32_t3 throughput = getThroughput();

Expand Down Expand Up @@ -244,7 +251,7 @@ void WhittedTask::operator()()

newTask.setThroughput(isGlass ? newThroughput:(color*newThroughput));
newTask.setRayDir(reflected);
scheduler.push(newTask);
// scheduler.push(newTask);
}
// deal with refraction
if (isGlass)
Expand All @@ -253,7 +260,7 @@ void WhittedTask::operator()()
newThroughput *= color;
newTask.setThroughput(newThroughput);
newTask.setRayDir(nbl_glsl_refract(-rayDir,normal,backside,NdotV,rcpOrientedEta));
scheduler.push(newTask);
// scheduler.push(newTask);

}
}
Expand All @@ -265,49 +272,122 @@ void WhittedTask::operator()()

if (contribution.r+contribution.g+contribution.b<1.f/2047.f)
return;
#endif

// Use device traits to do CAS loops on R32_UINT view of RGB9E5 when no VK_NV_shader_atomic_float16_vector
// spirv::atomicAdd(spirv::addrof(framebuffer),contribution);
framebuffer[uint32_t2(outputX,outputY)] = float32_t4(contribution,1.f);
framebuffer[uint32_t2(outputX,outputY)] = 0xffFFffFFu;
}

// move to `nbl/builtin/hlsl/shared_exp_t3.hlsl`

// Work-in-progress wrapper for a 3-component shared-exponent packed value
// (cf. the commented-out GLSL encoder that follows). It only stores the raw bits for now;
// no encode/decode is implemented here.
//  UintT        - unsigned integer type holding the packed value
//  ExponentBits - not referenced by the body yet; presumably the width of the shared exponent — TODO confirm
template<typename UintT, uint16_t ExponentBits>
struct shared_exp_t3
{
// alias for the struct's own type
using this_t = shared_exp_t3;

// raw packed representation
UintT storage;
};

/*
uvec3 nbl_glsl_impl_sharedExponentEncodeCommon(in vec3 clamped, in int newExpBias, in int newMaxExp, in int mantissaBits, out int shared_exp)
{
const float maxrgb = max(max(clamped.r, clamped.g), clamped.b);
// TODO: optimize this
const int f32_exp = int(nbl_glsl_ieee754_extract_biased_exponent(maxrgb)) - 126;
shared_exp = clamp(f32_exp, -newExpBias, newMaxExp + 1);
float scale = exp2(mantissaBits - shared_exp);
const uint maxm = uint(maxrgb * scale + 0.5);
const bool need = maxm == (0x1u << mantissaBits);
scale = need ? 0.5 * scale : scale;
shared_exp = need ? (shared_exp + 1) : shared_exp;
return uvec3(clamped * scale + vec3(0.5));
}
uvec2 nbl_glsl_encodeRGB9E5(in vec3 col)
{
const vec3 clamped = clamp(col, vec3(0.0), vec3(nbl_glsl_MAX_RGB19E7));
int shared_exp;
const uvec3 mantissas = nbl_glsl_impl_sharedExponentEncodeCommon(clamped, nbl_glsl_RGB19E7_EXP_BIAS, nbl_glsl_MAX_RGB19E7_EXP, nbl_glsl_RGB19E7_MANTISSA_BITS, shared_exp);
uvec2 encoded;
encoded.x = bitfieldInsert(mantissas.x, mantissas.y, nbl_glsl_RGB19E7_COMPONENT_BITOFFSETS[1], nbl_glsl_RGB19E7_G_COMPONENT_SPLIT);
encoded.y = bitfieldInsert(
mantissas.y >> nbl_glsl_RGB19E7_G_COMPONENT_SPLIT,
mantissas.z,
nbl_glsl_RGB19E7_COMPONENT_BITOFFSETS[2],
nbl_glsl_RGB19E7_MANTISSA_BITS)
| uint((shared_exp + nbl_glsl_RGB19E7_EXP_BIAS) << nbl_glsl_RGB19E7_COMPONENT_BITOFFSETS[3]);
return encoded;
}
*/


// Stand-in used while the real MPMC scheduler instance is disabled: it holds a single task
// and exposes the same call syntax (`scheduler()`, `scheduler.next`).
struct Dummy
{
// Runs the stored task once. `nextValid` is not consulted here.
void operator()()
{
next();
}

// the one task that operator() executes
WhittedTask next;
// set to true by `main` after it prepares `next`; nothing in this struct reads it
bool nextValid;
};
static Dummy scheduler;

[[vk::push_constant]] PushConstants pc;

// The workgroup is declared flat (every invocation along X) because full subgroups require the
// X dimension to be a multiple of the subgroup size; the 2D pixel coordinate is rebuilt in `main`.
// This block supplies the `glsl::gl_WorkGroupSize()` definition reporting that flat size.
namespace nbl
{
namespace hlsl
{
namespace glsl
{
// NOTE(review): two definitions with the same signature appear here; this looks like a diff rendering
// in which the (8,8,1) line is the removed one — confirm
uint32_t3 gl_WorkGroupSize() {return uint32_t3(8,8,1);}
// matches the `[numthreads(WorkgroupSizeX*WorkgroupSizeY,1,1)]` attribute on `main`
uint32_t3 gl_WorkGroupSize() {return uint32_t3(WorkgroupSizeX*WorkgroupSizeY,1,1);}
}
}
}
[numthreads(8,8,1)]
void main(uint32_t3 gl_GlobalInvocationID : SV_DispatchThreadID)
// Compute entry point, launched as a flat workgroup of WorkgroupSizeX*WorkgroupSizeY invocations.
// It rebuilds a 2D coordinate from the linear local invocation index, marks the scheduler's
// task slot as valid, and then invokes the scheduler. The ray setup is currently compiled out (`#if 0`).
[numthreads(WorkgroupSizeX*WorkgroupSizeY,1,1)]
void main()
{
// manually push an explicit workload
{
// reconstruct the actual XY coordinate we want
// NOTE(review): gl_WorkGroupSize() now returns (WorkgroupSizeX*WorkgroupSizeY,1,1), so `.xy` scales the
// workgroup ID by (X*Y,1) rather than by the (WorkgroupSizeX,WorkgroupSizeY) tile that the host dispatch
// (WIN_W/WorkgroupSizeX, WIN_H/WorkgroupSizeY) assumes. This probably wants
// `uint32_t2(WorkgroupSizeX,WorkgroupSizeY)` instead — confirm
uint32_t2 GlobalInvocationID = glsl::gl_WorkGroupID().xy*glsl::gl_WorkGroupSize().xy;
// TODO: morton code
{
// row-major unpacking of the flat index into the WorkgroupSizeX-wide tile
const uint32_t linearIx = glsl::gl_LocalInvocationIndex();
GlobalInvocationID.x += linearIx%WorkgroupSizeX;
GlobalInvocationID.y += linearIx/WorkgroupSizeX;
}
#if 0
scheduler.next.origin = float32_t3(0,0,-5);
scheduler.next.setThroughput(float32_t3(1,1,1));
// NOTE(review): `gl_GlobalInvocationID` is no longer a parameter of `main`; the lines using it look like
// removed diff lines superseded by the `GlobalInvocationID` variants — confirm
scheduler.next.outputX = gl_GlobalInvocationID.x;
scheduler.next.outputY = gl_GlobalInvocationID.y;
scheduler.next.outputX = GlobalInvocationID.x;
scheduler.next.outputY = GlobalInvocationID.y;
{
using namespace nbl::hlsl;
float32_t3 ndc;
{
// workgroup count times a hard-coded 8 invocations per axis
const float32_t2 totalInvocations = glsl::gl_NumWorkGroups().xy*8.f;
ndc.xy = (float32_t2(gl_GlobalInvocationID.xy)+float32_t2(0.5,0.5))*2.f/totalInvocations-float32_t2(1,1);
ndc.xy = (float32_t2(GlobalInvocationID.xy)+float32_t2(0.5,0.5))*2.f/totalInvocations-float32_t2(1,1);
ndc.y *= totalInvocations.y/totalInvocations.x; // aspect ratio
}
ndc.z = 1.f; // FOV of 90 degrees
scheduler.next.setRayDir(normalize(ndc));
}
scheduler.next.depth = 0;
#endif
// scheduler.sharedAcceptableIdleCount = 0;
// scheduler.globalAcceptableIdleCount = 0;
scheduler.nextValid = true;
}

// execute implicitly as scheduled
scheduler();
#ifdef DEBUG
printf("Workgroup Quit");
#endif
// scheduler();
}
37 changes: 25 additions & 12 deletions 27_MPMCScheduler/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ using namespace nbl::asset;
using namespace nbl::ui;
using namespace nbl::video;

#include "app_resources/common.hlsl"

class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
{
Expand Down Expand Up @@ -59,7 +60,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
return false;
if (!asset_base_t::onAppInitialized(std::move(system)))
return false;
/*

smart_refctd_ptr<IGPUShader> shader;
{
IAssetLoader::SAssetLoadParams lp = {};
Expand All @@ -79,7 +80,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
if (!shader)
return false;
}
*/

smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout;
{
const IGPUDescriptorSetLayout::SBinding bindings[1] = { {
Expand All @@ -94,9 +95,14 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
if (!dsLayout)
return logFail("Failed to Create Descriptor Layout");
}
/*

{
auto layout = m_device->createPipelineLayout({},smart_refctd_ptr(dsLayout));
const asset::SPushConstantRange ranges[] = {{
.stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,
.offset = 0,
.size = sizeof(PushConstants)
}};
auto layout = m_device->createPipelineLayout(ranges,smart_refctd_ptr(dsLayout));
const IGPUComputePipeline::SCreationParams params[] = { {
{
.layout = layout.get()
Expand All @@ -114,7 +120,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
if (!m_device->createComputePipelines(nullptr,params,&m_ppln))
return logFail("Failed to create Pipeline");
}
*/

m_hdr = m_device->createImage({
{
.type = IGPUImage::E_TYPE::ET_2D,
Expand All @@ -134,8 +140,8 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE,{&dsLayout.get(),1});
if (!pool)
return logFail("Could not create Descriptor Pool");
auto ds = pool->createDescriptorSet(std::move(dsLayout));
if (!ds)
m_ds = pool->createDescriptorSet(std::move(dsLayout));
if (!m_ds)
return logFail("Could not create Descriptor Set");
IGPUDescriptorSet::SDescriptorInfo info = {};
{
Expand All @@ -152,7 +158,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
info.info.image.imageLayout = IGPUImage::LAYOUT::GENERAL;
}
const IGPUDescriptorSet::SWriteDescriptorSet writes[] = {{
.dstSet = ds.get(),
.dstSet = m_ds.get(),
.binding = 0,
.arrayElement = 0,
.count = 1,
Expand Down Expand Up @@ -287,9 +293,15 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi

// write the image
{
//
// cb->bindComputePipeline(rawPipeline);
// push constants
cb->bindComputePipeline(m_ppln.get());
auto* layout = m_ppln->getLayout();
cb->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_COMPUTE,layout,0,1,&m_ds.get());
const PushConstants pc = {
.sharedAcceptableIdleCount = 0,
.globalAcceptableIdleCount = 0
};
cb->pushConstants(layout,IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,0,sizeof(pc),&pc);
cb->dispatch(WIN_W/WorkgroupSizeX,WIN_H/WorkgroupSizeY,1);
}

{
Expand Down Expand Up @@ -409,8 +421,9 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
private:
smart_refctd_ptr<IWindow> m_window;
smart_refctd_ptr<CSimpleResizeSurface<ISimpleManagedSurface::ISwapchainResources>> m_surface;
smart_refctd_ptr<IGPUImage> m_hdr;
smart_refctd_ptr<IGPUComputePipeline> m_ppln;
smart_refctd_ptr<IGPUDescriptorSet> m_ds;
smart_refctd_ptr<IGPUImage> m_hdr;
smart_refctd_ptr<ISemaphore> m_semaphore;
uint64_t m_realFrameIx : 59 = 0;
uint64_t m_maxFramesInFlight : 5;
Expand Down

0 comments on commit 3415bb7

Please sign in to comment.