From 57461d95bcbd3fd4af52c1dac14e53da1d17f9e9 Mon Sep 17 00:00:00 2001 From: "Kwasniewski, Patryk" Date: Thu, 28 Nov 2024 14:05:06 +0000 Subject: [PATCH] sub group clustered ballot Adds new intrinsic: sub group clustered ballot. Works similarly to sub group ballot, but each lane contains results only from its cluster. Only cluster sizes 8 and 16 are supported. --- .../Implementation/IGCBiF_Intrinsics.cl | 1 + IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp | 1 + IGC/Compiler/CISACodeGen/CodeSinking.cpp | 1 + IGC/Compiler/CISACodeGen/EmitVISAPass.cpp | 88 ++++++++++++++++++- IGC/Compiler/CISACodeGen/EmitVISAPass.hpp | 2 + IGC/Compiler/CISACodeGen/PatternMatchPass.cpp | 2 + IGC/Compiler/CISACodeGen/WIAnalysis.cpp | 1 + IGC/Compiler/CISACodeGen/helper.cpp | 1 + IGC/Compiler/CISACodeGen/opCode.h | 1 + IGC/Compiler/Optimizer/OCLBIUtils.cpp | 7 ++ .../generator/input/Intrinsic_definitions.yml | 28 ++++++ 11 files changed, 131 insertions(+), 2 deletions(-) diff --git a/IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl b/IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl index 173513468402..10dc49ab6717 100644 --- a/IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl +++ b/IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl @@ -521,6 +521,7 @@ uint __builtin_IB_get_image_bti(uint img); // ballot intrinsic uint __builtin_IB_WaveBallot(bool p); +uint __builtin_IB_clustered_WaveBallot(bool p, uint cluster_size); // VA void __builtin_IB_va_erode_64x4( __local uchar* dst, float2 coords, int srcImgId, int i_accelerator ); diff --git a/IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp b/IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp index 68ac6945bd1c..e0ca71df3e74 100644 --- a/IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp +++ b/IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp @@ -342,6 +342,7 @@ void CheckInstrTypes::visitCallInst(CallInst& C) case GenISAIntrinsic::GenISA_WaveBallot: case GenISAIntrinsic::GenISA_wavebarrier: case GenISAIntrinsic::GenISA_WaveInverseBallot: + case 
GenISAIntrinsic::GenISA_WaveClusteredBallot: case GenISAIntrinsic::GenISA_WavePrefix: case GenISAIntrinsic::GenISA_WaveClustered: case GenISAIntrinsic::GenISA_WaveInterleave: diff --git a/IGC/Compiler/CISACodeGen/CodeSinking.cpp b/IGC/Compiler/CISACodeGen/CodeSinking.cpp index dc330fa996ab..bcdf0ab88407 100644 --- a/IGC/Compiler/CISACodeGen/CodeSinking.cpp +++ b/IGC/Compiler/CISACodeGen/CodeSinking.cpp @@ -2492,6 +2492,7 @@ namespace IGC { case GenISAIntrinsic::GenISA_WaveClusteredBroadcast: case GenISAIntrinsic::GenISA_WaveBallot: case GenISAIntrinsic::GenISA_WaveInverseBallot: + case GenISAIntrinsic::GenISA_WaveClusteredBallot: case GenISAIntrinsic::GenISA_WaveAll: case GenISAIntrinsic::GenISA_WaveClustered: case GenISAIntrinsic::GenISA_WaveInterleave: diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp index 46a40e2e08b5..ca287c347657 100644 --- a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp +++ b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp @@ -9173,6 +9173,9 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst) case GenISAIntrinsic::GenISA_WaveInverseBallot: emitWaveInverseBallot(inst); break; + case GenISAIntrinsic::GenISA_WaveClusteredBallot: + emitWaveClusteredBallot(inst); + break; case GenISAIntrinsic::GenISA_WaveShuffleIndex: case GenISAIntrinsic::GenISA_WaveBroadcast: emitSimdShuffle(inst); @@ -21551,6 +21554,23 @@ void EmitPass::emitWaveBallot(llvm::GenIntrinsicInst* inst) destination = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE); } + emitBallotUniform(inst, destination, disableHelperLanes); + + if (destination != m_destination) + { + m_encoder->Cast(m_destination, destination); + m_encoder->Push(); + } + if (disableHelperLanes) + { + ResetVMask(); + } +} + +void EmitPass::emitBallotUniform(llvm::GenIntrinsicInst* inst, CVariable* destination, bool disableHelperLanes) +{ + IGC_ASSERT_MESSAGE(destination->IsUniform(), "Unsupported: dst must be uniform"); + bool 
uniform_active_lane = false; if (ConstantInt * pConst = dyn_cast(inst->getOperand(0))) { @@ -21604,12 +21624,76 @@ void EmitPass::emitWaveBallot(llvm::GenIntrinsicInst* inst) m_encoder->Push(); } } +} - if (destination != m_destination) +void EmitPass::emitWaveClusteredBallot(llvm::GenIntrinsicInst* inst) +{ + IGC_ASSERT_MESSAGE(!m_destination->IsUniform(), "Unsupported: dst must be non-uniform"); + + IGC_ASSERT_MESSAGE(isa(inst->getOperand(1)), "Unsupported: cluster size must be constant"); + const unsigned int clusterSize = int_cast(cast(inst->getOperand(1))->getZExtValue()); + + IGC_ASSERT_MESSAGE(clusterSize < numLanes(m_currShader->m_dispatchSize), "cluster size must be smaller than SIMD"); + IGC_ASSERT_MESSAGE(clusterSize == 8 || clusterSize == 16, "cluster size must be 8 or 16"); + + bool disableHelperLanes = int_cast(cast(inst->getArgOperand(2))->getSExtValue()) == 2; + if (disableHelperLanes) { - m_encoder->Cast(m_destination, destination); + ForceDMask(); + } + + // Run ballot. + CVariable* ballotResult = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, "ballotResult"); + emitBallotUniform(inst, ballotResult, disableHelperLanes); + + // ballotResult contains result from all lanes. Cluster can be either 8 or 16 lanes, so clusters in + // ballotResult are byte-aligned. Extract clusters from the result. 
+ + CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD); + m_encoder->Copy(m_destination, zero); + if (m_currShader->m_numberInstance > 1) + { + m_encoder->SetSecondHalf(true); + m_encoder->Copy(m_destination, zero); + m_encoder->SetSecondHalf(false); + } + m_encoder->Push(); + + if (clusterSize == 8) + { + CVariable* ballotAlias = m_currShader->GetNewAlias(ballotResult, ISA_TYPE_B, 0, 4, false); + CVariable* dstAlias = m_currShader->GetNewAlias(m_destination, ISA_TYPE_B, 0, numLanes(m_currShader->m_SIMDSize) * 4); + + m_encoder->SetSrcRegion(0, 1, 8, 0); + m_encoder->SetDstRegion(4); + m_encoder->Copy(dstAlias, ballotAlias); + if (m_currShader->m_numberInstance > 1) + { + m_encoder->SetSecondHalf(true); + m_encoder->SetSrcSubReg(0, 2); + m_encoder->Copy(dstAlias, ballotAlias); + m_encoder->SetSecondHalf(false); + } + m_encoder->Push(); + } + else if (clusterSize == 16) + { + CVariable* ballotAlias = m_currShader->GetNewAlias(ballotResult, ISA_TYPE_UW, 0, 2, false); + CVariable* dstAlias = m_currShader->GetNewAlias(m_destination, ISA_TYPE_UW, 0, numLanes(m_currShader->m_SIMDSize) * 2); + + m_encoder->SetSrcRegion(0, 1, 16, 0); + m_encoder->SetDstRegion(2); + m_encoder->Copy(dstAlias, ballotAlias); + if (m_currShader->m_numberInstance > 1) + { + m_encoder->SetSecondHalf(true); + m_encoder->SetSrcSubReg(0, 1); + m_encoder->Copy(dstAlias, ballotAlias); + m_encoder->SetSecondHalf(false); + } m_encoder->Push(); } + if (disableHelperLanes) { ResetVMask(); diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.hpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.hpp index dae96efc0c9a..82c24ff06d99 100644 --- a/IGC/Compiler/CISACodeGen/EmitVISAPass.hpp +++ b/IGC/Compiler/CISACodeGen/EmitVISAPass.hpp @@ -453,6 +453,8 @@ class EmitPass : public llvm::FunctionPass // CrossLane Instructions void emitWaveBallot(llvm::GenIntrinsicInst* inst); + void emitWaveClusteredBallot(llvm::GenIntrinsicInst* inst); + void emitBallotUniform(llvm::GenIntrinsicInst* inst, CVariable* destination, 
bool disableHelperLanes); void emitWaveInverseBallot(llvm::GenIntrinsicInst* inst); void emitWaveShuffleIndex(llvm::GenIntrinsicInst* inst); void emitWavePrefix(llvm::WavePrefixIntrinsic* I); diff --git a/IGC/Compiler/CISACodeGen/PatternMatchPass.cpp b/IGC/Compiler/CISACodeGen/PatternMatchPass.cpp index 28a87ef8cb8a..bb4c8ce46a37 100644 --- a/IGC/Compiler/CISACodeGen/PatternMatchPass.cpp +++ b/IGC/Compiler/CISACodeGen/PatternMatchPass.cpp @@ -1367,6 +1367,7 @@ namespace IGC break; case GenISAIntrinsic::GenISA_WaveBallot: case GenISAIntrinsic::GenISA_WaveInverseBallot: + case GenISAIntrinsic::GenISA_WaveClusteredBallot: case GenISAIntrinsic::GenISA_WaveAll: case GenISAIntrinsic::GenISA_WaveClustered: case GenISAIntrinsic::GenISA_WaveInterleave: @@ -5226,6 +5227,7 @@ namespace IGC switch (I.getIntrinsicID()) { case GenISAIntrinsic::GenISA_WaveAll: + case GenISAIntrinsic::GenISA_WaveClusteredBallot: helperLaneIndex = 2; break; case GenISAIntrinsic::GenISA_WaveBallot: diff --git a/IGC/Compiler/CISACodeGen/WIAnalysis.cpp b/IGC/Compiler/CISACodeGen/WIAnalysis.cpp index 77a840a59374..6d7b353d5c32 100644 --- a/IGC/Compiler/CISACodeGen/WIAnalysis.cpp +++ b/IGC/Compiler/CISACodeGen/WIAnalysis.cpp @@ -1413,6 +1413,7 @@ WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const CallInst* inst) intrinsic_name == llvm_waveBroadcast || intrinsic_name == llvm_waveClusteredBroadcast || intrinsic_name == llvm_waveBallot || + intrinsic_name == llvm_waveClusteredBallot || intrinsic_name == llvm_waveAll || intrinsic_name == llvm_waveClustered || intrinsic_name == llvm_waveInterleave || diff --git a/IGC/Compiler/CISACodeGen/helper.cpp b/IGC/Compiler/CISACodeGen/helper.cpp index 36e76fc33343..7f6ff7c501ca 100644 --- a/IGC/Compiler/CISACodeGen/helper.cpp +++ b/IGC/Compiler/CISACodeGen/helper.cpp @@ -1885,6 +1885,7 @@ namespace IGC opcode == llvm_waveBroadcast || opcode == llvm_waveClusteredBroadcast || opcode == llvm_waveBallot || + opcode == llvm_waveClusteredBallot || opcode == 
llvm_simdShuffleDown || opcode == llvm_simdBlockRead|| opcode == llvm_simdBlockReadBindless); diff --git a/IGC/Compiler/CISACodeGen/opCode.h b/IGC/Compiler/CISACodeGen/opCode.h index 25c4a5e71ac0..985d53b96c7b 100644 --- a/IGC/Compiler/CISACodeGen/opCode.h +++ b/IGC/Compiler/CISACodeGen/opCode.h @@ -281,6 +281,7 @@ DECLARE_OPCODE(GenISA_pair_to_ptr, GenISAIntrinsic, llvm_pair_to_ptr, false, fal // Wave intrinsics DECLARE_OPCODE(GenISA_WaveBallot, GenISAIntrinsic, llvm_waveBallot, false, false, false, false, false, false, false) +DECLARE_OPCODE(GenISA_WaveClusteredBallot, GenISAIntrinsic, llvm_waveClusteredBallot, false, false, false, false, false, false, false) DECLARE_OPCODE(GenISA_WaveAll, GenISAIntrinsic, llvm_waveAll, false, false, false, false, false, false, false) DECLARE_OPCODE(GenISA_WaveClustered, GenISAIntrinsic, llvm_waveClustered, false, false, false, false, false, false, false) DECLARE_OPCODE(GenISA_WaveInterleave, GenISAIntrinsic, llvm_waveInterleave, false, false, false, false, false, false, false) diff --git a/IGC/Compiler/Optimizer/OCLBIUtils.cpp b/IGC/Compiler/Optimizer/OCLBIUtils.cpp index 543359b94a38..8925e1f4a1f2 100644 --- a/IGC/Compiler/Optimizer/OCLBIUtils.cpp +++ b/IGC/Compiler/Optimizer/OCLBIUtils.cpp @@ -1206,6 +1206,12 @@ class CWaveBallotIntrinsic : public CCommand } m_args.push_back(truncInst); + + if (isaId == GenISAIntrinsic::GenISA_WaveClusteredBallot) + { + m_args.push_back(m_pCallInst->getArgOperand(1)); + } + m_args.push_back(IRB.getInt32(0)); replaceGenISACallInst(isaId); } @@ -1761,6 +1767,7 @@ CBuiltinsResolver::CBuiltinsResolver(CImagesBI::ParamMap* paramMap, CImagesBI::I // Ballot builtins m_CommandMap["__builtin_IB_WaveBallot"] = CWaveBallotIntrinsic::create(GenISAIntrinsic::GenISA_WaveBallot); + m_CommandMap["__builtin_IB_clustered_WaveBallot"] = CWaveBallotIntrinsic::create(GenISAIntrinsic::GenISA_WaveClusteredBallot); m_CommandMap[StringRef("__builtin_IB_samplepos")] = CSamplePos::create(); diff --git 
a/IGC/GenISAIntrinsics/generator/input/Intrinsic_definitions.yml b/IGC/GenISAIntrinsics/generator/input/Intrinsic_definitions.yml index fb212973c958..684edfe5e29c 100644 --- a/IGC/GenISAIntrinsics/generator/input/Intrinsic_definitions.yml +++ b/IGC/GenISAIntrinsics/generator/input/Intrinsic_definitions.yml @@ -2171,6 +2171,34 @@ intrinsics: memory_effects: - ! memory_location: !MemoryLocation InaccessibleMem + - ! + name: "GenISA_WaveClusteredBallot" + comment: "Same as WaveBallot, but result includes only lanes for current cluster.\ + \ Works in non-uniform context." + return_definition: ! + type_definition: *i32 + comment: "return a bitfield with 1 for active lane in cluster with input true,\ + \ 0 for the rest." + arguments: + - ! + name: Arg0 + type_definition: *i1 + comment: "predicate" + - ! + name: Arg1 + type_definition: *i32 + comment: "cluster size - must be a compile time constant 8 or 16" + - ! + name: Arg2 + type_definition: *i32 + comment: "helperLaneMode : 0: not used; 1: helper lanes participate in\ + \ wave ops, 2: helper lanes do not participate in wave ops." + attributes: + - !AttributeID "Convergent" + - !AttributeID "NoUnwind" + memory_effects: + - ! + memory_location: !MemoryLocation InaccessibleMem - ! name: "GenISA_WaveClustered" comment: "Accumulate all active lanes within consecutive input clusters and\