From ed5245bb5d2b77943ebe4d97579951584c633619 Mon Sep 17 00:00:00 2001 From: "Kwasniewski, Patryk" Date: Wed, 4 Dec 2024 13:11:11 +0000 Subject: [PATCH] new intrinsic: sub group clustered ballot Adds new intrinsic: sub group clustered ballot. Works similarly to sub group ballot, but each lane contains results only from its cluster. Only cluster sizes 8 and 16 are supported. --- .../Implementation/IGCBiF_Intrinsics.cl | 1 + IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp | 1 + IGC/Compiler/CISACodeGen/CodeSinking.cpp | 1 + IGC/Compiler/CISACodeGen/EmitVISAPass.cpp | 112 ++++++++++++++++-- IGC/Compiler/CISACodeGen/EmitVISAPass.hpp | 2 + IGC/Compiler/CISACodeGen/PatternMatchPass.cpp | 2 + IGC/Compiler/CISACodeGen/WIAnalysis.cpp | 1 + IGC/Compiler/CISACodeGen/helper.cpp | 1 + IGC/Compiler/CISACodeGen/opCode.h | 1 + IGC/Compiler/Optimizer/OCLBIUtils.cpp | 7 ++ .../generator/input/Intrinsic_definitions.yml | 28 +++++ 11 files changed, 150 insertions(+), 7 deletions(-) diff --git a/IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl b/IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl index 173513468402..10dc49ab6717 100644 --- a/IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl +++ b/IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl @@ -521,6 +521,7 @@ uint __builtin_IB_get_image_bti(uint img); // ballot intrinsic uint __builtin_IB_WaveBallot(bool p); +uint __builtin_IB_clustered_WaveBallot(bool p, uint cluster_size); // VA void __builtin_IB_va_erode_64x4( __local uchar* dst, float2 coords, int srcImgId, int i_accelerator ); diff --git a/IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp b/IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp index 68ac6945bd1c..e0ca71df3e74 100644 --- a/IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp +++ b/IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp @@ -342,6 +342,7 @@ void CheckInstrTypes::visitCallInst(CallInst& C) case GenISAIntrinsic::GenISA_WaveBallot: case GenISAIntrinsic::GenISA_wavebarrier: case GenISAIntrinsic::GenISA_WaveInverseBallot: + case 
GenISAIntrinsic::GenISA_WaveClusteredBallot: case GenISAIntrinsic::GenISA_WavePrefix: case GenISAIntrinsic::GenISA_WaveClustered: case GenISAIntrinsic::GenISA_WaveInterleave: diff --git a/IGC/Compiler/CISACodeGen/CodeSinking.cpp b/IGC/Compiler/CISACodeGen/CodeSinking.cpp index dc330fa996ab..bcdf0ab88407 100644 --- a/IGC/Compiler/CISACodeGen/CodeSinking.cpp +++ b/IGC/Compiler/CISACodeGen/CodeSinking.cpp @@ -2492,6 +2492,7 @@ namespace IGC { case GenISAIntrinsic::GenISA_WaveClusteredBroadcast: case GenISAIntrinsic::GenISA_WaveBallot: case GenISAIntrinsic::GenISA_WaveInverseBallot: + case GenISAIntrinsic::GenISA_WaveClusteredBallot: case GenISAIntrinsic::GenISA_WaveAll: case GenISAIntrinsic::GenISA_WaveClustered: case GenISAIntrinsic::GenISA_WaveInterleave: diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp index 917162aa8a42..ec68bf5f85cf 100644 --- a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp +++ b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp @@ -9178,6 +9178,9 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst) case GenISAIntrinsic::GenISA_WaveInverseBallot: emitWaveInverseBallot(inst); break; + case GenISAIntrinsic::GenISA_WaveClusteredBallot: + emitWaveClusteredBallot(inst); + break; case GenISAIntrinsic::GenISA_WaveShuffleIndex: case GenISAIntrinsic::GenISA_WaveBroadcast: emitSimdShuffle(inst); @@ -21726,6 +21729,23 @@ void EmitPass::emitWaveBallot(llvm::GenIntrinsicInst* inst) destination = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE); } + emitBallotUniform(inst, &destination, disableHelperLanes); + + if (destination != m_destination) + { + m_encoder->Cast(m_destination, destination); + m_encoder->Push(); + } + if (disableHelperLanes) + { + ResetVMask(); + } +} + +void EmitPass::emitBallotUniform(llvm::GenIntrinsicInst* inst, CVariable** destination, bool disableHelperLanes) +{ + IGC_ASSERT_MESSAGE((*destination)->IsUniform(), "Unsupported: dst must be uniform"); + 
bool uniform_active_lane = false; if (ConstantInt * pConst = dyn_cast(inst->getOperand(0))) { @@ -21741,9 +21761,9 @@ void EmitPass::emitWaveBallot(llvm::GenIntrinsicInst* inst) if (m_currShader->m_dispatchSize == SIMDMode::SIMD8 && m_currShader->HasFullDispatchMask()) { // for SIMD8 make sure the higher 8 bits of the flag are not copied - destination = m_currShader->GetNewVariable(1, ISA_TYPE_UB, EALIGN_BYTE, true, CName::NONE); + *destination = m_currShader->GetNewVariable(1, ISA_TYPE_UB, EALIGN_BYTE, true, CName::NONE); } - m_encoder->BoolToInt(destination, f0); + m_encoder->BoolToInt(*destination, f0); if (!m_currShader->HasFullDispatchMask()) { CVariable* dispatchMask = m_currShader->GetNewAlias( @@ -21751,7 +21771,7 @@ void EmitPass::emitWaveBallot(llvm::GenIntrinsicInst* inst) ISA_TYPE_UD, (m_pattern->NeedVMask() && !disableHelperLanes ? 3 : 2) * SIZE_DWORD, 1); - m_encoder->And(destination, dispatchMask, destination); + m_encoder->And(*destination, dispatchMask, *destination); } } else @@ -21770,21 +21790,99 @@ void EmitPass::emitWaveBallot(llvm::GenIntrinsicInst* inst) m_encoder->SetSimdSize(SIMDMode::SIMD1); m_encoder->SetNoMask(); - m_encoder->And(destination, exeMask, vf0); + m_encoder->And(*destination, exeMask, vf0); m_encoder->Push(); } else { - m_encoder->Cast(destination, exeMask); + m_encoder->Cast(*destination, exeMask); m_encoder->Push(); } } +} - if (destination != m_destination) +void EmitPass::emitWaveClusteredBallot(llvm::GenIntrinsicInst* inst) +{ + IGC_ASSERT_MESSAGE(!m_destination->IsUniform(), "Unsupported: dst must be non-uniform"); + + IGC_ASSERT_MESSAGE(isa(inst->getOperand(1)), "Unsupported: cluster size must be constant"); + const unsigned int clusterSize = int_cast(cast(inst->getOperand(1))->getZExtValue()); + + IGC_ASSERT_MESSAGE(clusterSize <= numLanes(m_currShader->m_dispatchSize), "cluster size must be smaller or equal to SIMD"); + IGC_ASSERT_MESSAGE(clusterSize % 8 == 0, "cluster size must be 8/16/32"); + + bool 
disableHelperLanes = int_cast(cast(inst->getArgOperand(2))->getSExtValue()) == 2; + if (disableHelperLanes) { - m_encoder->Cast(m_destination, destination); + ForceDMask(); + } + + // Run ballot. + CVariable* ballotResult = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, "ballotResult"); + emitBallotUniform(inst, &ballotResult, disableHelperLanes); + + // In case cluster takes full SIMD size, then just propagate result. + if (clusterSize == numLanes(m_currShader->m_dispatchSize)) + { + m_encoder->Copy(m_destination, ballotResult); + if (m_currShader->m_numberInstance > 1) + { + m_encoder->SetSecondHalf(true); + m_encoder->Copy(m_destination, ballotResult); + m_encoder->SetSecondHalf(false); + } + m_encoder->Push(); + return; + } + + // ballotResult contains result from all lanes. Cluster can be either 8 or 16 lanes, so clusters in + // ballotResult are byte-aligned. Extract clusters from the result. + + CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD); + m_encoder->Copy(m_destination, zero); + if (m_currShader->m_numberInstance > 1) + { + m_encoder->SetSecondHalf(true); + m_encoder->Copy(m_destination, zero); + m_encoder->SetSecondHalf(false); + } + m_encoder->Push(); + + if (clusterSize == 8) + { + CVariable* ballotAlias = m_currShader->GetNewAlias(ballotResult, ISA_TYPE_B, 0, 4, false); + CVariable* dstAlias = m_currShader->GetNewAlias(m_destination, ISA_TYPE_B, 0, numLanes(m_currShader->m_SIMDSize) * 4); + + m_encoder->SetSrcRegion(0, 1, 8, 0); + m_encoder->SetDstRegion(4); + m_encoder->Copy(dstAlias, ballotAlias); + if (m_currShader->m_numberInstance > 1) + { + m_encoder->SetSecondHalf(true); + m_encoder->SetSrcSubReg(0, 2); + m_encoder->Copy(dstAlias, ballotAlias); + m_encoder->SetSecondHalf(false); + } + m_encoder->Push(); + } + else if (clusterSize == 16) + { + CVariable* ballotAlias = m_currShader->GetNewAlias(ballotResult, ISA_TYPE_UW, 0, 2, false); + CVariable* dstAlias = m_currShader->GetNewAlias(m_destination, ISA_TYPE_UW, 0, 
numLanes(m_currShader->m_SIMDSize) * 2); + + m_encoder->SetSrcRegion(0, 1, 16, 0); + m_encoder->SetDstRegion(2); + m_encoder->Copy(dstAlias, ballotAlias); + if (m_currShader->m_numberInstance > 1) + { + m_encoder->SetSecondHalf(true); + m_encoder->SetSrcSubReg(0, 1); + m_encoder->Copy(dstAlias, ballotAlias); + m_encoder->SetSecondHalf(false); + } m_encoder->Push(); } + if (disableHelperLanes) { ResetVMask(); diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.hpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.hpp index 02f7a43b03cf..974625558071 100644 --- a/IGC/Compiler/CISACodeGen/EmitVISAPass.hpp +++ b/IGC/Compiler/CISACodeGen/EmitVISAPass.hpp @@ -467,6 +467,8 @@ class EmitPass : public llvm::FunctionPass // CrossLane Instructions void emitWaveBallot(llvm::GenIntrinsicInst* inst); + void emitWaveClusteredBallot(llvm::GenIntrinsicInst* inst); + void emitBallotUniform(llvm::GenIntrinsicInst* inst, CVariable** destination, bool disableHelperLanes); void emitWaveInverseBallot(llvm::GenIntrinsicInst* inst); void emitWaveShuffleIndex(llvm::GenIntrinsicInst* inst); void emitWavePrefix(llvm::WavePrefixIntrinsic* I); diff --git a/IGC/Compiler/CISACodeGen/PatternMatchPass.cpp b/IGC/Compiler/CISACodeGen/PatternMatchPass.cpp index 0ce38acf1ea1..0ce6087b6b66 100644 --- a/IGC/Compiler/CISACodeGen/PatternMatchPass.cpp +++ b/IGC/Compiler/CISACodeGen/PatternMatchPass.cpp @@ -1372,6 +1372,7 @@ namespace IGC break; case GenISAIntrinsic::GenISA_WaveBallot: case GenISAIntrinsic::GenISA_WaveInverseBallot: + case GenISAIntrinsic::GenISA_WaveClusteredBallot: case GenISAIntrinsic::GenISA_WaveAll: case GenISAIntrinsic::GenISA_WaveClustered: case GenISAIntrinsic::GenISA_WaveInterleave: @@ -5293,6 +5294,7 @@ namespace IGC switch (I.getIntrinsicID()) { case GenISAIntrinsic::GenISA_WaveAll: + case GenISAIntrinsic::GenISA_WaveClusteredBallot: helperLaneIndex = 2; break; case GenISAIntrinsic::GenISA_WaveBallot: diff --git a/IGC/Compiler/CISACodeGen/WIAnalysis.cpp 
b/IGC/Compiler/CISACodeGen/WIAnalysis.cpp index 77a840a59374..6d7b353d5c32 100644 --- a/IGC/Compiler/CISACodeGen/WIAnalysis.cpp +++ b/IGC/Compiler/CISACodeGen/WIAnalysis.cpp @@ -1413,6 +1413,7 @@ WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const CallInst* inst) intrinsic_name == llvm_waveBroadcast || intrinsic_name == llvm_waveClusteredBroadcast || intrinsic_name == llvm_waveBallot || + intrinsic_name == llvm_waveClusteredBallot || intrinsic_name == llvm_waveAll || intrinsic_name == llvm_waveClustered || intrinsic_name == llvm_waveInterleave || diff --git a/IGC/Compiler/CISACodeGen/helper.cpp b/IGC/Compiler/CISACodeGen/helper.cpp index 36e76fc33343..7f6ff7c501ca 100644 --- a/IGC/Compiler/CISACodeGen/helper.cpp +++ b/IGC/Compiler/CISACodeGen/helper.cpp @@ -1885,6 +1885,7 @@ namespace IGC opcode == llvm_waveBroadcast || opcode == llvm_waveClusteredBroadcast || opcode == llvm_waveBallot || + opcode == llvm_waveClusteredBallot || opcode == llvm_simdShuffleDown || opcode == llvm_simdBlockRead|| opcode == llvm_simdBlockReadBindless); diff --git a/IGC/Compiler/CISACodeGen/opCode.h b/IGC/Compiler/CISACodeGen/opCode.h index 25c4a5e71ac0..985d53b96c7b 100644 --- a/IGC/Compiler/CISACodeGen/opCode.h +++ b/IGC/Compiler/CISACodeGen/opCode.h @@ -281,6 +281,7 @@ DECLARE_OPCODE(GenISA_pair_to_ptr, GenISAIntrinsic, llvm_pair_to_ptr, false, fal // Wave intrinsics DECLARE_OPCODE(GenISA_WaveBallot, GenISAIntrinsic, llvm_waveBallot, false, false, false, false, false, false, false) +DECLARE_OPCODE(GenISA_WaveClusteredBallot, GenISAIntrinsic, llvm_waveClusteredBallot, false, false, false, false, false, false, false) DECLARE_OPCODE(GenISA_WaveAll, GenISAIntrinsic, llvm_waveAll, false, false, false, false, false, false, false) DECLARE_OPCODE(GenISA_WaveClustered, GenISAIntrinsic, llvm_waveClustered, false, false, false, false, false, false, false) DECLARE_OPCODE(GenISA_WaveInterleave, GenISAIntrinsic, llvm_waveInterleave, false, false, false, false, false, false, false) diff 
--git a/IGC/Compiler/Optimizer/OCLBIUtils.cpp b/IGC/Compiler/Optimizer/OCLBIUtils.cpp index 543359b94a38..8925e1f4a1f2 100644 --- a/IGC/Compiler/Optimizer/OCLBIUtils.cpp +++ b/IGC/Compiler/Optimizer/OCLBIUtils.cpp @@ -1206,6 +1206,12 @@ class CWaveBallotIntrinsic : public CCommand } m_args.push_back(truncInst); + + if (isaId == GenISAIntrinsic::GenISA_WaveClusteredBallot) + { + m_args.push_back(m_pCallInst->getArgOperand(1)); + } + m_args.push_back(IRB.getInt32(0)); replaceGenISACallInst(isaId); } @@ -1761,6 +1767,7 @@ CBuiltinsResolver::CBuiltinsResolver(CImagesBI::ParamMap* paramMap, CImagesBI::I // Ballot builtins m_CommandMap["__builtin_IB_WaveBallot"] = CWaveBallotIntrinsic::create(GenISAIntrinsic::GenISA_WaveBallot); + m_CommandMap["__builtin_IB_clustered_WaveBallot"] = CWaveBallotIntrinsic::create(GenISAIntrinsic::GenISA_WaveClusteredBallot); m_CommandMap[StringRef("__builtin_IB_samplepos")] = CSamplePos::create(); diff --git a/IGC/GenISAIntrinsics/generator/input/Intrinsic_definitions.yml b/IGC/GenISAIntrinsics/generator/input/Intrinsic_definitions.yml index 0356b6fb8bd2..2d417bd6ec63 100644 --- a/IGC/GenISAIntrinsics/generator/input/Intrinsic_definitions.yml +++ b/IGC/GenISAIntrinsics/generator/input/Intrinsic_definitions.yml @@ -2171,6 +2171,34 @@ intrinsics: memory_effects: - ! memory_location: !MemoryLocation InaccessibleMem + - ! + name: "GenISA_WaveClusteredBallot" + comment: "Same as WaveBallot, but result includes only lanes for current cluster.\ + \ Works in non-uniform context." + return_definition: ! + type_definition: *i32 + comment: "return a bitfield with 1 for active lane in cluster with input true,\ + \ 0 for the rest." + arguments: + - ! + name: Arg0 + type_definition: *i1 + comment: "predicate" + - ! + name: Arg1 + type_definition: *i32 + comment: "cluster size - must be a compile time constant 8 or 16" + - ! 
+ name: Arg2 + type_definition: *i32 + comment: "helperLaneMode : 0: not used; 1: helper lanes participate in\ + \ wave ops, 2: helper lanes do not participate in wave ops." + attributes: + - !AttributeID "Convergent" + - !AttributeID "NoUnwind" + memory_effects: + - ! + memory_location: !MemoryLocation InaccessibleMem - ! name: "GenISA_WaveClustered" comment: "Accumulate all active lanes within consecutive input clusters and\