Skip to content

Commit

Permalink
Refine the code of removing undef output value (GPUOpen-Drivers#2684)
Browse files Browse the repository at this point in the history
Currently, we remove undef output values unconditionally for
non-fragment shaders. It causes mismatch for partial channels with undef
value. The correct way is to conditionally remove if all channels' value
are undefined. This PR will integrete the removal code piece into
`clearUndefinedOutput()` and add the check logic in it.
  • Loading branch information
xuechen417 authored Sep 18, 2023
1 parent 9c2184a commit 3d7228a
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 50 deletions.
1 change: 1 addition & 0 deletions lgc/include/lgc/patch/PatchResourceCollect.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ class PatchResourceCollect : public Patch,
void clearInactiveBuiltInInput();
void clearInactiveBuiltInOutput();
void clearUnusedOutput();
void clearUndefinedOutput();

void matchGenericInOut();
void mapBuiltInToGenericInOut();
Expand Down
150 changes: 106 additions & 44 deletions lgc/patch/PatchResourceCollect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1130,6 +1130,7 @@ void PatchResourceCollect::processShader() {

clearInactiveBuiltInInput();
clearInactiveBuiltInOutput();
clearUndefinedOutput();

if (m_pipelineState->isGraphics()) {
matchGenericInOut();
Expand Down Expand Up @@ -1302,50 +1303,7 @@ void PatchResourceCollect::visitCallInst(CallInst &callInst) {
unsigned builtInId = cast<ConstantInt>(callInst.getOperand(0))->getZExtValue();
m_importedOutputBuiltIns.insert(builtInId);
} else if (mangledName.startswith(lgcName::OutputExportGeneric)) {
auto outputValue = callInst.getArgOperand(callInst.arg_size() - 1);
if (m_shaderStage != ShaderStageFragment && (isa<UndefValue>(outputValue) || isa<PoisonValue>(outputValue))) {
// NOTE: If an output value of vertex processing stages is unspecified, we can safely drop it and remove the
// output export call.
m_deadCalls.push_back(&callInst);

if (m_pipelineState->getNextShaderStage(m_shaderStage) != ShaderStageInvalid) {
// Also, we remove the output location info from the map if it exists
const unsigned location = cast<ConstantInt>(callInst.getArgOperand(0))->getZExtValue();
unsigned component = cast<ConstantInt>(callInst.getArgOperand(1))->getZExtValue();
if (outputValue->getType()->getScalarSizeInBits() == 64)
component *= 2; // Component in location info is dword-based

InOutLocationInfo outLocInfo;
outLocInfo.setLocation(location);
outLocInfo.setComponent(component);
if (m_shaderStage == ShaderStageGeometry)
outLocInfo.setStreamId(cast<ConstantInt>(callInst.getArgOperand(2))->getZExtValue());

auto &outLocInfoMap = m_resUsage->inOutUsage.outputLocInfoMap;
if (outLocInfoMap.count(outLocInfo) > 0) {
outLocInfoMap.erase(outLocInfo);
if (outputValue->getType()->getPrimitiveSizeInBits() > 128) {
// NOTE: For any data that is larger than <4 x dword>, there are two consecutive locations occupied.
outLocInfo.setLocation(location + 1);
outLocInfoMap.erase(outLocInfo);
}
}

// Remove transform feedback location info as well if it exists
outLocInfo.setLocation(location);
auto &locInfoXfbOutInfoMap = m_resUsage->inOutUsage.locInfoXfbOutInfoMap;
if (locInfoXfbOutInfoMap.count(outLocInfo) > 0) {
locInfoXfbOutInfoMap.erase(outLocInfo);
if (outputValue->getType()->getPrimitiveSizeInBits() > 128) {
// NOTE: For any data that is larger than <4 x dword>, there are two consecutive locations occupied.
outLocInfo.setLocation(location + 1);
locInfoXfbOutInfoMap.erase(outLocInfo);
}
}
}
} else {
m_outputCalls.push_back(&callInst);
}
m_outputCalls.push_back(&callInst);
} else if (mangledName.startswith(lgcName::OutputExportBuiltIn)) {
// NOTE: If an output value is unspecified, we can safely drop it and remove the output export call.
// Currently, do this for geometry shader.
Expand Down Expand Up @@ -3595,6 +3553,110 @@ void PatchResourceCollect::scalarizeGenericOutput(CallInst *call) {
call->eraseFromParent();
}

// =====================================================================================================================
// Clear non-specified output value in non-fragment shader stages
void PatchResourceCollect::clearUndefinedOutput() {
if (m_shaderStage == ShaderStageFragment)
return;
// NOTE: If a vector or all used channels in a location are not specified, we can safely drop it and remove the output
// export call
struct CandidateInfo {
unsigned undefMask = 0;
unsigned usedMask = 0;
SmallVector<CallInst *> candidateCalls;
};
// Collect candidate info with undefined value at a location.
std::map<InOutLocationInfo, CandidateInfo> locCandidateInfoMap;

for (auto call : m_outputCalls) {
auto outputValue = call->getArgOperand(call->arg_size() - 1);
bool isUndefVal = isa<UndefValue>(outputValue) || isa<PoisonValue>(outputValue);
unsigned index = (m_shaderStage == ShaderStageMesh || m_shaderStage == ShaderStageTessControl) ? 2 : 1;
bool isDynElemIndexing = !isa<ConstantInt>(call->getArgOperand(index));

InOutLocationInfo locInfo;
locInfo.setLocation(cast<ConstantInt>(call->getArgOperand(0))->getZExtValue());
if (m_shaderStage == ShaderStageGeometry)
locInfo.setStreamId(cast<ConstantInt>(call->getArgOperand(2))->getZExtValue());

unsigned undefMask = 0;
unsigned usedMask = 0;
if (isDynElemIndexing)
usedMask = 1; // keep the call
else {
const unsigned elemIdx = cast<ConstantInt>(call->getArgOperand(index))->getZExtValue();
usedMask = 1 << elemIdx;
if (isUndefVal)
undefMask = 1 << elemIdx;
}

auto iter = locCandidateInfoMap.find(locInfo);
if (iter == locCandidateInfoMap.end()) {
CandidateInfo candidataInfo;
candidataInfo.undefMask = undefMask;
candidataInfo.usedMask = usedMask;
candidataInfo.candidateCalls.push_back(call);
locCandidateInfoMap[locInfo] = candidataInfo;
} else {
iter->second.undefMask |= undefMask;
iter->second.usedMask |= usedMask;
iter->second.candidateCalls.push_back(call);
}
}
m_outputCalls.clear();
// Check if all used channels are undefined in a location in a stream
for (auto &locCandidate : locCandidateInfoMap) {
auto candidateCalls = locCandidate.second.candidateCalls;
if (locCandidate.second.usedMask != locCandidate.second.undefMask) {
m_outputCalls.insert(m_outputCalls.end(), candidateCalls.begin(), candidateCalls.end());
continue;
}

m_deadCalls.insert(m_deadCalls.end(), candidateCalls.begin(), candidateCalls.end());

for (auto call : candidateCalls) {
// For unlinked case, we should keep the location info map unchanged.
if (m_pipelineState->getNextShaderStage(m_shaderStage) != ShaderStageInvalid) {
// Remove the output location info if it exists
unsigned index = m_shaderStage == ShaderStageMesh ? 2 : 1;
unsigned component = cast<ConstantInt>(call->getArgOperand(index))->getZExtValue();
auto outputValue = call->getArgOperand(call->arg_size() - 1);
if (outputValue->getType()->getScalarSizeInBits() == 64)
component *= 2; // Component in location info is dword-based

InOutLocationInfo outLocInfo;
const unsigned location = locCandidate.first.getLocation();
outLocInfo.setLocation(location);
outLocInfo.setComponent(component);
if (m_shaderStage == ShaderStageGeometry)
outLocInfo.setStreamId(locCandidate.first.getStreamId());

auto &outLocInfoMap = m_resUsage->inOutUsage.outputLocInfoMap;
if (outLocInfoMap.count(outLocInfo) > 0) {
outLocInfoMap.erase(outLocInfo);
if (outputValue->getType()->getPrimitiveSizeInBits() > 128) {
// NOTE: For any data that is larger than <4 x dword>, there are two consecutive locations occupied.
outLocInfo.setLocation(location + 1);
outLocInfoMap.erase(outLocInfo);
}
}

// Remove transform location info if it exists
outLocInfo.setLocation(location);
auto &locInfoXfbOutInfoMap = m_resUsage->inOutUsage.locInfoXfbOutInfoMap;
if (locInfoXfbOutInfoMap.count(outLocInfo) > 0) {
locInfoXfbOutInfoMap.erase(outLocInfo);
if (outputValue->getType()->getPrimitiveSizeInBits() > 128) {
// NOTE: For any data that is larger than <4 x dword>, there are two consecutive locations occupied.
outLocInfo.setLocation(location + 1);
locInfoXfbOutInfoMap.erase(outLocInfo);
}
}
}
}
}
}

// =====================================================================================================================
// Create a locationInfo map for the given shader stage
//
Expand Down
24 changes: 18 additions & 6 deletions llpc/test/shaderdb/general/UndefVertexOutput.spvasm
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
OpCapability Shader
%1 = OpExtInstImport "GLSL.std.450"
OpMemoryModel Logical GLSL450
OpEntryPoint Vertex %2 "main" %3 %gl_Position %5 %6 %7
OpEntryPoint Vertex %2 "main" %3 %gl_Position %5 %6 %7 %9 %11
%8 = OpString ""
OpDecorate %3 Location 1
OpDecorate %3 Binding 0
Expand All @@ -19,6 +19,12 @@
OpDecorate %6 Binding 0
OpDecorate %7 Location 8
OpDecorate %7 Binding 0
OpDecorate %9 Location 9
OpDecorate %9 Component 0
OpDecorate %9 Binding 0
OpDecorate %11 Location 9
OpDecorate %11 Component 1
OpDecorate %11 Binding 0
%void = OpTypeVoid
%10 = OpTypeFunction %void
%float = OpTypeFloat 32
Expand All @@ -29,10 +35,14 @@
%_ptr_Output_v4float = OpTypePointer Output %v4float
%gl_Position = OpVariable %_ptr_Output_v4float Output
%_ptr_Output_v3float = OpTypePointer Output %v3float
%_ptr_Output_float = OpTypePointer Output %float
%5 = OpVariable %_ptr_Output_v3float Output
%6 = OpVariable %_ptr_Output_v4float Output
%7 = OpVariable %_ptr_Output_v4float Output
%9 = OpVariable %_ptr_Output_float Output
%11 = OpVariable %_ptr_Output_float Output
%17 = OpUndef %v3float
%24 = OpUndef %float
%float_0 = OpConstant %float 0
%float_1 = OpConstant %float 1
%20 = OpConstantComposite %v4float %float_1 %float_0 %float_0 %float_1
Expand All @@ -44,21 +54,23 @@
OpStore %5 %17
OpStore %6 %21
OpStore %7 %20
OpStore %9 %24
OpStore %11 %float_0
OpReturn
OpFunctionEnd

; CHECK-LABEL: amdgpu_vs_main:
; CHECK: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_mov_b32 s0, s1
; CHECK-NEXT: s_mov_b32 s1, s5
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
; CHECK-NEXT: v_add_u32_e32 v0, s2, v0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: v_mov_b32_e32 v5, 1.0
; CHECK-NEXT: v_mov_b32_e32 v4, 1.0
; CHECK-NEXT: v_mov_b32_e32 v5, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[4:7], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: exp pos0 v0, v1, v2, v3 done
; CHECK-NEXT: exp param8 v5, v4, v4, v5
; CHECK-NEXT: exp param7 v4, v4, v5, v5
; CHECK-NEXT: exp param7 v5, v5, v4, v4
; CHECK-NEXT: exp param9 off, v5, off, off
; CHECK-NEXT: exp param8 v4, v5, v5, v4
; CHECK-NEXT: s_endpgm

0 comments on commit 3d7228a

Please sign in to comment.