[DA] GPUDivergenceAnalysis for unstructured GPU kernels
Summary:
This is patch #3 of the new DivergenceAnalysis:

  <https://lists.llvm.org/pipermail/llvm-dev/2018-May/123606.html>

The GPUDivergenceAnalysis is intended to eventually supersede the existing
LegacyDivergenceAnalysis, which produces incorrect results on unstructured
control-flow graphs:

  <https://bugs.llvm.org/show_bug.cgi?id=37185>

This patch adds the option -use-gpu-divergence-analysis to the
LegacyDivergenceAnalysis to turn it into a transparent wrapper for the
GPUDivergenceAnalysis.
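
For reference, the wrapper can be exercised from the command line exactly
as the new tests below do (the input file name here is a placeholder):

  opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence \
      -use-gpu-divergence-analysis kernel.ll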

Reviewers: nhaehnle

Reviewed By: nhaehnle

Subscribers: jholewinski, jvesely, jfb, llvm-commits, alex-t, sameerds, arsenm, nhaehnle

Differential Revision: https://reviews.llvm.org/D53493

llvm-svn: 348048
nhaehnle committed Nov 30, 2018
1 parent 39298ca commit 56d0ed2
Showing 23 changed files with 1,359 additions and 27 deletions.
27 changes: 27 additions & 0 deletions llvm/include/llvm/Analysis/DivergenceAnalysis.h
@@ -173,6 +173,33 @@ class DivergenceAnalysis {
std::vector<const Instruction *> Worklist;
};

/// \brief Divergence analysis frontend for GPU kernels.
class GPUDivergenceAnalysis {
SyncDependenceAnalysis SDA;
DivergenceAnalysis DA;

public:
/// Runs the divergence analysis on @F, a GPU kernel
GPUDivergenceAnalysis(Function &F, const DominatorTree &DT,
const PostDominatorTree &PDT, const LoopInfo &LI,
const TargetTransformInfo &TTI);

/// Whether any divergence was detected.
bool hasDivergence() const { return DA.hasDetectedDivergence(); }

/// The GPU kernel this analysis result is for
const Function &getFunction() const { return DA.getFunction(); }

/// Whether \p V is divergent.
bool isDivergent(const Value &V) const;

/// Whether \p V is uniform/non-divergent
bool isUniform(const Value &V) const { return !isDivergent(V); }

/// Print all divergent values in the kernel.
void print(raw_ostream &OS, const Module *) const;
};

} // namespace llvm

#endif // LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
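
A minimal sketch of how a consumer could drive the interface declared above.
The helper name and the way DT/PDT/LI/TTI are obtained are assumptions for
illustration; nothing in this snippet is part of the patch itself:

  #include "llvm/Analysis/DivergenceAnalysis.h"
  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/Analysis/PostDominators.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Dominators.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  // Hypothetical consumer: the analyses are assumed to come from the usual
  // wrapper passes (DominatorTreeWrapperPass etc.).
  static void reportDivergence(Function &F, const DominatorTree &DT,
                               const PostDominatorTree &PDT,
                               const LoopInfo &LI,
                               const TargetTransformInfo &TTI) {
    GPUDivergenceAnalysis GPUDA(F, DT, PDT, LI, TTI);
    if (GPUDA.hasDivergence())            // any divergent values detected?
      GPUDA.print(errs(), F.getParent()); // dump them, one per line
  }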
10 changes: 9 additions & 1 deletion llvm/include/llvm/Analysis/LegacyDivergenceAnalysis.h
Expand Up @@ -19,9 +19,11 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/Analysis/DivergenceAnalysis.h"

namespace llvm {
class Value;
class GPUDivergenceAnalysis;
class LegacyDivergenceAnalysis : public FunctionPass {
public:
static char ID;
@@ -41,7 +43,7 @@ class LegacyDivergenceAnalysis : public FunctionPass {
//
// Even if this function returns false, V may still be divergent when used
// in a different basic block.
bool isDivergent(const Value *V) const { return DivergentValues.count(V); }
bool isDivergent(const Value *V) const;

// Returns true if V is uniform/non-divergent.
//
@@ -53,6 +55,12 @@ class LegacyDivergenceAnalysis : public FunctionPass {
void removeValue(const Value *V) { DivergentValues.erase(V); }

private:
// Whether analysis should be performed by GPUDivergenceAnalysis.
bool shouldUseGPUDivergenceAnalysis(const Function &F) const;

// (optional) handle to new DivergenceAnalysis
std::unique_ptr<GPUDivergenceAnalysis> gpuDA;

// Stores all divergent values.
DenseSet<const Value *> DivergentValues;
};
33 changes: 33 additions & 0 deletions llvm/lib/Analysis/DivergenceAnalysis.cpp
@@ -422,3 +422,36 @@ void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
OS << "DIVERGENT:" << I << '\n';
}
}

// class GPUDivergenceAnalysis
GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F,
const DominatorTree &DT,
const PostDominatorTree &PDT,
const LoopInfo &LI,
const TargetTransformInfo &TTI)
: SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) {
for (auto &I : instructions(F)) {
if (TTI.isSourceOfDivergence(&I)) {
DA.markDivergent(I);
} else if (TTI.isAlwaysUniform(&I)) {
DA.addUniformOverride(I);
}
}
for (auto &Arg : F.args()) {
if (TTI.isSourceOfDivergence(&Arg)) {
DA.markDivergent(Arg);
}
}

DA.compute();
}

bool GPUDivergenceAnalysis::isDivergent(const Value &val) const {
return DA.isDivergent(val);
}

void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const {
OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
DA.print(OS, mod);
OS << "}\n";
}
101 changes: 75 additions & 26 deletions llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp
@@ -1,4 +1,5 @@
//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis Implementation -==//
//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis
//Implementation -==//
//
// The LLVM Compiler Infrastructure
//
@@ -64,6 +65,9 @@
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/PostDominators.h"
@@ -79,6 +83,12 @@ using namespace llvm;

#define DEBUG_TYPE "divergence"

// transparently use the GPUDivergenceAnalysis
static cl::opt<bool> UseGPUDA("use-gpu-divergence-analysis", cl::init(false),
cl::Hidden,
cl::desc("turn the LegacyDivergenceAnalysis into "
"a wrapper for GPUDivergenceAnalysis"));

namespace {

class DivergencePropagator {
@@ -262,16 +272,17 @@ void DivergencePropagator::propagate() {
}
}

} /// end namespace anonymous
} // namespace

// Register this pass.
char LegacyDivergenceAnalysis::ID = 0;
INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis",
false, true)
INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence",
"Legacy Divergence Analysis", false, true)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis",
false, true)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence",
"Legacy Divergence Analysis", false, true)

FunctionPass *llvm::createLegacyDivergenceAnalysisPass() {
return new LegacyDivergenceAnalysis();
@@ -280,9 +291,24 @@ FunctionPass *llvm::createLegacyDivergenceAnalysisPass() {
void LegacyDivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<PostDominatorTreeWrapperPass>();
if (UseGPUDA)
AU.addRequired<LoopInfoWrapperPass>();
AU.setPreservesAll();
}

bool LegacyDivergenceAnalysis::shouldUseGPUDivergenceAnalysis(
const Function &F) const {
if (!UseGPUDA)
return false;

// GPUDivergenceAnalysis requires a reducible CFG.
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
using RPOTraversal = ReversePostOrderTraversal<const Function *>;
RPOTraversal FuncRPOT(&F);
return !containsIrreducibleCFG<const BasicBlock *, const RPOTraversal,
const LoopInfo>(FuncRPOT, LI);
}

bool LegacyDivergenceAnalysis::runOnFunction(Function &F) {
auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
if (TTIWP == nullptr)
@@ -295,44 +321,67 @@ bool LegacyDivergenceAnalysis::runOnFunction(Function &F) {
return false;

DivergentValues.clear();
gpuDA = nullptr;

auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
DivergencePropagator DP(F, TTI,
getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
PDT, DivergentValues);
DP.populateWithSourcesOfDivergence();
DP.propagate();
LLVM_DEBUG(
dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n";
print(dbgs(), F.getParent())
);

if (shouldUseGPUDivergenceAnalysis(F)) {
// run the new GPU divergence analysis
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
gpuDA = llvm::make_unique<GPUDivergenceAnalysis>(F, DT, PDT, LI, TTI);

} else {
// run LLVM's existing DivergenceAnalysis
DivergencePropagator DP(F, TTI, DT, PDT, DivergentValues);
DP.populateWithSourcesOfDivergence();
DP.propagate();
}

LLVM_DEBUG(dbgs() << "\nAfter divergence analysis on " << F.getName()
<< ":\n";
print(dbgs(), F.getParent()));

return false;
}

bool LegacyDivergenceAnalysis::isDivergent(const Value *V) const {
if (gpuDA) {
return gpuDA->isDivergent(*V);
}
return DivergentValues.count(V);
}

void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
if (DivergentValues.empty())
if ((!gpuDA || !gpuDA->hasDivergence()) && DivergentValues.empty())
return;
const Value *FirstDivergentValue = *DivergentValues.begin();

const Function *F;
if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) {
F = Arg->getParent();
} else if (const Instruction *I =
dyn_cast<Instruction>(FirstDivergentValue)) {
F = I->getParent()->getParent();
} else {
llvm_unreachable("Only arguments and instructions can be divergent");
if (!DivergentValues.empty()) {
const Value *FirstDivergentValue = *DivergentValues.begin();
if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) {
F = Arg->getParent();
} else if (const Instruction *I =
dyn_cast<Instruction>(FirstDivergentValue)) {
F = I->getParent()->getParent();
} else {
llvm_unreachable("Only arguments and instructions can be divergent");
}
} else if (gpuDA) {
F = &gpuDA->getFunction();
}

// Dumps all divergent values in F, arguments and then instructions.
for (auto &Arg : F->args()) {
OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : " ");
OS << (isDivergent(&Arg) ? "DIVERGENT: " : " ");
OS << Arg << "\n";
}
// Iterate instructions using instructions() to ensure a deterministic order.
for (auto BI = F->begin(), BE = F->end(); BI != BE; ++BI) {
auto &BB = *BI;
OS << "\n " << BB.getName() << ":\n";
for (auto &I : BB.instructionsWithoutDebug()) {
OS << (DivergentValues.count(&I) ? "DIVERGENT: " : " ");
OS << (isDivergent(&I) ? "DIVERGENT: " : " ");
OS << I << "\n";
}
}
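
The containsIrreducibleCFG guard above is what keeps the new analysis off
CFGs it cannot handle. A hypothetical kernel it would reject looks like
this: the cycle formed by %loop and %loop2 can be entered both from %entry
and from %other, so the CFG is irreducible and the wrapper falls back to
the legacy propagator (illustration only, not a test from this patch):

  define amdgpu_kernel void @irreducible(i1 %c, i1 %d) {
  entry:
    br i1 %c, label %loop, label %other
  other:
    br label %loop2
  loop:
    br label %loop2
  loop2:
    br i1 %d, label %loop, label %exit
  exit:
    ret void
  }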
14 changes: 14 additions & 0 deletions llvm/test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll
@@ -0,0 +1,14 @@
; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s

define amdgpu_kernel void @workitem_id_x() #1 {
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x()
%first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x)
; CHECK-NOT: DIVERGENT: %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x)
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.readfirstlane(i32) #0

attributes #0 = { nounwind readnone }
45 changes: 45 additions & 0 deletions llvm/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
@@ -0,0 +1,45 @@
; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s

; CHECK: DIVERGENT: %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst
define i32 @test1(i32* %ptr, i32 %val) #0 {
%orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst
ret i32 %orig
}

; CHECK: DIVERGENT: %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
define {i32, i1} @test2(i32* %ptr, i32 %cmp, i32 %new) {
%orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
ret {i32, i1} %orig
}

; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
define i32 @test_atomic_inc_i32(i32 addrspace(1)* %ptr, i32 %val) #0 {
%ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
ret i32 %ret
}

; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
define i64 @test_atomic_inc_i64(i64 addrspace(1)* %ptr, i64 %val) #0 {
%ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
ret i64 %ret
}

; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
define i32 @test_atomic_dec_i32(i32 addrspace(1)* %ptr, i32 %val) #0 {
%ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
ret i32 %ret
}

; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
define i64 @test_atomic_dec_i64(i64 addrspace(1)* %ptr, i64 %val) #0 {
%ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
ret i64 %ret
}

declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1
declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1
declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1
declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind argmemonly }
26 changes: 26 additions & 0 deletions llvm/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll
@@ -0,0 +1,26 @@
; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s

define amdgpu_kernel void @hidden_diverge(i32 %n, i32 %a, i32 %b) #0 {
; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge'
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%cond.var = icmp slt i32 %tid, 0
br i1 %cond.var, label %B, label %C ; divergent
; CHECK: DIVERGENT: br i1 %cond.var,
B:
%cond.uni = icmp slt i32 %n, 0
br i1 %cond.uni, label %C, label %merge ; uniform
; CHECK-NOT: DIVERGENT: br i1 %cond.uni,
C:
%phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B ]
; CHECK: DIVERGENT: %phi.var.hidden = phi i32
br label %merge
merge:
%phi.ipd = phi i32 [ %a, %B ], [ %b, %C ]
; CHECK: DIVERGENT: %phi.ipd = phi i32
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone }