LLVM  8.0.1
DivergenceAnalysis.cpp
Go to the documentation of this file.
1 //===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements a general divergence analysis for loop vectorization
11 // and GPU programs. It determines which branches and values in a loop or GPU
12 // program are divergent. It can help branch optimizations such as jump
13 // threading and loop unswitching to make better decisions.
14 //
15 // GPU programs typically use the SIMD execution model, where multiple threads
16 // in the same execution group have to execute in lock-step. Therefore, if the
17 // code contains divergent branches (i.e., threads in a group do not agree on
18 // which path of the branch to take), the group of threads has to execute all
19 // the paths from that branch with different subsets of threads enabled until
20 // they re-converge.
21 //
22 // Due to this execution model, some optimizations such as jump
23 // threading and loop unswitching can interfere with thread re-convergence.
24 // Therefore, an analysis that computes which branches in a GPU program are
25 // divergent can help the compiler to selectively run these optimizations.
26 //
27 // This implementation is derived from the Vectorization Analysis of the
28 // Region Vectorizer (RV). That implementation in turn is based on the approach
29 // described in
30 //
31 // Improving Performance of OpenCL on CPUs
32 // Ralf Karrenberg and Sebastian Hack
33 // CC '12
34 //
35 // This DivergenceAnalysis implementation is generic in the sense that it does
36 // not itself identify original sources of divergence.
37 // Instead specialized adapter classes, (LoopDivergenceAnalysis) for loops and
38 // (GPUDivergenceAnalysis) for GPU programs, identify the sources of divergence
39 // (e.g., special variables that hold the thread ID or the iteration variable).
40 //
41 // The generic implementation propagates divergence to variables that are data
42 // or sync dependent on a source of divergence.
43 //
44 // While data dependency is a well-known concept, the notion of sync dependency
45 // is worth more explanation. Sync dependence characterizes the control flow
46 // aspect of the propagation of branch divergence. For example,
47 //
48 // %cond = icmp slt i32 %tid, 10
49 // br i1 %cond, label %then, label %else
50 // then:
51 // br label %merge
52 // else:
53 // br label %merge
54 // merge:
55 // %a = phi i32 [ 0, %then ], [ 1, %else ]
56 //
57 // Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
58 // because %tid is not on its use-def chains, %a is sync dependent on %tid
59 // because the branch "br i1 %cond" depends on %tid and affects which value %a
60 // is assigned to.
61 //
62 // The sync dependence detection (which branch induces divergence in which join
63 // points) is implemented in the SyncDependenceAnalysis.
64 //
65 // The current DivergenceAnalysis implementation has the following limitations:
66 // 1. intra-procedural. It conservatively considers the arguments of a
67 // non-kernel-entry function and the return value of a function call as
68 // divergent.
69 // 2. memory as black box. It conservatively considers values loaded from
70 // generic or local address as divergent. This can be improved by leveraging
71 // pointer analysis and/or by modelling non-escaping memory objects in SSA
72 // as done in RV.
73 //
74 //===----------------------------------------------------------------------===//
75 
77 #include "llvm/Analysis/LoopInfo.h"
78 #include "llvm/Analysis/Passes.h"
81 #include "llvm/IR/Dominators.h"
82 #include "llvm/IR/InstIterator.h"
83 #include "llvm/IR/Instructions.h"
84 #include "llvm/IR/IntrinsicInst.h"
85 #include "llvm/IR/Value.h"
86 #include "llvm/Support/Debug.h"
88 #include <vector>
89 
90 using namespace llvm;
91 
92 #define DEBUG_TYPE "divergence-analysis"
93 
94 // class DivergenceAnalysis
96  const Function &F, const Loop *RegionLoop, const DominatorTree &DT,
97  const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm)
98  : F(F), RegionLoop(RegionLoop), DT(DT), LI(LI), SDA(SDA),
99  IsLCSSAForm(IsLCSSAForm) {}
100 
102  assert(isa<Instruction>(DivVal) || isa<Argument>(DivVal));
103  assert(!isAlwaysUniform(DivVal) && "cannot be a divergent");
104  DivergentValues.insert(&DivVal);
105 }
106 
108  UniformOverrides.insert(&UniVal);
109 }
110 
111 bool DivergenceAnalysis::updateTerminator(const Instruction &Term) const {
112  if (Term.getNumSuccessors() <= 1)
113  return false;
114  if (auto *BranchTerm = dyn_cast<BranchInst>(&Term)) {
115  assert(BranchTerm->isConditional());
116  return isDivergent(*BranchTerm->getCondition());
117  }
118  if (auto *SwitchTerm = dyn_cast<SwitchInst>(&Term)) {
119  return isDivergent(*SwitchTerm->getCondition());
120  }
121  if (isa<InvokeInst>(Term)) {
122  return false; // ignore abnormal executions through landingpad
123  }
124 
125  llvm_unreachable("unexpected terminator");
126 }
127 
128 bool DivergenceAnalysis::updateNormalInstruction(const Instruction &I) const {
129  // TODO function calls with side effects, etc
130  for (const auto &Op : I.operands()) {
131  if (isDivergent(*Op))
132  return true;
133  }
134  return false;
135 }
136 
137 bool DivergenceAnalysis::isTemporalDivergent(const BasicBlock &ObservingBlock,
138  const Value &Val) const {
139  const auto *Inst = dyn_cast<const Instruction>(&Val);
140  if (!Inst)
141  return false;
142  // check whether any divergent loop carrying Val terminates before control
143  // proceeds to ObservingBlock
144  for (const auto *Loop = LI.getLoopFor(Inst->getParent());
145  Loop != RegionLoop && !Loop->contains(&ObservingBlock);
146  Loop = Loop->getParentLoop()) {
147  if (DivergentLoops.find(Loop) != DivergentLoops.end())
148  return true;
149  }
150 
151  return false;
152 }
153 
154 bool DivergenceAnalysis::updatePHINode(const PHINode &Phi) const {
155  // joining divergent disjoint path in Phi parent block
156  if (!Phi.hasConstantOrUndefValue() && isJoinDivergent(*Phi.getParent())) {
157  return true;
158  }
159 
160  // An incoming value could be divergent by itself.
161  // Otherwise, an incoming value could be uniform within the loop
162  // that carries its definition but it may appear divergent
163  // from outside the loop. This happens when divergent loop exits
164  // drop definitions of that uniform value in different iterations.
165  //
166  // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop
167  // if (i % thread_id == 0) break; // divergent loop exit
168  // }
169  // int divI = i; // divI is divergent
170  for (size_t i = 0; i < Phi.getNumIncomingValues(); ++i) {
171  const auto *InVal = Phi.getIncomingValue(i);
172  if (isDivergent(*Phi.getIncomingValue(i)) ||
173  isTemporalDivergent(*Phi.getParent(), *InVal)) {
174  return true;
175  }
176  }
177  return false;
178 }
179 
181  return I.getParent() && inRegion(*I.getParent());
182 }
183 
185  return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB);
186 }
187 
188 // marks all users of loop-carried values of the loop headed by LoopHeader as
189 // divergent
// All loop live-outs of a divergent loop become divergent: a DFS over the
// dominance region of LoopHeader (seeded with the loop's exit blocks) marks
// every user of a value defined inside the loop.
190 void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &LoopHeader) {
191  auto *DivLoop = LI.getLoopFor(&LoopHeader);
192  assert(DivLoop && "loopHeader is not actually part of a loop");
193 
// Seed the traversal with all exit blocks of the divergent loop.
194  SmallVector<BasicBlock *, 8> TaintStack;
195  DivLoop->getExitBlocks(TaintStack);
196 
197  // Otherwise potential users of loop-carried values could be anywhere in the
198  // dominance region of DivLoop (including its fringes for phi nodes)
// NOTE(review): the declaration of the `Visited` set (original line 199) was
// dropped by this extract — `Visited` is used below without a visible
// declaration here; confirm against the full source.
200  for (auto *Block : TaintStack) {
201  Visited.insert(Block);
202  }
// Mark the header visited so the DFS never re-enters the loop itself.
203  Visited.insert(&LoopHeader);
204 
205  while (!TaintStack.empty()) {
206  auto *UserBlock = TaintStack.back();
207  TaintStack.pop_back();
208 
209  // don't spread divergence beyond the region
210  if (!inRegion(*UserBlock))
211  continue;
212 
// Reducible CFG invariant: the DFS (which skips the loop) must not land
// back inside DivLoop.
213  assert(!DivLoop->contains(UserBlock) &&
214  "irreducible control flow detected");
215 
216  // phi nodes at the fringes of the dominance region
217  if (!DT.dominates(&LoopHeader, UserBlock)) {
218  // all PHI nodes of UserBlock become divergent
219  for (auto &Phi : UserBlock->phis()) {
220  Worklist.push_back(&Phi);
221  }
222  continue;
223  }
224 
225  // taint outside users of values carried by DivLoop
226  for (auto &I : *UserBlock) {
227  if (isAlwaysUniform(I))
228  continue;
229  if (isDivergent(I))
230  continue;
231 
// Any operand defined inside the divergent loop taints this user.
232  for (auto &Op : I.operands()) {
233  auto *OpInst = dyn_cast<Instruction>(&Op);
234  if (!OpInst)
235  continue;
236  if (DivLoop->contains(OpInst->getParent())) {
237  markDivergent(I);
238  pushUsers(I);
239  break;
240  }
241  }
242  }
243 
244  // visit all blocks in the dominance region
245  for (auto *SuccBlock : successors(UserBlock)) {
246  if (!Visited.insert(SuccBlock).second) {
247  continue;
248  }
249  TaintStack.push_back(SuccBlock);
250  }
251  }
252 }
253 
254 void DivergenceAnalysis::pushPHINodes(const BasicBlock &Block) {
255  for (const auto &Phi : Block.phis()) {
256  if (isDivergent(Phi))
257  continue;
258  Worklist.push_back(&Phi);
259  }
260 }
261 
262 void DivergenceAnalysis::pushUsers(const Value &V) {
263  for (const auto *User : V.users()) {
264  const auto *UserInst = dyn_cast<const Instruction>(User);
265  if (!UserInst)
266  continue;
267 
268  if (isDivergent(*UserInst))
269  continue;
270 
271  // only compute divergent inside loop
272  if (!inRegion(*UserInst))
273  continue;
274  Worklist.push_back(UserInst);
275  }
276 }
277 
278 bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &JoinBlock,
279  const Loop *BranchLoop) {
280  LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n");
281 
282  // ignore divergence outside the region
283  if (!inRegion(JoinBlock)) {
284  return false;
285  }
286 
287  // push non-divergent phi nodes in JoinBlock to the worklist
288  pushPHINodes(JoinBlock);
289 
290  // JoinBlock is a divergent loop exit
291  if (BranchLoop && !BranchLoop->contains(&JoinBlock)) {
292  return true;
293  }
294 
295  // disjoint-paths divergent at JoinBlock
296  markBlockJoinDivergent(JoinBlock);
297  return false;
298 }
299 
300 void DivergenceAnalysis::propagateBranchDivergence(const Instruction &Term) {
301  LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n");
302 
303  markDivergent(Term);
304 
305  const auto *BranchLoop = LI.getLoopFor(Term.getParent());
306 
307  // whether there is a divergent loop exit from BranchLoop (if any)
308  bool IsBranchLoopDivergent = false;
309 
310  // iterate over all blocks reachable by disjoint from Term within the loop
311  // also iterates over loop exits that become divergent due to Term.
312  for (const auto *JoinBlock : SDA.join_blocks(Term)) {
313  IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
314  }
315 
316  // Branch loop is a divergent loop due to the divergent branch in Term
317  if (IsBranchLoopDivergent) {
318  assert(BranchLoop);
319  if (!DivergentLoops.insert(BranchLoop).second) {
320  return;
321  }
322  propagateLoopDivergence(*BranchLoop);
323  }
324 }
325 
326 void DivergenceAnalysis::propagateLoopDivergence(const Loop &ExitingLoop) {
327  LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getName() << "\n");
328 
329  // don't propagate beyond region
330  if (!inRegion(*ExitingLoop.getHeader()))
331  return;
332 
333  const auto *BranchLoop = ExitingLoop.getParentLoop();
334 
335  // Uses of loop-carried values could occur anywhere
336  // within the dominance region of the definition. All loop-carried
337  // definitions are dominated by the loop header (reducible control).
338  // Thus all users have to be in the dominance region of the loop header,
339  // except PHI nodes that can also live at the fringes of the dom region
340  // (incoming defining value).
341  if (!IsLCSSAForm)
342  taintLoopLiveOuts(*ExitingLoop.getHeader());
343 
344  // whether there is a divergent loop exit from BranchLoop (if any)
345  bool IsBranchLoopDivergent = false;
346 
347  // iterate over all blocks reachable by disjoint paths from exits of
348  // ExitingLoop also iterates over loop exits (of BranchLoop) that in turn
349  // become divergent.
350  for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) {
351  IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
352  }
353 
354  // Branch loop is a divergent due to divergent loop exit in ExitingLoop
355  if (IsBranchLoopDivergent) {
356  assert(BranchLoop);
357  if (!DivergentLoops.insert(BranchLoop).second) {
358  return;
359  }
360  propagateLoopDivergence(*BranchLoop);
361  }
362 }
363 
// Fixed-point worklist propagation of divergence across the region.
// NOTE(review): the function's signature line (original line 364) was dropped
// by this extract; per the index entries below it is `void
// DivergenceAnalysis::compute()` — confirm against the full source.
// Seed the worklist with the users of every known divergent value.
365  for (auto *DivVal : DivergentValues) {
366  pushUsers(*DivVal);
367  }
368 
369  // propagate divergence
// Pop instructions until no more divergence can be derived.
370  while (!Worklist.empty()) {
371  const Instruction &I = *Worklist.back();
372  Worklist.pop_back();
373 
374  // maintain uniformity of overrides
375  if (isAlwaysUniform(I))
376  continue;
377 
// Already-divergent instructions never need re-evaluation (divergence
// only grows monotonically).
378  bool WasDivergent = isDivergent(I);
379  if (WasDivergent)
380  continue;
381 
382  // propagate divergence caused by terminator
383  if (I.isTerminator()) {
384  if (updateTerminator(I)) {
385  // propagate control divergence to affected instructions
386  propagateBranchDivergence(I);
387  continue;
388  }
389  }
390 
391  // update divergence of I due to divergent operands
// Phis need sync-dependence / temporal-divergence handling; all other
// instructions only propagate plain data dependence.
392  bool DivergentUpd = false;
393  const auto *Phi = dyn_cast<const PHINode>(&I);
394  if (Phi) {
395  DivergentUpd = updatePHINode(*Phi);
396  } else {
397  DivergentUpd = updateNormalInstruction(I);
398  }
399 
400  // propagate value divergence to users
401  if (DivergentUpd) {
402  markDivergent(I);
403  pushUsers(I);
404  }
405  }
406 }
407 
409  return UniformOverrides.find(&V) != UniformOverrides.end();
410 }
411 
413  return DivergentValues.find(&V) != DivergentValues.end();
414 }
415 
416 void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
417  if (DivergentValues.empty())
418  return;
419  // iterate instructions using instructions() to ensure a deterministic order.
420  for (auto &I : instructions(F)) {
421  if (isDivergent(I))
422  OS << "DIVERGENT:" << I << '\n';
423  }
424 }
425 
426 // class GPUDivergenceAnalysis
// GPU front end: seeds the generic DivergenceAnalysis with the target's
// sources of divergence, then runs the propagation.
// NOTE(review): the constructor's opening signature line (original line 427)
// was dropped by this extract; the index entry below shows it as
// `GPUDivergenceAnalysis(Function &F, ...)` — confirm against the full source.
428  const DominatorTree &DT,
429  const PostDominatorTree &PDT,
430  const LoopInfo &LI,
431  const TargetTransformInfo &TTI)
432  : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) {
// Seed: instructions the target reports as sources of divergence become
// divergent; always-uniform instructions are pinned uniform.
433  for (auto &I : instructions(F)) {
434  if (TTI.isSourceOfDivergence(&I)) {
435  DA.markDivergent(I);
436  } else if (TTI.isAlwaysUniform(&I)) {
437  DA.addUniformOverride(I);
438  }
439  }
// Arguments (e.g. thread-ID-dependent kernel parameters) can also be
// divergence sources.
440  for (auto &Arg : F.args()) {
441  if (TTI.isSourceOfDivergence(&Arg)) {
442  DA.markDivergent(Arg);
443  }
444  }
445 
// Run the fixed-point propagation over the whole function.
446  DA.compute();
447 }
448 
450  return DA.isDivergent(val);
451 }
452 
453 void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const {
454  OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
455  DA.print(OS, mod);
456  OS << "}\n";
457 }
This class represents lattice values for constants.
Definition: AllocatorList.h:24
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:65
void addUniformOverride(const Value &UniVal)
Mark UniVal as a value that is always uniform.
Implements a dense probed hash-table based set.
Definition: DenseSet.h:250
bool isTerminator() const
Definition: Instruction.h:129
const ConstBlockSet & join_blocks(const Instruction &Term)
Computes divergent join points and loop exits caused by branch divergence in Term.
F(f)
DivergenceAnalysis(const Function &F, const Loop *RegionLoop, const DominatorTree &DT, const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm)
This instance will analyze the whole function F or the loop RegionLoop.
void compute()
Propagate divergence to all instructions in the region.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:690
bool isDivergent(const Value &V) const
Whether V is divergent.
const Function & getFunction() const
BlockT * getHeader() const
Definition: LoopInfo.h:100
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:145
unsigned getNumSuccessors() const
Return the number of successors that this instruction has.
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
bool isAlwaysUniform(const Value *V) const
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:188
Relates points of divergent control to join points in reducible CFGs.
op_range operands()
Definition: User.h:238
GPUDivergenceAnalysis(Function &F, const DominatorTree &DT, const PostDominatorTree &PDT, const LoopInfo &LI, const TargetTransformInfo &TTI)
Runs the divergence analysis on F, a GPU kernel.
bool hasConstantOrUndefValue() const
Whether the specified PHI node always merges together the same value, assuming undefs are equal to a unique non-undef value.
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:110
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:847
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:249
bool isSourceOfDivergence(const Value *V) const
Returns whether V is a source of divergence.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:133
iterator_range< user_iterator > users()
Definition: Value.h:400
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
amdgpu Simplify well known AMD library false Value Value * Arg
LoopT * getParentLoop() const
Definition: LoopInfo.h:101
StringRef getName() const
Definition: LoopInfo.h:589
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:465
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
#define I(x, y, z)
Definition: MD5.cpp:58
bool inRegion(const BasicBlock &BB) const
Whether BB is part of the region.
bool isDivergent(const Value &Val) const
Whether Val is a divergent value.
iterator find(const_arg_type_t< ValueT > V)
Definition: DenseSet.h:166
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:325
bool isAlwaysUniform(const Value &Val) const
Whether Val will always return a uniform value regardless of its operands.
void markDivergent(const Value &DivVal)
Mark DivVal as a value that is always divergent.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM Value Representation.
Definition: Value.h:73
succ_range successors(Instruction *I)
Definition: CFG.h:264
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:46
inst_range instructions(Function *F)
Definition: InstIterator.h:134
This pass exposes codegen information to IR-level passes.
void print(raw_ostream &OS, const Module *) const
#define LLVM_DEBUG(X)
Definition: Debug.h:123
void print(raw_ostream &OS, const Module *) const
Print all divergent values in the kernel.
iterator_range< arg_iterator > args()
Definition: Function.h:689
const BasicBlock * getParent() const
Definition: Instruction.h:67