LLVM  8.0.1
LoopUnrollAndJam.cpp
Go to the documentation of this file.
1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements loop unroll and jam as a routine, much like
11 // LoopUnroll.cpp implements loop unroll.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/ADT/SmallPtrSet.h"
16 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/LoopPass.h"
27 #include "llvm/IR/BasicBlock.h"
28 #include "llvm/IR/DataLayout.h"
30 #include "llvm/IR/Dominators.h"
31 #include "llvm/IR/IntrinsicInst.h"
32 #include "llvm/IR/LLVMContext.h"
33 #include "llvm/Support/Debug.h"
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "loop-unroll-and-jam"
44 
45 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
46 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
47 
49 
50 // Partition blocks in an outer/inner loop pair into blocks before and after
51 // the loop
52 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
53  BasicBlockSet &ForeBlocks,
54  BasicBlockSet &SubLoopBlocks,
55  BasicBlockSet &AftBlocks,
56  DominatorTree *DT) {
57  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
58  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
59 
60  for (BasicBlock *BB : L->blocks()) {
61  if (!SubLoop->contains(BB)) {
62  if (DT->dominates(SubLoopLatch, BB))
63  AftBlocks.insert(BB);
64  else
65  ForeBlocks.insert(BB);
66  }
67  }
68 
69  // Check that all blocks in ForeBlocks together dominate the subloop
70  // TODO: This might ideally be done better with a dominator/postdominators.
71  BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
72  for (BasicBlock *BB : ForeBlocks) {
73  if (BB == SubLoopPreHeader)
74  continue;
75  Instruction *TI = BB->getTerminator();
76  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
77  if (!ForeBlocks.count(TI->getSuccessor(i)))
78  return false;
79  }
80 
81  return true;
82 }
83 
84 // Looks at the phi nodes in Header for values coming from Latch. For these
85 // instructions and all their operands calls Visit on them, keeping going for
86 // all the operands in AftBlocks. Returns false if Visit returns false,
87 // otherwise returns true. This is used to process the instructions in the
88 // Aft blocks that need to be moved before the subloop. It is used in two
89 // places. One to check that the required set of instructions can be moved
90 // before the loop. Then to collect the instructions to actually move in
91 // moveHeaderPhiOperandsToForeBlocks.
92 template <typename T>
93 static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
94  BasicBlockSet &AftBlocks, T Visit) {
96  for (auto &Phi : Header->phis()) {
97  Value *V = Phi.getIncomingValueForBlock(Latch);
98  if (Instruction *I = dyn_cast<Instruction>(V))
99  Worklist.push_back(I);
100  }
101 
102  while (!Worklist.empty()) {
103  Instruction *I = Worklist.back();
104  Worklist.pop_back();
105  if (!Visit(I))
106  return false;
107 
108  if (AftBlocks.count(I->getParent()))
109  for (auto &U : I->operands())
110  if (Instruction *II = dyn_cast<Instruction>(U))
111  Worklist.push_back(II);
112  }
113 
114  return true;
115 }
116 
117 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
119  BasicBlock *Latch,
120  Instruction *InsertLoc,
121  BasicBlockSet &AftBlocks) {
122  // We need to ensure we move the instructions in the correct order,
123  // starting with the earliest required instruction and moving forward.
124  std::vector<Instruction *> Visited;
125  processHeaderPhiOperands(Header, Latch, AftBlocks,
126  [&Visited, &AftBlocks](Instruction *I) {
127  if (AftBlocks.count(I->getParent()))
128  Visited.push_back(I);
129  return true;
130  });
131 
132  // Move all instructions in program order to before the InsertLoc
133  BasicBlock *InsertLocBB = InsertLoc->getParent();
134  for (Instruction *I : reverse(Visited)) {
135  if (I->getParent() != InsertLocBB)
136  I->moveBefore(InsertLoc);
137  }
138 }
139 
140 /*
141  This method performs Unroll and Jam. For a simple loop like:
142  for (i = ..)
143  Fore(i)
144  for (j = ..)
145  SubLoop(i, j)
146  Aft(i)
147 
148  Instead of doing normal inner or outer unrolling, we do:
149  for (i = .., i+=2)
150  Fore(i)
151  Fore(i+1)
152  for (j = ..)
153  SubLoop(i, j)
154  SubLoop(i+1, j)
155  Aft(i)
156  Aft(i+1)
157 
158  So the outer loop is essetially unrolled and then the inner loops are fused
159  ("jammed") together into a single loop. This can increase speed when there
160  are loads in SubLoop that are invariant to i, as they become shared between
161  the now jammed inner loops.
162 
163  We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
164  Fore blocks are those before the inner loop, Aft are those after. Normal
165  Unroll code is used to copy each of these sets of blocks and the results are
166  combined together into the final form above.
167 
168  isSafeToUnrollAndJam should be used prior to calling this to make sure the
169  unrolling will be valid. Checking profitablility is also advisable.
170 
171  If EpilogueLoop is non-null, it receives the epilogue loop (if it was
172  necessary to create one and not fully unrolled).
173 */
175  Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
176  bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
177  AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
178 
179  // When we enter here we should have already checked that it is safe
180  BasicBlock *Header = L->getHeader();
181  assert(L->getSubLoops().size() == 1);
182  Loop *SubLoop = *L->begin();
183 
184  // Don't enter the unroll code if there is nothing to do.
185  if (TripCount == 0 && Count < 2) {
186  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
188  }
189 
190  assert(Count > 0);
191  assert(TripMultiple > 0);
192  assert(TripCount == 0 || TripCount % TripMultiple == 0);
193 
194  // Are we eliminating the loop control altogether?
195  bool CompletelyUnroll = (Count == TripCount);
196 
197  // We use the runtime remainder in cases where we don't know trip multiple
198  if (TripMultiple == 1 || TripMultiple % Count != 0) {
199  if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
200  /*UseEpilogRemainder*/ true,
201  UnrollRemainder, LI, SE, DT, AC, true,
202  EpilogueLoop)) {
203  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
204  "generated when assuming runtime trip count\n");
206  }
207  }
208 
209  // Notify ScalarEvolution that the loop will be substantially changed,
210  // if not outright eliminated.
211  if (SE) {
212  SE->forgetLoop(L);
213  SE->forgetLoop(SubLoop);
214  }
215 
216  using namespace ore;
217  // Report the unrolling decision.
218  if (CompletelyUnroll) {
219  LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
220  << Header->getName() << " with trip count " << TripCount
221  << "!\n");
222  ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
223  L->getHeader())
224  << "completely unroll and jammed loop with "
225  << NV("UnrollCount", TripCount) << " iterations");
226  } else {
227  auto DiagBuilder = [&]() {
228  OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
229  L->getHeader());
230  return Diag << "unroll and jammed loop by a factor of "
231  << NV("UnrollCount", Count);
232  };
233 
234  LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
235  << " by " << Count);
236  if (TripMultiple != 1) {
237  LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
238  ORE->emit([&]() {
239  return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
240  << " trips per branch";
241  });
242  } else {
243  LLVM_DEBUG(dbgs() << " with run-time trip count");
244  ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
245  }
246  LLVM_DEBUG(dbgs() << "!\n");
247  }
248 
249  BasicBlock *Preheader = L->getLoopPreheader();
250  BasicBlock *LatchBlock = L->getLoopLatch();
251  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
252  assert(Preheader && LatchBlock && Header);
253  assert(BI && !BI->isUnconditional());
254  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
255  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
256  bool SubLoopContinueOnTrue = SubLoop->contains(
257  SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
258 
259  // Partition blocks in an outer/inner loop pair into blocks before and after
260  // the loop
261  BasicBlockSet SubLoopBlocks;
262  BasicBlockSet ForeBlocks;
263  BasicBlockSet AftBlocks;
264  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
265  DT);
266 
267  // We keep track of the entering/first and exiting/last block of each of
268  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
269  // blocks easier.
270  std::vector<BasicBlock *> ForeBlocksFirst;
271  std::vector<BasicBlock *> ForeBlocksLast;
272  std::vector<BasicBlock *> SubLoopBlocksFirst;
273  std::vector<BasicBlock *> SubLoopBlocksLast;
274  std::vector<BasicBlock *> AftBlocksFirst;
275  std::vector<BasicBlock *> AftBlocksLast;
276  ForeBlocksFirst.push_back(Header);
277  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
278  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
279  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
280  AftBlocksFirst.push_back(SubLoop->getExitBlock());
281  AftBlocksLast.push_back(L->getExitingBlock());
282  // Maps Blocks[0] -> Blocks[It]
283  ValueToValueMapTy LastValueMap;
284 
285  // Move any instructions from fore phi operands from AftBlocks into Fore.
287  Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
288  AftBlocks);
289 
290  // The current on-the-fly SSA update requires blocks to be processed in
291  // reverse postorder so that LastValueMap contains the correct value at each
292  // exit.
293  LoopBlocksDFS DFS(L);
294  DFS.perform(LI);
295  // Stash the DFS iterators before adding blocks to the loop.
296  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
297  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
298 
299  if (Header->getParent()->isDebugInfoForProfiling())
300  for (BasicBlock *BB : L->getBlocks())
301  for (Instruction &I : *BB)
302  if (!isa<DbgInfoIntrinsic>(&I))
303  if (const DILocation *DIL = I.getDebugLoc()) {
304  auto NewDIL = DIL->cloneWithDuplicationFactor(Count);
305  if (NewDIL)
306  I.setDebugLoc(NewDIL.getValue());
307  else
308  LLVM_DEBUG(dbgs()
309  << "Failed to create new discriminator: "
310  << DIL->getFilename() << " Line: " << DIL->getLine());
311  }
312 
313  // Copy all blocks
314  for (unsigned It = 1; It != Count; ++It) {
315  std::vector<BasicBlock *> NewBlocks;
316  // Maps Blocks[It] -> Blocks[It-1]
317  DenseMap<Value *, Value *> PrevItValueMap;
318 
319  for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
320  ValueToValueMapTy VMap;
321  BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
322  Header->getParent()->getBasicBlockList().push_back(New);
323 
324  if (ForeBlocks.count(*BB)) {
325  L->addBasicBlockToLoop(New, *LI);
326 
327  if (*BB == ForeBlocksFirst[0])
328  ForeBlocksFirst.push_back(New);
329  if (*BB == ForeBlocksLast[0])
330  ForeBlocksLast.push_back(New);
331  } else if (SubLoopBlocks.count(*BB)) {
332  SubLoop->addBasicBlockToLoop(New, *LI);
333 
334  if (*BB == SubLoopBlocksFirst[0])
335  SubLoopBlocksFirst.push_back(New);
336  if (*BB == SubLoopBlocksLast[0])
337  SubLoopBlocksLast.push_back(New);
338  } else if (AftBlocks.count(*BB)) {
339  L->addBasicBlockToLoop(New, *LI);
340 
341  if (*BB == AftBlocksFirst[0])
342  AftBlocksFirst.push_back(New);
343  if (*BB == AftBlocksLast[0])
344  AftBlocksLast.push_back(New);
345  } else {
346  llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
347  }
348 
349  // Update our running maps of newest clones
350  PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
351  LastValueMap[*BB] = New;
352  for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
353  VI != VE; ++VI) {
354  PrevItValueMap[VI->second] =
355  const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
356  LastValueMap[VI->first] = VI->second;
357  }
358 
359  NewBlocks.push_back(New);
360 
361  // Update DomTree:
362  if (*BB == ForeBlocksFirst[0])
363  DT->addNewBlock(New, ForeBlocksLast[It - 1]);
364  else if (*BB == SubLoopBlocksFirst[0])
365  DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
366  else if (*BB == AftBlocksFirst[0])
367  DT->addNewBlock(New, AftBlocksLast[It - 1]);
368  else {
369  // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
370  // structure.
371  auto BBDomNode = DT->getNode(*BB);
372  auto BBIDom = BBDomNode->getIDom();
373  BasicBlock *OriginalBBIDom = BBIDom->getBlock();
374  assert(OriginalBBIDom);
375  assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
376  DT->addNewBlock(
377  New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
378  }
379  }
380 
381  // Remap all instructions in the most recent iteration
382  for (BasicBlock *NewBlock : NewBlocks) {
383  for (Instruction &I : *NewBlock) {
384  ::remapInstruction(&I, LastValueMap);
385  if (auto *II = dyn_cast<IntrinsicInst>(&I))
386  if (II->getIntrinsicID() == Intrinsic::assume)
387  AC->registerAssumption(II);
388  }
389  }
390 
391  // Alter the ForeBlocks phi's, pointing them at the latest version of the
392  // value from the previous iteration's phis
393  for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
394  Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
395  assert(OldValue && "should have incoming edge from Aft[It]");
396  Value *NewValue = OldValue;
397  if (Value *PrevValue = PrevItValueMap[OldValue])
398  NewValue = PrevValue;
399 
400  assert(Phi.getNumOperands() == 2);
401  Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
402  Phi.setIncomingValue(0, NewValue);
403  Phi.removeIncomingValue(1);
404  }
405  }
406 
407  // Now that all the basic blocks for the unrolled iterations are in place,
408  // finish up connecting the blocks and phi nodes. At this point LastValueMap
409  // is the last unrolled iterations values.
410 
411  // Update Phis in BB from OldBB to point to NewBB
412  auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
413  BasicBlock *NewBB) {
414  for (PHINode &Phi : BB->phis()) {
415  int I = Phi.getBasicBlockIndex(OldBB);
416  Phi.setIncomingBlock(I, NewBB);
417  }
418  };
419  // Update Phis in BB from OldBB to point to NewBB and use the latest value
420  // from LastValueMap
421  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
422  BasicBlock *NewBB,
423  ValueToValueMapTy &LastValueMap) {
424  for (PHINode &Phi : BB->phis()) {
425  for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
426  if (Phi.getIncomingBlock(b) == OldBB) {
427  Value *OldValue = Phi.getIncomingValue(b);
428  if (Value *LastValue = LastValueMap[OldValue])
429  Phi.setIncomingValue(b, LastValue);
430  Phi.setIncomingBlock(b, NewBB);
431  break;
432  }
433  }
434  }
435  };
436  // Move all the phis from Src into Dest
437  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
438  Instruction *insertPoint = Dest->getFirstNonPHI();
439  while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
440  Phi->moveBefore(insertPoint);
441  };
442 
443  // Update the PHI values outside the loop to point to the last block
444  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
445  LastValueMap);
446 
447  // Update ForeBlocks successors and phi nodes
448  BranchInst *ForeTerm =
449  cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
450  BasicBlock *Dest = SubLoopBlocksFirst[0];
451  ForeTerm->setSuccessor(0, Dest);
452 
453  if (CompletelyUnroll) {
454  while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
455  Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
456  Phi->getParent()->getInstList().erase(Phi);
457  }
458  } else {
459  // Update the PHI values to point to the last aft block
460  updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
461  AftBlocksLast.back(), LastValueMap);
462  }
463 
464  for (unsigned It = 1; It != Count; It++) {
465  // Remap ForeBlock successors from previous iteration to this
466  BranchInst *ForeTerm =
467  cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
468  BasicBlock *Dest = ForeBlocksFirst[It];
469  ForeTerm->setSuccessor(0, Dest);
470  }
471 
472  // Subloop successors and phis
473  BranchInst *SubTerm =
474  cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
475  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
476  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
477  updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
478  ForeBlocksLast.back());
479  updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
480  SubLoopBlocksLast.back());
481 
482  for (unsigned It = 1; It != Count; It++) {
483  // Replace the conditional branch of the previous iteration subloop with an
484  // unconditional one to this one
485  BranchInst *SubTerm =
486  cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
487  BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
488  SubTerm->eraseFromParent();
489 
490  updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
491  ForeBlocksLast.back());
492  updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
493  SubLoopBlocksLast.back());
494  movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
495  }
496 
497  // Aft blocks successors and phis
498  BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
499  if (CompletelyUnroll) {
500  BranchInst::Create(LoopExit, Term);
501  Term->eraseFromParent();
502  } else {
503  Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
504  }
505  updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
506  SubLoopBlocksLast.back());
507 
508  for (unsigned It = 1; It != Count; It++) {
509  // Replace the conditional branch of the previous iteration subloop with an
510  // unconditional one to this one
511  BranchInst *AftTerm =
512  cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
513  BranchInst::Create(AftBlocksFirst[It], AftTerm);
514  AftTerm->eraseFromParent();
515 
516  updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
517  SubLoopBlocksLast.back());
518  movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
519  }
520 
521  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
522  // new ones required.
523  if (Count != 1) {
525  DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
526  SubLoopBlocksFirst[0]);
528  SubLoopBlocksLast[0], AftBlocksFirst[0]);
529 
531  ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
533  SubLoopBlocksLast.back(), AftBlocksFirst[0]);
534  DT->applyUpdates(DTUpdates);
535  }
536 
537  // Merge adjacent basic blocks, if possible.
538  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
539  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
540  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
541  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
542  while (!MergeBlocks.empty()) {
543  BasicBlock *BB = *MergeBlocks.begin();
545  if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
546  BasicBlock *Dest = Term->getSuccessor(0);
547  if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
548  // Don't remove BB and add Fold as they are the same BB
549  assert(Fold == BB);
550  (void)Fold;
551  MergeBlocks.erase(Dest);
552  } else
553  MergeBlocks.erase(BB);
554  } else
555  MergeBlocks.erase(BB);
556  }
557 
558  // At this point, the code is well formed. We now do a quick sweep over the
559  // inserted code, doing constant propagation and dead code elimination as we
560  // go.
561  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
562  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
563 
564  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
565  ++NumUnrolledAndJammed;
566 
567 #ifndef NDEBUG
568  // We shouldn't have done anything to break loop simplify form or LCSSA.
569  Loop *OuterL = L->getParentLoop();
570  Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
571  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
572  if (!CompletelyUnroll)
574  assert(SubLoop->isLoopSimplifyForm());
575  assert(DT->verify());
576 #endif
577 
578  // Update LoopInfo if the loop is completely removed.
579  if (CompletelyUnroll)
580  LI->erase(L);
581 
582  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
584 }
585 
586 static bool getLoadsAndStores(BasicBlockSet &Blocks,
587  SmallVector<Value *, 4> &MemInstr) {
588  // Scan the BBs and collect legal loads and stores.
589  // Returns false if non-simple loads/stores are found.
590  for (BasicBlock *BB : Blocks) {
591  for (Instruction &I : *BB) {
592  if (auto *Ld = dyn_cast<LoadInst>(&I)) {
593  if (!Ld->isSimple())
594  return false;
595  MemInstr.push_back(&I);
596  } else if (auto *St = dyn_cast<StoreInst>(&I)) {
597  if (!St->isSimple())
598  return false;
599  MemInstr.push_back(&I);
600  } else if (I.mayReadOrWriteMemory()) {
601  return false;
602  }
603  }
604  }
605  return true;
606 }
607 
610  unsigned LoopDepth, bool InnerLoop,
611  DependenceInfo &DI) {
612  // Use DA to check for dependencies between loads and stores that make unroll
613  // and jam invalid
614  for (Value *I : Earlier) {
615  for (Value *J : Later) {
616  Instruction *Src = cast<Instruction>(I);
617  Instruction *Dst = cast<Instruction>(J);
618  if (Src == Dst)
619  continue;
620  // Ignore Input dependencies.
621  if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
622  continue;
623 
624  // Track dependencies, and if we find them take a conservative approach
625  // by allowing only = or < (not >), altough some > would be safe
626  // (depending upon unroll width).
627  // For the inner loop, we need to disallow any (> <) dependencies
628  // FIXME: Allow > so long as distance is less than unroll width
629  if (auto D = DI.depends(Src, Dst, true)) {
630  assert(D->isOrdered() && "Expected an output, flow or anti dep.");
631 
632  if (D->isConfused()) {
633  LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
634  << " " << *Src << "\n"
635  << " " << *Dst << "\n");
636  return false;
637  }
638  if (!InnerLoop) {
639  if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
640  LLVM_DEBUG(dbgs() << " > dependency between:\n"
641  << " " << *Src << "\n"
642  << " " << *Dst << "\n");
643  return false;
644  }
645  } else {
646  assert(LoopDepth + 1 <= D->getLevels());
647  if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
648  D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
649  LLVM_DEBUG(dbgs() << " < > dependency between:\n"
650  << " " << *Src << "\n"
651  << " " << *Dst << "\n");
652  return false;
653  }
654  }
655  }
656  }
657  }
658  return true;
659 }
660 
661 static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
662  BasicBlockSet &SubLoopBlocks,
663  BasicBlockSet &AftBlocks, DependenceInfo &DI) {
664  // Get all loads/store pairs for each blocks
665  SmallVector<Value *, 4> ForeMemInstr;
666  SmallVector<Value *, 4> SubLoopMemInstr;
667  SmallVector<Value *, 4> AftMemInstr;
668  if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
669  !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
670  !getLoadsAndStores(AftBlocks, AftMemInstr))
671  return false;
672 
673  // Check for dependencies between any blocks that may change order
674  unsigned LoopDepth = L->getLoopDepth();
675  return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
676  DI) &&
677  checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
678  checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
679  DI) &&
680  checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
681  DI);
682 }
683 
685  DependenceInfo &DI) {
686  /* We currently handle outer loops like this:
687  |
688  ForeFirst <----\ }
689  Blocks | } ForeBlocks
690  ForeLast | }
691  | |
692  SubLoopFirst <\ | }
693  Blocks | | } SubLoopBlocks
694  SubLoopLast -/ | }
695  | |
696  AftFirst | }
697  Blocks | } AftBlocks
698  AftLast ------/ }
699  |
700 
701  There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
702  and AftBlocks, providing that there is one edge from Fores to SubLoops,
703  one edge from SubLoops to Afts and a single outer loop exit (from Afts).
704  In practice we currently limit Aft blocks to a single block, and limit
705  things further in the profitablility checks of the unroll and jam pass.
706 
707  Because of the way we rearrange basic blocks, we also require that
708  the Fore blocks on all unrolled iterations are safe to move before the
709  SubLoop blocks of all iterations. So we require that the phi node looping
710  operands of ForeHeader can be moved to at least the end of ForeEnd, so that
711  we can arrange cloned Fore Blocks before the subloop and match up Phi's
712  correctly.
713 
714  i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
715  It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
716 
717  There are then a number of checks along the lines of no calls, no
718  exceptions, inner loop IV is consistent, etc. Note that for loops requiring
719  runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
720  UnrollAndJamLoop if the trip count cannot be easily calculated.
721  */
722 
723  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
724  return false;
725  Loop *SubLoop = L->getSubLoops()[0];
726  if (!SubLoop->isLoopSimplifyForm())
727  return false;
728 
729  BasicBlock *Header = L->getHeader();
730  BasicBlock *Latch = L->getLoopLatch();
731  BasicBlock *Exit = L->getExitingBlock();
732  BasicBlock *SubLoopHeader = SubLoop->getHeader();
733  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
734  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
735 
736  if (Latch != Exit)
737  return false;
738  if (SubLoopLatch != SubLoopExit)
739  return false;
740 
741  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
742  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
743  return false;
744  }
745 
746  // Split blocks into Fore/SubLoop/Aft based on dominators
747  BasicBlockSet SubLoopBlocks;
748  BasicBlockSet ForeBlocks;
749  BasicBlockSet AftBlocks;
750  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
751  AftBlocks, &DT)) {
752  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
753  return false;
754  }
755 
756  // Aft blocks may need to move instructions to fore blocks, which becomes more
757  // difficult if there are multiple (potentially conditionally executed)
758  // blocks. For now we just exclude loops with multiple aft blocks.
759  if (AftBlocks.size() != 1) {
760  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
761  "multiple blocks after the loop\n");
762  return false;
763  }
764 
765  // Check inner loop backedge count is consistent on all iterations of the
766  // outer loop
767  if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
768  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
769  "not consistent on each iteration\n");
770  return false;
771  }
772 
773  // Check the loop safety info for exceptions.
775  LSI.computeLoopSafetyInfo(L);
776  if (LSI.anyBlockMayThrow()) {
777  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
778  return false;
779  }
780 
781  // We've ruled out the easy stuff and now need to check that there are no
782  // interdependencies which may prevent us from moving the:
783  // ForeBlocks before Subloop and AftBlocks.
784  // Subloop before AftBlocks.
785  // ForeBlock phi operands before the subloop
786 
787  // Make sure we can move all instructions we need to before the subloop
789  Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
790  if (SubLoop->contains(I->getParent()))
791  return false;
792  if (AftBlocks.count(I->getParent())) {
793  // If we hit a phi node in afts we know we are done (probably
794  // LCSSA)
795  if (isa<PHINode>(I))
796  return false;
797  // Can't move instructions with side effects or memory
798  // reads/writes
799  if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
800  return false;
801  }
802  // Keep going
803  return true;
804  })) {
805  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
806  "instructions after subloop to before it\n");
807  return false;
808  }
809 
810  // Check for memory dependencies which prohibit the unrolling we are doing.
811  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
812  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
813  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
814  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
815  return false;
816  }
817 
818  return true;
819 }
virtual void computeLoopSafetyInfo(const Loop *CurLoop)
Computes safety information for a loop checks loop body & header for the possibility of may throw exc...
Definition: MustExecute.cpp:45
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks &#39;this&#39; from the containing basic block and deletes it.
Definition: Instruction.cpp:68
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:225
bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, bool AllowExpensiveTripCount, bool UseEpilogRemainder, bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA, Loop **ResultLoop=nullptr)
Insert code in the prolog/epilog code when unrolling a loop with a run-time trip-count.
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:250
DiagnosticInfoOptimizationBase::Argument NV
This class represents lattice values for constants.
Definition: AllocatorList.h:24
BasicBlock * getSuccessor(unsigned Idx) const
Return the specified successor. This instruction must be a terminator.
static bool getLoadsAndStores(BasicBlockSet &Blocks, SmallVector< Value *, 4 > &MemInstr)
bool isRecursivelyLCSSAForm(DominatorTree &DT, const LoopInfo &LI) const
Return true if this Loop and all inner subloops are in LCSSA form.
Definition: LoopInfo.cpp:184
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:92
The main scalar evolution driver.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:174
A cache of @llvm.assume calls within a function.
BasicBlock * foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT)
Folds a basic block into its predecessor if it only has one predecessor, and that predecessor only ha...
Definition: LoopUnroll.cpp:100
static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch, BasicBlockSet &AftBlocks, T Visit)
BasicBlock * getSuccessor(unsigned i) const
bool hasIterationCountInvariantInParent(Loop *L, ScalarEvolution &SE)
Check inner loop (L) backedge count is known to be invariant on all iterations of its outer loop...
Definition: LoopUtils.cpp:649
STATISTIC(NumFunctions, "Total number of functions")
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:138
DependenceInfo - This class is the main dependence-analysis driver.
static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, BasicBlock *Latch, Instruction *InsertLoc, BasicBlockSet &AftBlocks)
static bool checkDependencies(SmallVector< Value *, 4 > &Earlier, SmallVector< Value *, 4 > &Later, unsigned LoopDepth, bool InnerLoop, DependenceInfo &DI)
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:269
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its ...
Definition: MustExecute.h:98
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
void erase(Loop *L)
Update LoopInfo after removing the last backedge from a loop.
Definition: LoopInfo.cpp:611
BlockT * getHeader() const
Definition: LoopInfo.h:100
auto reverse(ContainerTy &&C, typename std::enable_if< has_rbegin< ContainerTy >::value >::type *=nullptr) -> decltype(make_range(C.rbegin(), C.rend()))
Definition: STLExtras.h:267
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
Definition: LoopInfoImpl.h:251
bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, DependenceInfo &DI)
This header provides classes for managing per-loop analyses.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:429
Debug location.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:145
unsigned getNumSuccessors() const
Return the number of successors that this instruction has.
void remapInstruction(Instruction *I, ValueToValueMapTy &VMap)
Convert the instruction operands from referencing the current values into those specified by VMap...
Definition: LoopUnroll.cpp:66
The loop was fully unrolled into straight-line code.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
SmallPtrSet< BasicBlock *, 4 > BasicBlockSet
bool isDebugInfoForProfiling() const
Returns true if we should emit debug info for profiling.
Definition: Metadata.cpp:1512
std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool PossiblyLoopIndependent)
depends - Tests for a dependence between the Src and Dst instructions.
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
Conditional or Unconditional Branch instruction.
DomTreeNodeBase * getIDom() const
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:92
static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, BasicBlockSet &ForeBlocks, BasicBlockSet &SubLoopBlocks, BasicBlockSet &AftBlocks, DominatorTree *DT)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:371
bool mayHaveSideEffects() const
Return true if the instruction may have side effects.
Definition: Instruction.h:562
Diagnostic information for applied optimization remarks.
const Instruction & back() const
Definition: BasicBlock.h:283
std::vector< BasicBlock * >::const_reverse_iterator RPOIterator
Definition: LoopIterator.h:102
op_range operands()
Definition: User.h:238
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
The loop was partially unrolled – we still have a loop, but with a smaller trip count.
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:365
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
iterator end()
Definition: ValueMap.h:142
size_type size() const
Definition: SmallPtrSet.h:93
bool hasAddressTaken() const
Returns true if there are any uses of this basic block other than direct branches, switches, etc.
Definition: BasicBlock.h:392
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:110
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:418
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file. ...
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false...
Definition: SmallPtrSet.h:378
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:847
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:249
iterator begin() const
Definition: LoopInfo.h:142
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static BranchInst * Create(BasicBlock *IfTrue, Instruction *InsertBefore=nullptr)
#define DEBUG_TYPE
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:133
Store the result of a depth first search within basic blocks contained by a single loop...
Definition: LoopIterator.h:98
void push_back(pointer val)
Definition: ilist.h:313
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, DebugInfoFinder *DIFinder=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
static void DFS(BasicBlock *Root, SetVector< BasicBlock *> &Set)
LoopT * getParentLoop() const
Definition: LoopInfo.h:101
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
Definition: LoopInfo.h:131
bool isLoopSimplifyForm() const
Return true if the Loop is in the form that the LoopSimplify form transforms loops to...
Definition: LoopInfo.cpp:193
void registerAssumption(CallInst *CI)
Add an @llvm.assume intrinsic to this function&#39;s cache.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
iterator begin() const
Definition: SmallPtrSet.h:397
void emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:652
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:56
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:465
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:149
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
#define I(x, y, z)
Definition: MD5.cpp:58
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
const BasicBlockListType & getBasicBlockList() const
Get the underlying elements of the Function...
Definition: Function.h:633
block_iterator block_end() const
Definition: LoopInfo.h:155
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:325
bool isUnconditional() const
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM Value Representation.
Definition: Value.h:73
The loop was not modified.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
Definition: Instruction.cpp:87
virtual bool anyBlockMayThrow() const
Returns true iff any block of the loop for which this info is contains an instruction that may throw ...
Definition: MustExecute.cpp:41
#define LLVM_DEBUG(X)
Definition: Debug.h:123
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:156
block_iterator block_begin() const
Definition: LoopInfo.h:154
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:50
The optimization diagnostic interface.
iterator begin()
Definition: ValueMap.h:141
const BasicBlock * getParent() const
Definition: Instruction.h:67
LoopUnrollResult
Represents the result of a UnrollLoop invocation.
Definition: UnrollLoop.h:52
LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple, bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop=nullptr)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:522
void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC)
Perform some cleanup and simplifications on loops after unrolling.
Definition: LoopUnroll.cpp:258