LLVM 8.0.1
LoopStrengthReduce.cpp
1 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This transformation analyzes and transforms the induction variables (and
11 // computations derived from them) into forms suitable for efficient execution
12 // on the target.
13 //
14 // This pass performs strength reduction on array references inside loops that
15 // use the loop induction variable as one of their components; it rewrites
16 // expressions to take advantage of scaled-index addressing modes available on
17 // the target, and it performs a variety of other optimizations related to loop
18 // induction variables.
19 //
20 // Terminology note: this code has a lot of handling for "post-increment" or
21 // "post-inc" users. This is not talking about post-increment addressing modes;
22 // it is instead talking about code like this:
23 //
24 // %i = phi [ 0, %entry ], [ %i.next, %latch ]
25 // ...
26 // %i.next = add %i, 1
27 // %c = icmp eq %i.next, %n
28 //
29 // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
30 // it's useful to think about these as the same register, with some uses using
31 // the value of the register before the add and some using it after. In this
32 // example, the icmp is a post-increment user, since it uses %i.next, which is
33 // the value of the induction variable after the increment. The other common
34 // case of post-increment users is users outside the loop.
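//
// For illustration (this example is not part of the original source): given the
// loop above, a load indexed by %i would be a pre-increment user, while the
// icmp, which uses %i.next, is a post-increment user:
//
//   %v = load i32, i32* %p           ; indexed by %i     -> pre-inc user
//   %c = icmp eq i32 %i.next, %n     ; uses %i.next      -> post-inc user
//
// LSR models both as uses of the same induction "register", distinguished only
// by whether the value before or after the increment is needed.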
35 //
36 // TODO: More sophistication in the way Formulae are generated and filtered.
37 //
38 // TODO: Handle multiple loops at a time.
39 //
40 // TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
41 // of a GlobalValue?
42 //
43 // TODO: When truncation is free, truncate ICmp users' operands to make it a
44 // smaller encoding (on x86 at least).
45 //
46 // TODO: When a negated register is used by an add (such as in a list of
47 // multiple base registers, or as the increment expression in an addrec),
48 // we may not actually need both reg and (-1 * reg) in registers; the
49 // negation can be implemented by using a sub instead of an add. The
50 // lack of support for taking this into consideration when making
51 // register pressure decisions is partly worked around by the "Special"
52 // use kind.
53 //
54 //===----------------------------------------------------------------------===//
55 
57 #include "llvm/ADT/APInt.h"
58 #include "llvm/ADT/DenseMap.h"
59 #include "llvm/ADT/DenseSet.h"
60 #include "llvm/ADT/Hashing.h"
62 #include "llvm/ADT/STLExtras.h"
63 #include "llvm/ADT/SetVector.h"
65 #include "llvm/ADT/SmallPtrSet.h"
66 #include "llvm/ADT/SmallSet.h"
67 #include "llvm/ADT/SmallVector.h"
69 #include "llvm/Analysis/IVUsers.h"
71 #include "llvm/Analysis/LoopInfo.h"
72 #include "llvm/Analysis/LoopPass.h"
79 #include "llvm/Config/llvm-config.h"
80 #include "llvm/IR/BasicBlock.h"
81 #include "llvm/IR/Constant.h"
82 #include "llvm/IR/Constants.h"
83 #include "llvm/IR/DerivedTypes.h"
84 #include "llvm/IR/Dominators.h"
85 #include "llvm/IR/GlobalValue.h"
86 #include "llvm/IR/IRBuilder.h"
87 #include "llvm/IR/InstrTypes.h"
88 #include "llvm/IR/Instruction.h"
89 #include "llvm/IR/Instructions.h"
90 #include "llvm/IR/IntrinsicInst.h"
91 #include "llvm/IR/Intrinsics.h"
92 #include "llvm/IR/Module.h"
93 #include "llvm/IR/OperandTraits.h"
94 #include "llvm/IR/Operator.h"
95 #include "llvm/IR/PassManager.h"
96 #include "llvm/IR/Type.h"
97 #include "llvm/IR/Use.h"
98 #include "llvm/IR/User.h"
99 #include "llvm/IR/Value.h"
100 #include "llvm/IR/ValueHandle.h"
101 #include "llvm/Pass.h"
102 #include "llvm/Support/Casting.h"
104 #include "llvm/Support/Compiler.h"
105 #include "llvm/Support/Debug.h"
107 #include "llvm/Support/MathExtras.h"
109 #include "llvm/Transforms/Scalar.h"
110 #include "llvm/Transforms/Utils.h"
112 #include <algorithm>
113 #include <cassert>
114 #include <cstddef>
115 #include <cstdint>
116 #include <cstdlib>
117 #include <iterator>
118 #include <limits>
119 #include <map>
120 #include <utility>
121 
122 using namespace llvm;
123 
124 #define DEBUG_TYPE "loop-reduce"
125 
126 /// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
127 /// bail out. This threshold is far beyond the number of users that LSR can
128 /// conceivably solve, so it should not affect generated code, but catches the
129 /// worst cases before LSR burns too much compile time and stack space.
130 static const unsigned MaxIVUsers = 200;
131 
132 // Temporary flag to cleanup congruent phis after LSR phi expansion.
133 // It's currently disabled until we can determine whether it's truly useful or
134 // not. The flag should be removed after the v3.0 release.
135 // This is now needed for ivchains.
136 static cl::opt<bool> EnablePhiElim(
137  "enable-lsr-phielim", cl::Hidden, cl::init(true),
138  cl::desc("Enable LSR phi elimination"));
139 
140 // The flag adds instruction count to the solution cost comparison.
141 static cl::opt<bool> InsnsCost(
142  "lsr-insns-cost", cl::Hidden, cl::init(true),
143  cl::desc("Add instruction count to a LSR cost model"));
144 
145 // Flag to choose how to narrow complex LSR solutions.
146 static cl::opt<bool> LSRExpNarrow(
147  "lsr-exp-narrow", cl::Hidden, cl::init(false),
148  cl::desc("Narrow LSR complex solution using"
149  " expectation of registers number"));
150 
151 // Flag to narrow search space by filtering non-optimal formulae with
152 // the same ScaledReg and Scale.
153 static cl::opt<bool> FilterSameScaledReg(
154  "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
155  cl::desc("Narrow LSR search space by filtering non-optimal formulae"
156  " with the same ScaledReg and Scale"));
157 
158 static cl::opt<unsigned> ComplexityLimit(
159  "lsr-complexity-limit", cl::Hidden,
160  cl::init(std::numeric_limits<uint16_t>::max()),
161  cl::desc("LSR search space complexity limit"));
162 
163 #ifndef NDEBUG
164 // Stress test IV chain generation.
165 static cl::opt<bool> StressIVChain(
166  "stress-ivchain", cl::Hidden, cl::init(false),
167  cl::desc("Stress test LSR IV chains"));
168 #else
169 static bool StressIVChain = false;
170 #endif
171 
172 namespace {
173 
174 struct MemAccessTy {
175  /// Used in situations where the accessed memory type is unknown.
176  static const unsigned UnknownAddressSpace =
177  std::numeric_limits<unsigned>::max();
178 
179  Type *MemTy = nullptr;
180  unsigned AddrSpace = UnknownAddressSpace;
181 
182  MemAccessTy() = default;
183  MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
184 
185  bool operator==(MemAccessTy Other) const {
186  return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
187  }
188 
189  bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
190 
191  static MemAccessTy getUnknown(LLVMContext &Ctx,
192  unsigned AS = UnknownAddressSpace) {
193  return MemAccessTy(Type::getVoidTy(Ctx), AS);
194  }
195 
196  Type *getType() { return MemTy; }
197 };
198 
199 /// This class holds data which is used to order reuse candidates.
200 class RegSortData {
201 public:
202  /// This represents the set of LSRUse indices which reference
203  /// a particular register.
204  SmallBitVector UsedByIndices;
205 
206  void print(raw_ostream &OS) const;
207  void dump() const;
208 };
209 
210 } // end anonymous namespace
211 
212 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
213 void RegSortData::print(raw_ostream &OS) const {
214  OS << "[NumUses=" << UsedByIndices.count() << ']';
215 }
216 
217 LLVM_DUMP_METHOD void RegSortData::dump() const {
218  print(errs()); errs() << '\n';
219 }
220 #endif
221 
222 namespace {
223 
224 /// Map register candidates to information about how they are used.
225 class RegUseTracker {
226  using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
227 
228  RegUsesTy RegUsesMap;
229  SmallVector<const SCEV *, 16> RegSequence;
230 
231 public:
232  void countRegister(const SCEV *Reg, size_t LUIdx);
233  void dropRegister(const SCEV *Reg, size_t LUIdx);
234  void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
235 
236  bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
237 
238  const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
239 
240  void clear();
241 
242  using iterator = SmallVectorImpl<const SCEV *>::iterator;
243  using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
244 
245  iterator begin() { return RegSequence.begin(); }
246  iterator end() { return RegSequence.end(); }
247  const_iterator begin() const { return RegSequence.begin(); }
248  const_iterator end() const { return RegSequence.end(); }
249 };
250 
251 } // end anonymous namespace
252 
253 void
254 RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
255  std::pair<RegUsesTy::iterator, bool> Pair =
256  RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
257  RegSortData &RSD = Pair.first->second;
258  if (Pair.second)
259  RegSequence.push_back(Reg);
260  RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
261  RSD.UsedByIndices.set(LUIdx);
262 }
263 
264 void
265 RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
266  RegUsesTy::iterator It = RegUsesMap.find(Reg);
267  assert(It != RegUsesMap.end());
268  RegSortData &RSD = It->second;
269  assert(RSD.UsedByIndices.size() > LUIdx);
270  RSD.UsedByIndices.reset(LUIdx);
271 }
272 
273 void
274 RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
275  assert(LUIdx <= LastLUIdx);
276 
277  // Update RegUses. The data structure is not optimized for this purpose;
278  // we must iterate through it and update each of the bit vectors.
279  for (auto &Pair : RegUsesMap) {
280  SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
281  if (LUIdx < UsedByIndices.size())
282  UsedByIndices[LUIdx] =
283  LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
284  UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
285  }
286 }
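// A minimal sketch of the intended effect (not from the original source): if
// the use at LUIdx = 2 is being replaced by swapping in the use at
// LastLUIdx = 5, then a register whose UsedByIndices is {1, 2, 5} has bit 2
// overwritten with bit 5's value and the vector truncated below index 5,
// leaving {1, 2}; the surviving use keeps its new, smaller index.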
287 
288 bool
289 RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
290  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
291  if (I == RegUsesMap.end())
292  return false;
293  const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
294  int i = UsedByIndices.find_first();
295  if (i == -1) return false;
296  if ((size_t)i != LUIdx) return true;
297  return UsedByIndices.find_next(i) != -1;
298 }
299 
300 const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
301  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
302  assert(I != RegUsesMap.end() && "Unknown register!");
303  return I->second.UsedByIndices;
304 }
305 
306 void RegUseTracker::clear() {
307  RegUsesMap.clear();
308  RegSequence.clear();
309 }
310 
311 namespace {
312 
313 /// This class holds information that describes a formula for computing a value
314 /// that satisfies a use. It may include broken-out immediates and scaled registers.
315 struct Formula {
316  /// Global base address used for complex addressing.
317  GlobalValue *BaseGV = nullptr;
318 
319  /// Base offset for complex addressing.
320  int64_t BaseOffset = 0;
321 
322  /// Whether any complex addressing has a base register.
323  bool HasBaseReg = false;
324 
325  /// The scale of any complex addressing.
326  int64_t Scale = 0;
327 
328  /// The list of "base" registers for this use. When this is non-empty, the
329  /// canonical representation of a formula is
330  /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
331  /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
332  /// 3. The reg containing the recurrent expr related to the current loop in
333  ///    the formula should be put in the ScaledReg.
334  /// #1 enforces that the scaled register is always used when at least two
335  /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
336  /// #2 enforces that 1 * reg is reg.
337  /// #3 ensures that invariant regs with respect to the current loop can be
338  ///    combined together in LSR codegen.
339  /// This invariant can be temporarily broken while building a formula.
340  /// However, every formula inserted into the LSRInstance must be in canonical
341  /// form.
342  SmallVector<const SCEV *, 4> BaseRegs;
343 
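// For illustration (not from the original source), with the current-loop
// addrec {0,+,1}<%L> and a loop-invariant value n:
//   canonical:      ScaledReg = {0,+,1}<%L>, Scale = 1, BaseRegs = { n }
//   non-canonical:  ScaledReg = null,        BaseRegs = { n, {0,+,1}<%L> }
// Rule #3 above is what forces the addrec for the current loop into ScaledReg.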
344  /// The 'scaled' register for this use. This should be non-null when Scale is
345  /// not zero.
346  const SCEV *ScaledReg = nullptr;
347 
348  /// An additional constant offset which added near the use. This requires a
349  /// temporary register, but the offset itself can live in an add immediate
350  /// field rather than a register.
351  int64_t UnfoldedOffset = 0;
352 
353  Formula() = default;
354 
355  void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
356 
357  bool isCanonical(const Loop &L) const;
358 
359  void canonicalize(const Loop &L);
360 
361  bool unscale();
362 
363  bool hasZeroEnd() const;
364 
365  size_t getNumRegs() const;
366  Type *getType() const;
367 
368  void deleteBaseReg(const SCEV *&S);
369 
370  bool referencesReg(const SCEV *S) const;
371  bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
372  const RegUseTracker &RegUses) const;
373 
374  void print(raw_ostream &OS) const;
375  void dump() const;
376 };
377 
378 } // end anonymous namespace
379 
380 /// Recursion helper for initialMatch.
381 static void DoInitialMatch(const SCEV *S, Loop *L,
382  SmallVectorImpl<const SCEV *> &Good,
383  SmallVectorImpl<const SCEV *> &Bad,
384  ScalarEvolution &SE) {
385  // Collect expressions which properly dominate the loop header.
386  if (SE.properlyDominates(S, L->getHeader())) {
387  Good.push_back(S);
388  return;
389  }
390 
391  // Look at add operands.
392  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
393  for (const SCEV *S : Add->operands())
394  DoInitialMatch(S, L, Good, Bad, SE);
395  return;
396  }
397 
398  // Look at addrec operands.
399  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
400  if (!AR->getStart()->isZero() && AR->isAffine()) {
401  DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
402  DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
403  AR->getStepRecurrence(SE),
404  // FIXME: AR->getNoWrapFlags()
405  AR->getLoop(), SCEV::FlagAnyWrap),
406  L, Good, Bad, SE);
407  return;
408  }
409 
410  // Handle a multiplication by -1 (negation) if it didn't fold.
411  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
412  if (Mul->getOperand(0)->isAllOnesValue()) {
413  SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
414  const SCEV *NewMul = SE.getMulExpr(Ops);
415 
416  SmallVector<const SCEV *, 4> MyGood;
417  SmallVector<const SCEV *, 4> MyBad;
418  DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
419  const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
420  SE.getEffectiveSCEVType(NewMul->getType())));
421  for (const SCEV *S : MyGood)
422  Good.push_back(SE.getMulExpr(NegOne, S));
423  for (const SCEV *S : MyBad)
424  Bad.push_back(SE.getMulExpr(NegOne, S));
425  return;
426  }
427 
428  // Ok, we can't do anything interesting. Just stuff the whole thing into a
429  // register and hope for the best.
430  Bad.push_back(S);
431 }
432 
433 /// Incorporate loop-variant parts of S into this Formula, attempting to keep
434 /// all loop-invariant and loop-computable values in a single base register.
435 void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
436  SmallVector<const SCEV *, 4> Good;
437  SmallVector<const SCEV *, 4> Bad;
438  DoInitialMatch(S, L, Good, Bad, SE);
439  if (!Good.empty()) {
440  const SCEV *Sum = SE.getAddExpr(Good);
441  if (!Sum->isZero())
442  BaseRegs.push_back(Sum);
443  HasBaseReg = true;
444  }
445  if (!Bad.empty()) {
446  const SCEV *Sum = SE.getAddExpr(Bad);
447  if (!Sum->isZero())
448  BaseRegs.push_back(Sum);
449  HasBaseReg = true;
450  }
451  canonicalize(*L);
452 }
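// A hypothetical example (not from the original source): for S = {n,+,4}<%L>
// with loop-invariant n, DoInitialMatch splits S into the invariant start "n"
// (Good) and the zero-based recurrence {0,+,4}<%L> (Bad), so after
// canonicalization the initial formula is reg(n) + 1*reg({0,+,4}<%L>).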
453 
454 /// Check whether or not this formula satisfies the canonical
455 /// representation.
456 /// \see Formula::BaseRegs.
457 bool Formula::isCanonical(const Loop &L) const {
458  if (!ScaledReg)
459  return BaseRegs.size() <= 1;
460 
461  if (Scale != 1)
462  return true;
463 
464  if (Scale == 1 && BaseRegs.empty())
465  return false;
466 
467  const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
468  if (SAR && SAR->getLoop() == &L)
469  return true;
470 
471  // If ScaledReg is not a recurrent expr, or it is one whose loop is not the
472  // current loop, while BaseRegs contains a recurrent expr reg related to the
473  // current loop, we want to swap that reg in BaseRegs with ScaledReg.
474  auto I =
475  find_if(make_range(BaseRegs.begin(), BaseRegs.end()), [&](const SCEV *S) {
476  return isa<const SCEVAddRecExpr>(S) &&
477  (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
478  });
479  return I == BaseRegs.end();
480 }
481 
482 /// Helper method to morph a formula into its canonical representation.
483 /// \see Formula::BaseRegs.
484 /// Every formula having more than one base register must use the ScaledReg
485 /// field. Otherwise, we would have to do special cases everywhere in LSR
486 /// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
487 /// On the other hand, 1*reg should be canonicalized into reg.
488 void Formula::canonicalize(const Loop &L) {
489  if (isCanonical(L))
490  return;
491  // So far we did not need this case. This is easy to implement but it is
492  // useless to maintain dead code. Besides, it could hurt compile time.
493  assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
494 
495  // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
496  if (!ScaledReg) {
497  ScaledReg = BaseRegs.back();
498  BaseRegs.pop_back();
499  Scale = 1;
500  }
501 
502  // If ScaledReg is an invariant with respect to L, find the reg from
503  // BaseRegs containing the recurrent expr related with Loop L. Swap the
504  // reg with ScaledReg.
505  const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
506  if (!SAR || SAR->getLoop() != &L) {
507  auto I = find_if(make_range(BaseRegs.begin(), BaseRegs.end()),
508  [&](const SCEV *S) {
509  return isa<const SCEVAddRecExpr>(S) &&
510  (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
511  });
512  if (I != BaseRegs.end())
513  std::swap(ScaledReg, *I);
514  }
515 }
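// For illustration (not from the original source): a non-canonical formula
// with BaseRegs = { {0,+,1}<%L>, n } and no ScaledReg first gets
// ScaledReg = n with Scale = 1, and is then fixed up by the swap above to
// ScaledReg = {0,+,1}<%L>, Scale = 1, BaseRegs = { n }.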
516 
517 /// Get rid of the scale in the formula.
518 /// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
519 /// \return true if it was possible to get rid of the scale, false otherwise.
520 /// \note After this operation the formula may not be in the canonical form.
521 bool Formula::unscale() {
522  if (Scale != 1)
523  return false;
524  Scale = 0;
525  BaseRegs.push_back(ScaledReg);
526  ScaledReg = nullptr;
527  return true;
528 }
529 
530 bool Formula::hasZeroEnd() const {
531  if (UnfoldedOffset || BaseOffset)
532  return false;
533  if (BaseRegs.size() != 1 || ScaledReg)
534  return false;
535  return true;
536 }
537 
538 /// Return the total number of register operands used by this formula. This does
539 /// not include register uses implied by non-constant addrec strides.
540 size_t Formula::getNumRegs() const {
541  return !!ScaledReg + BaseRegs.size();
542 }
543 
544 /// Return the type of this formula, if it has one, or null otherwise. This type
545 /// is meaningless except for the bit size.
546 Type *Formula::getType() const {
547  return !BaseRegs.empty() ? BaseRegs.front()->getType() :
548  ScaledReg ? ScaledReg->getType() :
549  BaseGV ? BaseGV->getType() :
550  nullptr;
551 }
552 
553 /// Delete the given base reg from the BaseRegs list.
554 void Formula::deleteBaseReg(const SCEV *&S) {
555  if (&S != &BaseRegs.back())
556  std::swap(S, BaseRegs.back());
557  BaseRegs.pop_back();
558 }
559 
560 /// Test if this formula references the given register.
561 bool Formula::referencesReg(const SCEV *S) const {
562  return S == ScaledReg || is_contained(BaseRegs, S);
563 }
564 
565 /// Test whether this formula uses registers which are used by uses other than
566 /// the use with the given index.
567 bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
568  const RegUseTracker &RegUses) const {
569  if (ScaledReg)
570  if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
571  return true;
572  for (const SCEV *BaseReg : BaseRegs)
573  if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
574  return true;
575  return false;
576 }
577 
578 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
579 void Formula::print(raw_ostream &OS) const {
580  bool First = true;
581  if (BaseGV) {
582  if (!First) OS << " + "; else First = false;
583  BaseGV->printAsOperand(OS, /*PrintType=*/false);
584  }
585  if (BaseOffset != 0) {
586  if (!First) OS << " + "; else First = false;
587  OS << BaseOffset;
588  }
589  for (const SCEV *BaseReg : BaseRegs) {
590  if (!First) OS << " + "; else First = false;
591  OS << "reg(" << *BaseReg << ')';
592  }
593  if (HasBaseReg && BaseRegs.empty()) {
594  if (!First) OS << " + "; else First = false;
595  OS << "**error: HasBaseReg**";
596  } else if (!HasBaseReg && !BaseRegs.empty()) {
597  if (!First) OS << " + "; else First = false;
598  OS << "**error: !HasBaseReg**";
599  }
600  if (Scale != 0) {
601  if (!First) OS << " + "; else First = false;
602  OS << Scale << "*reg(";
603  if (ScaledReg)
604  OS << *ScaledReg;
605  else
606  OS << "<unknown>";
607  OS << ')';
608  }
609  if (UnfoldedOffset != 0) {
610  if (!First) OS << " + ";
611  OS << "imm(" << UnfoldedOffset << ')';
612  }
613 }
614 
615 LLVM_DUMP_METHOD void Formula::dump() const {
616  print(errs()); errs() << '\n';
617 }
618 #endif
619 
620 /// Return true if the given addrec can be sign-extended without changing its
621 /// value.
622 static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
623  Type *WideTy =
624  IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
625  return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
626 }
627 
628 /// Return true if the given add can be sign-extended without changing its
629 /// value.
630 static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
631  Type *WideTy =
632  IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
633  return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
634 }
635 
636 /// Return true if the given mul can be sign-extended without changing its
637 /// value.
638 static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
639  Type *WideTy =
640  IntegerType::get(SE.getContext(),
641  SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
642  return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
643 }
644 
645 /// Return an expression for LHS /s RHS, if it can be determined and if the
646 /// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
647 /// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
648 /// the multiplication may overflow, which is useful when the result will be
649 /// used in a context where the most significant bits are ignored.
650 static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
651  ScalarEvolution &SE,
652  bool IgnoreSignificantBits = false) {
653  // Handle the trivial case, which works for any SCEV type.
654  if (LHS == RHS)
655  return SE.getConstant(LHS->getType(), 1);
656 
657  // Handle a few RHS special cases.
658  const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
659  if (RC) {
660  const APInt &RA = RC->getAPInt();
661  // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
662  // some folding.
663  if (RA.isAllOnesValue())
664  return SE.getMulExpr(LHS, RC);
665  // Handle x /s 1 as x.
666  if (RA == 1)
667  return LHS;
668  }
669 
670  // Check for a division of a constant by a constant.
671  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
672  if (!RC)
673  return nullptr;
674  const APInt &LA = C->getAPInt();
675  const APInt &RA = RC->getAPInt();
676  if (LA.srem(RA) != 0)
677  return nullptr;
678  return SE.getConstant(LA.sdiv(RA));
679  }
680 
681  // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
682  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
683  if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
684  const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
685  IgnoreSignificantBits);
686  if (!Step) return nullptr;
687  const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
688  IgnoreSignificantBits);
689  if (!Start) return nullptr;
690  // FlagNW is independent of the start value, step direction, and is
691  // preserved with smaller magnitude steps.
692  // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
693  return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
694  }
695  return nullptr;
696  }
697 
698  // Distribute the sdiv over add operands, if the add doesn't overflow.
699  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
700  if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
701  SmallVector<const SCEV *, 8> Ops;
702  for (const SCEV *S : Add->operands()) {
703  const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
704  if (!Op) return nullptr;
705  Ops.push_back(Op);
706  }
707  return SE.getAddExpr(Ops);
708  }
709  return nullptr;
710  }
711 
712  // Check for a multiply operand that we can pull RHS out of.
713  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
714  if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
715  SmallVector<const SCEV *, 4> Ops;
716  bool Found = false;
717  for (const SCEV *S : Mul->operands()) {
718  if (!Found)
719  if (const SCEV *Q = getExactSDiv(S, RHS, SE,
720  IgnoreSignificantBits)) {
721  S = Q;
722  Found = true;
723  }
724  Ops.push_back(S);
725  }
726  return Found ? SE.getMulExpr(Ops) : nullptr;
727  }
728  return nullptr;
729  }
730 
731  // Otherwise we don't know.
732  return nullptr;
733 }
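// A few hypothetical examples (not from the original source):
//   getExactSDiv({0,+,12}, 4, SE)  -> {0,+,3}   (distributed over the addrec)
//   getExactSDiv(8, 3, SE)         -> nullptr   (non-zero remainder)
//   getExactSDiv(X, X, SE)         -> 1         (trivial case)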
734 
735 /// If S involves the addition of a constant integer value, return that integer
736 /// value, and mutate S to point to a new SCEV with that value excluded.
737 static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
738  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
739  if (C->getAPInt().getMinSignedBits() <= 64) {
740  S = SE.getConstant(C->getType(), 0);
741  return C->getValue()->getSExtValue();
742  }
743  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
744  SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
745  int64_t Result = ExtractImmediate(NewOps.front(), SE);
746  if (Result != 0)
747  S = SE.getAddExpr(NewOps);
748  return Result;
749  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
750  SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
751  int64_t Result = ExtractImmediate(NewOps.front(), SE);
752  if (Result != 0)
753  S = SE.getAddRecExpr(NewOps, AR->getLoop(),
754  // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
755  SCEV::FlagAnyWrap);
756  return Result;
757  }
758  return 0;
759 }
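// For illustration (not from the original source): given S = (4 + %a + %b),
// ExtractImmediate returns 4 and mutates S to (%a + %b); given S = {8,+,1}<%L>,
// it returns 8 and mutates S to {0,+,1}<%L>.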
760 
761 /// If S involves the addition of a GlobalValue address, return that symbol, and
762 /// mutate S to point to a new SCEV with that value excluded.
763 static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
764  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
765  if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
766  S = SE.getConstant(GV->getType(), 0);
767  return GV;
768  }
769  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
770  SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
771  GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
772  if (Result)
773  S = SE.getAddExpr(NewOps);
774  return Result;
775  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
776  SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
777  GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
778  if (Result)
779  S = SE.getAddRecExpr(NewOps, AR->getLoop(),
780  // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
781  SCEV::FlagAnyWrap);
782  return Result;
783  }
784  return nullptr;
785 }
786 
787 /// Returns true if the specified instruction is using the specified value as an
788 /// address.
789 static bool isAddressUse(const TargetTransformInfo &TTI,
790  Instruction *Inst, Value *OperandVal) {
791  bool isAddress = isa<LoadInst>(Inst);
792  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
793  if (SI->getPointerOperand() == OperandVal)
794  isAddress = true;
795  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
796  // Addressing modes can also be folded into prefetches and a variety
797  // of intrinsics.
798  switch (II->getIntrinsicID()) {
799  case Intrinsic::memset:
800  case Intrinsic::prefetch:
801  if (II->getArgOperand(0) == OperandVal)
802  isAddress = true;
803  break;
804  case Intrinsic::memmove:
805  case Intrinsic::memcpy:
806  if (II->getArgOperand(0) == OperandVal ||
807  II->getArgOperand(1) == OperandVal)
808  isAddress = true;
809  break;
810  default: {
811  MemIntrinsicInfo IntrInfo;
812  if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
813  if (IntrInfo.PtrVal == OperandVal)
814  isAddress = true;
815  }
816  }
817  }
818  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
819  if (RMW->getPointerOperand() == OperandVal)
820  isAddress = true;
821  } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
822  if (CmpX->getPointerOperand() == OperandVal)
823  isAddress = true;
824  }
825  return isAddress;
826 }
827 
828 /// Return the type of the memory being accessed.
829 static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
830  Instruction *Inst, Value *OperandVal) {
831  MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
832  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
833  AccessTy.MemTy = SI->getOperand(0)->getType();
834  AccessTy.AddrSpace = SI->getPointerAddressSpace();
835  } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
836  AccessTy.AddrSpace = LI->getPointerAddressSpace();
837  } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
838  AccessTy.AddrSpace = RMW->getPointerAddressSpace();
839  } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
840  AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
841  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
842  switch (II->getIntrinsicID()) {
843  case Intrinsic::prefetch:
844  case Intrinsic::memset:
845  AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
846  AccessTy.MemTy = OperandVal->getType();
847  break;
848  case Intrinsic::memmove:
849  case Intrinsic::memcpy:
850  AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
851  AccessTy.MemTy = OperandVal->getType();
852  break;
853  default: {
854  MemIntrinsicInfo IntrInfo;
855  if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
856  AccessTy.AddrSpace
857  = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
858  }
859 
860  break;
861  }
862  }
863  }
864 
865  // All pointers have the same requirements, so canonicalize them to an
866  // arbitrary pointer type to minimize variation.
867  if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy))
868  AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
869  PTy->getAddressSpace());
870 
871  return AccessTy;
872 }
873 
874 /// Return true if this AddRec is already a phi in its loop.
875 static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
876  for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
877  if (SE.isSCEVable(PN.getType()) &&
878  (SE.getEffectiveSCEVType(PN.getType()) ==
879  SE.getEffectiveSCEVType(AR->getType())) &&
880  SE.getSCEV(&PN) == AR)
881  return true;
882  }
883  return false;
884 }
885 
886 /// Check if expanding this expression is likely to incur significant cost. This
887 /// is tricky because SCEV doesn't track which expressions are actually computed
888 /// by the current IR.
889 ///
890 /// We currently allow expansion of IV increments that involve adds,
891 /// multiplication by constants, and AddRecs from existing phis.
892 ///
893 /// TODO: Allow UDivExpr if we can find an existing IV increment that is an
894 /// obvious multiple of the UDivExpr.
895 static bool isHighCostExpansion(const SCEV *S,
896  SmallPtrSetImpl<const SCEV*> &Processed,
897  ScalarEvolution &SE) {
898  // Zero/One operand expressions
899  switch (S->getSCEVType()) {
900  case scUnknown:
901  case scConstant:
902  return false;
903  case scTruncate:
904  return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
905  Processed, SE);
906  case scZeroExtend:
907  return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
908  Processed, SE);
909  case scSignExtend:
910  return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
911  Processed, SE);
912  }
913 
914  if (!Processed.insert(S).second)
915  return false;
916 
917  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
918  for (const SCEV *S : Add->operands()) {
919  if (isHighCostExpansion(S, Processed, SE))
920  return true;
921  }
922  return false;
923  }
924 
925  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
926  if (Mul->getNumOperands() == 2) {
927  // Multiplication by a constant is ok
928  if (isa<SCEVConstant>(Mul->getOperand(0)))
929  return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
930 
931  // If we have the value of one operand, check if an existing
932  // multiplication already generates this expression.
933  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
934  Value *UVal = U->getValue();
935  for (User *UR : UVal->users()) {
936  // If U is a constant, it may be used by a ConstantExpr.
937  Instruction *UI = dyn_cast<Instruction>(UR);
938  if (UI && UI->getOpcode() == Instruction::Mul &&
939  SE.isSCEVable(UI->getType())) {
940  return SE.getSCEV(UI) == Mul;
941  }
942  }
943  }
944  }
945  }
946 
947  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
948  if (isExistingPhi(AR, SE))
949  return false;
950  }
951 
952  // For now, consider any other type of expression (div/mul/min/max) high cost.
953  return true;
954 }
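// A hypothetical example (not from the original source): an increment such as
// (4 * %n) + {0,+,8}<%L> is considered cheap when the addrec is already a phi
// in the loop, whereas any udiv in the expression is treated as high cost
// because no existing IR is known to compute it.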
955 
956 /// If any of the instructions in the specified set are trivially dead, delete
957 /// them and see if this makes any of their operands subsequently dead.
958 static bool
959 DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
960  bool Changed = false;
961 
962  while (!DeadInsts.empty()) {
963  Value *V = DeadInsts.pop_back_val();
964  Instruction *I = dyn_cast_or_null<Instruction>(V);
965 
966  if (!I || !isInstructionTriviallyDead(I))
967  continue;
968 
969  for (Use &O : I->operands())
970  if (Instruction *U = dyn_cast<Instruction>(O)) {
971  O = nullptr;
972  if (U->use_empty())
973  DeadInsts.emplace_back(U);
974  }
975 
976  I->eraseFromParent();
977  Changed = true;
978  }
979 
980  return Changed;
981 }
982 
983 namespace {
984 
985 class LSRUse;
986 
987 } // end anonymous namespace
988 
989 /// Check if the addressing mode defined by \p F is completely
990 /// folded in \p LU at isel time.
991 /// This includes address-mode folding and special icmp tricks.
992 /// This function returns true if \p LU can accommodate what \p F
993 /// defines and up to 1 base + 1 scaled + offset.
994 /// In other words, if \p F has several base registers, this function may
995 /// still return true. Therefore, users still need to account for
996 /// additional base registers and/or unfolded offsets to derive an
997 /// accurate cost model.
998 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
999  const LSRUse &LU, const Formula &F);
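// For illustration (not from the original source): on a target such as x86, a
// formula like reg1 + 4*reg2 + 16 can typically be folded into a single
// addressing mode ([reg1 + reg2*4 + 16]), whereas the same formula with a
// scale of 3 cannot, so it would need an extra multiply or add inside the loop.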
1000 
1001 // Get the cost of the scaling factor used in F for LU.
1002 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
1003  const LSRUse &LU, const Formula &F,
1004  const Loop &L);
1005 
1006 namespace {
1007 
1008 /// This class is used to measure and compare candidate formulae.
1009 class Cost {
1010  TargetTransformInfo::LSRCost C;
1011 
1012 public:
1013  Cost() {
1014  C.Insns = 0;
1015  C.NumRegs = 0;
1016  C.AddRecCost = 0;
1017  C.NumIVMuls = 0;
1018  C.NumBaseAdds = 0;
1019  C.ImmCost = 0;
1020  C.SetupCost = 0;
1021  C.ScaleCost = 0;
1022  }
1023 
1024  bool isLess(Cost &Other, const TargetTransformInfo &TTI);
1025 
1026  void Lose();
1027 
1028 #ifndef NDEBUG
1029  // Once any of the metrics loses, they must all remain losers.
1030  bool isValid() {
1031  return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1032  | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1033  || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1034  & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1035  }
1036 #endif
1037 
1038  bool isLoser() {
1039  assert(isValid() && "invalid cost");
1040  return C.NumRegs == ~0u;
1041  }
1042 
1043  void RateFormula(const TargetTransformInfo &TTI,
1044  const Formula &F,
1045  SmallPtrSetImpl<const SCEV *> &Regs,
1046  const DenseSet<const SCEV *> &VisitedRegs,
1047  const Loop *L,
1048  ScalarEvolution &SE, DominatorTree &DT,
1049  const LSRUse &LU,
1050  SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1051 
1052  void print(raw_ostream &OS) const;
1053  void dump() const;
1054 
1055 private:
1056  void RateRegister(const SCEV *Reg,
1057  SmallPtrSetImpl<const SCEV *> &Regs,
1058  const Loop *L,
1059  ScalarEvolution &SE, DominatorTree &DT,
1060  const TargetTransformInfo &TTI);
1061  void RatePrimaryRegister(const SCEV *Reg,
1062  SmallPtrSetImpl<const SCEV *> &Regs,
1063  const Loop *L,
1064  ScalarEvolution &SE, DominatorTree &DT,
1065  SmallPtrSetImpl<const SCEV *> *LoserRegs,
1066  const TargetTransformInfo &TTI);
1067 };
1068 
1069 /// An operand value in an instruction which is to be replaced with some
1070 /// equivalent, possibly strength-reduced, replacement.
1071 struct LSRFixup {
1072  /// The instruction which will be updated.
1073  Instruction *UserInst = nullptr;
1074 
1075  /// The operand of the instruction which will be replaced. The operand may be
1076  /// used more than once; every instance will be replaced.
1077  Value *OperandValToReplace = nullptr;
1078 
1079  /// If this user is to use the post-incremented value of an induction
1080  /// variable, this set is non-empty and holds the loops associated with the
1081  /// induction variable.
1082  PostIncLoopSet PostIncLoops;
1083 
1084  /// A constant offset to be added to the LSRUse expression. This allows
1085  /// multiple fixups to share the same LSRUse with different offsets, for
1086  /// example in an unrolled loop.
1087  int64_t Offset = 0;
1088 
1089  LSRFixup() = default;
1090 
1091  bool isUseFullyOutsideLoop(const Loop *L) const;
1092 
1093  void print(raw_ostream &OS) const;
1094  void dump() const;
1095 };
1096 
1097 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
1098 /// SmallVectors of const SCEV*.
1099 struct UniquifierDenseMapInfo {
1100  static SmallVector<const SCEV *, 4> getEmptyKey() {
1101  SmallVector<const SCEV *, 4> V;
1102  V.push_back(reinterpret_cast<const SCEV *>(-1));
1103  return V;
1104  }
1105 
1106  static SmallVector<const SCEV *, 4> getTombstoneKey() {
1107  SmallVector<const SCEV *, 4> V;
1108  V.push_back(reinterpret_cast<const SCEV *>(-2));
1109  return V;
1110  }
1111 
1112  static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
1113  return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
1114  }
1115 
1116  static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
1117  const SmallVector<const SCEV *, 4> &RHS) {
1118  return LHS == RHS;
1119  }
1120 };
1121 
1122 /// This class holds the state that LSR keeps for each use in IVUsers, as well
1123 /// as uses invented by LSR itself. It includes information about what kinds of
1124 /// things can be folded into the user, information about the user itself, and
1125 /// information about how the use may be satisfied. TODO: Represent multiple
1126 /// users of the same expression in common?
1127 class LSRUse {
1128  DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
1129 
1130 public:
1131  /// An enum for a kind of use, indicating what types of scaled and immediate
1132  /// operands it might support.
1133  enum KindType {
1134  Basic, ///< A normal use, with no folding.
1135  Special, ///< A special case of basic, allowing -1 scales.
1136  Address, ///< An address use; folding according to TargetLowering
1137  ICmpZero ///< An equality icmp with both operands folded into one.
1138  // TODO: Add a generic icmp too?
1139  };
1140 
1141  using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1142 
1143  KindType Kind;
1144  MemAccessTy AccessTy;
1145 
1146  /// The list of operands which are to be replaced.
1147  SmallVector<LSRFixup, 8> Fixups;
1148 
1149  /// Keep track of the min and max offsets of the fixups.
1150  int64_t MinOffset = std::numeric_limits<int64_t>::max();
1151  int64_t MaxOffset = std::numeric_limits<int64_t>::min();
1152 
1153  /// This records whether all of the fixups using this LSRUse are outside of
1154  /// the loop, in which case some special-case heuristics may be used.
1155  bool AllFixupsOutsideLoop = true;
1156 
1157  /// RigidFormula is set to true to guarantee that this use will be associated
1158  /// with a single formula--the one that initially matched. Some SCEV
1159  /// expressions cannot be expanded. This allows LSR to consider the registers
1160  /// used by those expressions without the need to expand them later after
1161  /// changing the formula.
1162  bool RigidFormula = false;
1163 
1164  /// This records the widest use type for any fixup using this
1165  /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1166  /// fixup widths to be equivalent, because the narrower one may be relying on
1167  /// the implicit truncation to truncate away bogus bits.
1168  Type *WidestFixupType = nullptr;
1169 
1170  /// A list of ways to build a value that can satisfy this user. After the
1171  /// list is populated, one of these is selected heuristically and used to
1172  /// formulate a replacement for OperandValToReplace in UserInst.
1173  SmallVector<Formula, 12> Formulae;
1174 
1175  /// The set of register candidates used by all formulae in this LSRUse.
1176  SmallPtrSet<const SCEV *, 4> Regs;
1177 
1178  LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1179 
1180  LSRFixup &getNewFixup() {
1181  Fixups.push_back(LSRFixup());
1182  return Fixups.back();
1183  }
1184 
1185  void pushFixup(LSRFixup &f) {
1186  Fixups.push_back(f);
1187  if (f.Offset > MaxOffset)
1188  MaxOffset = f.Offset;
1189  if (f.Offset < MinOffset)
1190  MinOffset = f.Offset;
1191  }
1192 
1193  bool HasFormulaWithSameRegs(const Formula &F) const;
1194  float getNotSelectedProbability(const SCEV *Reg) const;
1195  bool InsertFormula(const Formula &F, const Loop &L);
1196  void DeleteFormula(Formula &F);
1197  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1198 
1199  void print(raw_ostream &OS) const;
1200  void dump() const;
1201 };
1202 
1203 } // end anonymous namespace
1204 
1205 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1206  LSRUse::KindType Kind, MemAccessTy AccessTy,
1207  GlobalValue *BaseGV, int64_t BaseOffset,
1208  bool HasBaseReg, int64_t Scale,
1209  Instruction *Fixup = nullptr);
1210 
1211 /// Tally up interesting quantities from the given register.
1212 void Cost::RateRegister(const SCEV *Reg,
1213  SmallPtrSetImpl<const SCEV *> &Regs,
1214  const Loop *L,
1215  ScalarEvolution &SE, DominatorTree &DT,
1216  const TargetTransformInfo &TTI) {
1217  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1218  // If this is an addrec for another loop, it should be an invariant
1219  // with respect to L since L is the innermost loop (at least
1220  // for now LSR only handles innermost loops).
1221  if (AR->getLoop() != L) {
1222  // If the AddRec exists, consider its register free and leave it alone.
1223  if (isExistingPhi(AR, SE))
1224  return;
1225 
1226  // It is bad to allow LSR for current loop to add induction variables
1227  // for its sibling loops.
1228  if (!AR->getLoop()->contains(L)) {
1229  Lose();
1230  return;
1231  }
1232 
1233  // Otherwise, it will be an invariant with respect to Loop L.
1234  ++C.NumRegs;
1235  return;
1236  }
1237 
1238  unsigned LoopCost = 1;
1239  if (TTI.shouldFavorPostInc()) {
1240  const SCEV *LoopStep = AR->getStepRecurrence(SE);
1241  if (isa<SCEVConstant>(LoopStep)) {
1242  // Check if a post-indexed load/store can be used.
1243  if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
1244  TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
1245  const SCEV *LoopStart = AR->getStart();
1246  if (!isa<SCEVConstant>(LoopStart) &&
1247  SE.isLoopInvariant(LoopStart, L))
1248  LoopCost = 0;
1249  }
1250  }
1251  }
1252  C.AddRecCost += LoopCost;
1253 
1254  // Add the step value register, if it needs one.
1255  // TODO: The non-affine case isn't precisely modeled here.
1256  if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1257  if (!Regs.count(AR->getOperand(1))) {
1258  RateRegister(AR->getOperand(1), Regs, L, SE, DT, TTI);
1259  if (isLoser())
1260  return;
1261  }
1262  }
1263  }
1264  ++C.NumRegs;
1265 
1266  // Rough heuristic; favor registers which don't require extra setup
1267  // instructions in the preheader.
1268  if (!isa<SCEVUnknown>(Reg) &&
1269  !isa<SCEVConstant>(Reg) &&
1270  !(isa<SCEVAddRecExpr>(Reg) &&
1271  (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
1272  isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
1273  ++C.SetupCost;
1274 
1275  C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1276  SE.hasComputableLoopEvolution(Reg, L);
1277 }
1278 
1279 /// Record this register in the set. If we haven't seen it before, rate
1280 /// it. Optional LoserRegs provides a way to declare any formula that refers to
1281 /// one of those regs an instant loser.
1282 void Cost::RatePrimaryRegister(const SCEV *Reg,
1283  SmallPtrSetImpl<const SCEV *> &Regs,
1284  const Loop *L,
1285  ScalarEvolution &SE, DominatorTree &DT,
1286  SmallPtrSetImpl<const SCEV *> *LoserRegs,
1287  const TargetTransformInfo &TTI) {
1288  if (LoserRegs && LoserRegs->count(Reg)) {
1289  Lose();
1290  return;
1291  }
1292  if (Regs.insert(Reg).second) {
1293  RateRegister(Reg, Regs, L, SE, DT, TTI);
1294  if (LoserRegs && isLoser())
1295  LoserRegs->insert(Reg);
1296  }
1297 }
1298 
1299 void Cost::RateFormula(const TargetTransformInfo &TTI,
1300  const Formula &F,
1301  SmallPtrSetImpl<const SCEV *> &Regs,
1302  const DenseSet<const SCEV *> &VisitedRegs,
1303  const Loop *L,
1304  ScalarEvolution &SE, DominatorTree &DT,
1305  const LSRUse &LU,
1306  SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1307  assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1308  // Tally up the registers.
1309  unsigned PrevAddRecCost = C.AddRecCost;
1310  unsigned PrevNumRegs = C.NumRegs;
1311  unsigned PrevNumBaseAdds = C.NumBaseAdds;
1312  if (const SCEV *ScaledReg = F.ScaledReg) {
1313  if (VisitedRegs.count(ScaledReg)) {
1314  Lose();
1315  return;
1316  }
1317  RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs, TTI);
1318  if (isLoser())
1319  return;
1320  }
1321  for (const SCEV *BaseReg : F.BaseRegs) {
1322  if (VisitedRegs.count(BaseReg)) {
1323  Lose();
1324  return;
1325  }
1326  RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs, TTI);
1327  if (isLoser())
1328  return;
1329  }
1330 
1331  // Determine how many (unfolded) adds we'll need inside the loop.
1332  size_t NumBaseParts = F.getNumRegs();
1333  if (NumBaseParts > 1)
1334  // Do not count the base and a possible second register if the target
1335  // allows folding 2 registers.
1336  C.NumBaseAdds +=
1337  NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
1338  C.NumBaseAdds += (F.UnfoldedOffset != 0);
1339 
1340  // Accumulate non-free scaling amounts.
1341  C.ScaleCost += getScalingFactorCost(TTI, LU, F, *L);
1342 
1343  // Tally up the non-zero immediates.
1344  for (const LSRFixup &Fixup : LU.Fixups) {
1345  int64_t O = Fixup.Offset;
1346  int64_t Offset = (uint64_t)O + F.BaseOffset;
1347  if (F.BaseGV)
1348  C.ImmCost += 64; // Handle symbolic values conservatively.
1349  // TODO: This should probably be the pointer size.
1350  else if (Offset != 0)
1351  C.ImmCost += APInt(64, Offset, true).getMinSignedBits();
1352 
1353  // Check with target if this offset with this instruction is
1354  // specifically not supported.
1355  if (LU.Kind == LSRUse::Address && Offset != 0 &&
1356  !isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1357  Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1358  C.NumBaseAdds++;
1359  }
1360 
1361  // If we don't count instruction cost, exit here.
1362  if (!InsnsCost) {
1363  assert(isValid() && "invalid cost");
1364  return;
1365  }
1366 
1367  // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1368  // an additional instruction (at least a fill).
1369  unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
1370  if (C.NumRegs > TTIRegNum) {
1371  // If the cost already exceeded TTIRegNum, only newly added registers can
1372  // add new instructions.
1373  if (PrevNumRegs > TTIRegNum)
1374  C.Insns += (C.NumRegs - PrevNumRegs);
1375  else
1376  C.Insns += (C.NumRegs - TTIRegNum);
1377  }
1378 
1379  // If an ICmpZero formula does not end in 0, it cannot be replaced by just an
1380  // add or a sub. We'll need to compare the final result of the AddRec.
1381  // That means we'll need an additional instruction. But if the target can
1382  // macro-fuse a compare with a branch, don't count this extra instruction.
1383  // For -10 + {0, +, 1}:
1384  // i = i + 1;
1385  // cmp i, 10
1386  //
1387  // For {-10, +, 1}:
1388  // i = i + 1;
1389  if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() && !TTI.canMacroFuseCmp())
1390  C.Insns++;
1391  // Each new AddRec adds 1 instruction to the calculation.
1392  C.Insns += (C.AddRecCost - PrevAddRecCost);
1393 
1394  // BaseAdds adds instructions for unfolded registers.
1395  if (LU.Kind != LSRUse::ICmpZero)
1396  C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1397  assert(isValid() && "invalid cost");
1398 }
1399 
1400 /// Set this cost to a losing value.
1401 void Cost::Lose() {
1402  C.Insns = std::numeric_limits<unsigned>::max();
1403  C.NumRegs = std::numeric_limits<unsigned>::max();
1404  C.AddRecCost = std::numeric_limits<unsigned>::max();
1405  C.NumIVMuls = std::numeric_limits<unsigned>::max();
1406  C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1407  C.ImmCost = std::numeric_limits<unsigned>::max();
1408  C.SetupCost = std::numeric_limits<unsigned>::max();
1409  C.ScaleCost = std::numeric_limits<unsigned>::max();
1410 }
1411 
1412 /// Choose the lower cost.
1413 bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) {
1414  if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1415  C.Insns != Other.C.Insns)
1416  return C.Insns < Other.C.Insns;
1417  return TTI.isLSRCostLess(C, Other.C);
1418 }
1419 
1420 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1421 void Cost::print(raw_ostream &OS) const {
1422  if (InsnsCost)
1423  OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1424  OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1425  if (C.AddRecCost != 0)
1426  OS << ", with addrec cost " << C.AddRecCost;
1427  if (C.NumIVMuls != 0)
1428  OS << ", plus " << C.NumIVMuls << " IV mul"
1429  << (C.NumIVMuls == 1 ? "" : "s");
1430  if (C.NumBaseAdds != 0)
1431  OS << ", plus " << C.NumBaseAdds << " base add"
1432  << (C.NumBaseAdds == 1 ? "" : "s");
1433  if (C.ScaleCost != 0)
1434  OS << ", plus " << C.ScaleCost << " scale cost";
1435  if (C.ImmCost != 0)
1436  OS << ", plus " << C.ImmCost << " imm cost";
1437  if (C.SetupCost != 0)
1438  OS << ", plus " << C.SetupCost << " setup cost";
1439 }
1440 
1441 LLVM_DUMP_METHOD void Cost::dump() const {
1442  print(errs()); errs() << '\n';
1443 }
1444 #endif
1445 
1446 /// Test whether this fixup always uses its value outside of the given loop.
1447 bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1448  // PHI nodes use their value in their incoming blocks.
1449  if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1450  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1451  if (PN->getIncomingValue(i) == OperandValToReplace &&
1452  L->contains(PN->getIncomingBlock(i)))
1453  return false;
1454  return true;
1455  }
1456 
1457  return !L->contains(UserInst);
1458 }
1459 
1460 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1461 void LSRFixup::print(raw_ostream &OS) const {
1462  OS << "UserInst=";
1463  // Store is common and interesting enough to be worth special-casing.
1464  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1465  OS << "store ";
1466  Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1467  } else if (UserInst->getType()->isVoidTy())
1468  OS << UserInst->getOpcodeName();
1469  else
1470  UserInst->printAsOperand(OS, /*PrintType=*/false);
1471 
1472  OS << ", OperandValToReplace=";
1473  OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1474 
1475  for (const Loop *PIL : PostIncLoops) {
1476  OS << ", PostIncLoop=";
1477  PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1478  }
1479 
1480  if (Offset != 0)
1481  OS << ", Offset=" << Offset;
1482 }
1483 
1484 LLVM_DUMP_METHOD void LSRFixup::dump() const {
1485  print(errs()); errs() << '\n';
1486 }
1487 #endif
1488 
1489 /// Test whether this use has a formula which has the same registers as the
1490 /// given formula.
1491 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1492  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1493  if (F.ScaledReg) Key.push_back(F.ScaledReg);
1494  // Unstable sort by host order ok, because this is only used for uniquifying.
1495  llvm::sort(Key);
1496  return Uniquifier.count(Key);
1497 }
1498 
1499 /// Return the probability of selecting a formula that does not reference Reg.
1500 float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1501  unsigned FNum = 0;
1502  for (const Formula &F : Formulae)
1503  if (F.referencesReg(Reg))
1504  FNum++;
1505  return ((float)(Formulae.size() - FNum)) / Formulae.size();
1506 }
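// A worked example (not from the original source): with 8 formulae of which 2
// reference Reg, the probability of ending up with a formula that avoids Reg
// is (8 - 2) / 8 = 0.75.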
1507 
1508 /// If the given formula has not yet been inserted, add it to the list, and
1509 /// return true. Return false otherwise. The formula must be in canonical form.
1510 bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1511  assert(F.isCanonical(L) && "Invalid canonical representation");
1512 
1513  if (!Formulae.empty() && RigidFormula)
1514  return false;
1515 
1516  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1517  if (F.ScaledReg) Key.push_back(F.ScaledReg);
1518  // Unstable sort by host order ok, because this is only used for uniquifying.
1519  llvm::sort(Key);
1520 
1521  if (!Uniquifier.insert(Key).second)
1522  return false;
1523 
1524  // Using a register to hold the value of 0 is not profitable.
1525  assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1526  "Zero allocated in a scaled register!");
1527 #ifndef NDEBUG
1528  for (const SCEV *BaseReg : F.BaseRegs)
1529  assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1530 #endif
1531 
1532  // Add the formula to the list.
1533  Formulae.push_back(F);
1534 
1535  // Record registers now being used by this use.
1536  Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1537  if (F.ScaledReg)
1538  Regs.insert(F.ScaledReg);
1539 
1540  return true;
1541 }
1542 
1543 /// Remove the given formula from this use's list.
1544 void LSRUse::DeleteFormula(Formula &F) {
1545  if (&F != &Formulae.back())
1546  std::swap(F, Formulae.back());
1547  Formulae.pop_back();
1548 }
1549 
1550 /// Recompute the Regs field, and update RegUses.
1551 void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1552  // Now that we've filtered out some formulae, recompute the Regs set.
1553  SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1554  Regs.clear();
1555  for (const Formula &F : Formulae) {
1556  if (F.ScaledReg) Regs.insert(F.ScaledReg);
1557  Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1558  }
1559 
1560  // Update the RegTracker.
1561  for (const SCEV *S : OldRegs)
1562  if (!Regs.count(S))
1563  RegUses.dropRegister(S, LUIdx);
1564 }
1565 
1566 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1567 void LSRUse::print(raw_ostream &OS) const {
1568  OS << "LSR Use: Kind=";
1569  switch (Kind) {
1570  case Basic: OS << "Basic"; break;
1571  case Special: OS << "Special"; break;
1572  case ICmpZero: OS << "ICmpZero"; break;
1573  case Address:
1574  OS << "Address of ";
1575  if (AccessTy.MemTy->isPointerTy())
1576  OS << "pointer"; // the full pointer type could be really verbose
1577  else {
1578  OS << *AccessTy.MemTy;
1579  }
1580 
1581  OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1582  }
1583 
1584  OS << ", Offsets={";
1585  bool NeedComma = false;
1586  for (const LSRFixup &Fixup : Fixups) {
1587  if (NeedComma) OS << ',';
1588  OS << Fixup.Offset;
1589  NeedComma = true;
1590  }
1591  OS << '}';
1592 
1593  if (AllFixupsOutsideLoop)
1594  OS << ", all-fixups-outside-loop";
1595 
1596  if (WidestFixupType)
1597  OS << ", widest fixup type: " << *WidestFixupType;
1598 }
1599 
1600 LLVM_DUMP_METHOD void LSRUse::dump() const {
1601  print(errs()); errs() << '\n';
1602 }
1603 #endif
1604 
1605 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1606  LSRUse::KindType Kind, MemAccessTy AccessTy,
1607  GlobalValue *BaseGV, int64_t BaseOffset,
1608  bool HasBaseReg, int64_t Scale,
1609  Instruction *Fixup/*= nullptr*/) {
1610  switch (Kind) {
1611  case LSRUse::Address:
1612  return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
1613  HasBaseReg, Scale, AccessTy.AddrSpace, Fixup);
1614 
1615  case LSRUse::ICmpZero:
1616  // There's not even a target hook for querying whether it would be legal to
1617  // fold a GV into an ICmp.
1618  if (BaseGV)
1619  return false;
1620 
1621  // ICmp only has two operands; don't allow more than two non-trivial parts.
1622  if (Scale != 0 && HasBaseReg && BaseOffset != 0)
1623  return false;
1624 
1625  // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1626  // putting the scaled register in the other operand of the icmp.
1627  if (Scale != 0 && Scale != -1)
1628  return false;
1629 
1630  // If we have low-level target information, ask the target if it can fold an
1631  // integer immediate on an icmp.
1632  if (BaseOffset != 0) {
1633  // We have one of:
1634  // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1635  // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1636  // Offs is the ICmp immediate.
1637  if (Scale == 0)
1638  // The cast does the right thing with
1639  // std::numeric_limits<int64_t>::min().
1640  BaseOffset = -(uint64_t)BaseOffset;
1641  return TTI.isLegalICmpImmediate(BaseOffset);
1642  }
1643 
1644  // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1645  return true;
1646 
1647  case LSRUse::Basic:
1648  // Only handle single-register values.
1649  return !BaseGV && Scale == 0 && BaseOffset == 0;
1650 
1651  case LSRUse::Special:
1652  // Special case Basic to handle -1 scales.
1653  return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
1654  }
1655 
1656  llvm_unreachable("Invalid LSRUse Kind!");
1657 }
1658 
1659 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1660  int64_t MinOffset, int64_t MaxOffset,
1661  LSRUse::KindType Kind, MemAccessTy AccessTy,
1662  GlobalValue *BaseGV, int64_t BaseOffset,
1663  bool HasBaseReg, int64_t Scale) {
1664  // Check for overflow.
1665  if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
1666  (MinOffset > 0))
1667  return false;
1668  MinOffset = (uint64_t)BaseOffset + MinOffset;
1669  if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
1670  (MaxOffset > 0))
1671  return false;
1672  MaxOffset = (uint64_t)BaseOffset + MaxOffset;
1673 
1674  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1675  HasBaseReg, Scale) &&
1676  isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1677  HasBaseReg, Scale);
1678 }
1679 
1680 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1681  int64_t MinOffset, int64_t MaxOffset,
1682  LSRUse::KindType Kind, MemAccessTy AccessTy,
1683  const Formula &F, const Loop &L) {
1684  // For the purpose of isAMCompletelyFolded either having a canonical formula
1685  // or a scale not equal to zero is correct.
1686  // Problems may arise from non-canonical formulae having a scale == 0.
1687  // Strictly speaking it would be best to just rely on canonical formulae.
1688  // However, when we generate the scaled formulae, we first check that the
1689  // scaling factor is profitable before computing the actual ScaledReg, for
1690  // compile time's sake.
1691  assert((F.isCanonical(L) || F.Scale != 0));
1692  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1693  F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1694 }
1695 
1696 /// Test whether we know how to expand the current formula.
1697 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1698  int64_t MaxOffset, LSRUse::KindType Kind,
1699  MemAccessTy AccessTy, GlobalValue *BaseGV,
1700  int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
1701  // We know how to expand completely foldable formulae.
1702  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1703  BaseOffset, HasBaseReg, Scale) ||
1704  // Or formulae that use a base register produced by a sum of base
1705  // registers.
1706  (Scale == 1 &&
1707  isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1708  BaseGV, BaseOffset, true, 0));
1709 }
1710 
1711 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1712  int64_t MaxOffset, LSRUse::KindType Kind,
1713  MemAccessTy AccessTy, const Formula &F) {
1714  return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1715  F.BaseOffset, F.HasBaseReg, F.Scale);
1716 }
1717 
1718 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1719  const LSRUse &LU, const Formula &F) {
1720  // Target may want to look at the user instructions.
1721  if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1722  for (const LSRFixup &Fixup : LU.Fixups)
1723  if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1724  (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1725  F.Scale, Fixup.UserInst))
1726  return false;
1727  return true;
1728  }
1729 
1730  return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1731  LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1732  F.Scale);
1733 }
1734 
1735 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
1736  const LSRUse &LU, const Formula &F,
1737  const Loop &L) {
1738  if (!F.Scale)
1739  return 0;
1740 
1741  // If the use is not completely folded in that instruction, we will have to
1742  // pay an extra cost only for scale != 1.
1743  if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1744  LU.AccessTy, F, L))
1745  return F.Scale != 1;
1746 
1747  switch (LU.Kind) {
1748  case LSRUse::Address: {
1749  // Check the scaling factor cost with both the min and max offsets.
1750  int ScaleCostMinOffset = TTI.getScalingFactorCost(
1751  LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
1752  F.Scale, LU.AccessTy.AddrSpace);
1753  int ScaleCostMaxOffset = TTI.getScalingFactorCost(
1754  LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
1755  F.Scale, LU.AccessTy.AddrSpace);
1756 
1757  assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
1758  "Legal addressing mode has an illegal cost!");
1759  return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1760  }
1761  case LSRUse::ICmpZero:
1762  case LSRUse::Basic:
1763  case LSRUse::Special:
1764  // The use is completely folded, i.e., everything is folded into the
1765  // instruction.
1766  return 0;
1767  }
1768 
1769  llvm_unreachable("Invalid LSRUse Kind!");
1770 }
1771 
1772 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1773  LSRUse::KindType Kind, MemAccessTy AccessTy,
1774  GlobalValue *BaseGV, int64_t BaseOffset,
1775  bool HasBaseReg) {
1776  // Fast-path: zero is always foldable.
1777  if (BaseOffset == 0 && !BaseGV) return true;
1778 
1779  // Conservatively, create an address with an immediate and a
1780  // base and a scale.
1781  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1782 
1783  // Canonicalize a scale of 1 to a base register if the formula doesn't
1784  // already have a base register.
1785  if (!HasBaseReg && Scale == 1) {
1786  Scale = 0;
1787  HasBaseReg = true;
1788  }
1789 
1790  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
1791  HasBaseReg, Scale);
1792 }
1793 
1794 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1795  ScalarEvolution &SE, int64_t MinOffset,
1796  int64_t MaxOffset, LSRUse::KindType Kind,
1797  MemAccessTy AccessTy, const SCEV *S,
1798  bool HasBaseReg) {
1799  // Fast-path: zero is always foldable.
1800  if (S->isZero()) return true;
1801 
1802  // Conservatively, create an address with an immediate and a
1803  // base and a scale.
1804  int64_t BaseOffset = ExtractImmediate(S, SE);
1805  GlobalValue *BaseGV = ExtractSymbol(S, SE);
1806 
1807  // If there's anything else involved, it's not foldable.
1808  if (!S->isZero()) return false;
1809 
1810  // Fast-path: zero is always foldable.
1811  if (BaseOffset == 0 && !BaseGV) return true;
1812 
1813  // Conservatively, create an address with an immediate and a
1814  // base and a scale.
1815  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1816 
1817  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1818  BaseOffset, HasBaseReg, Scale);
1819 }
1820 
1821 namespace {
1822 
1823 /// An individual increment in a Chain of IV increments. Relate an IV user to
1824 /// an expression that computes the IV it uses from the IV used by the previous
1825 /// link in the Chain.
1826 ///
1827 /// For the head of a chain, IncExpr holds the absolute SCEV expression for the
1828 /// original IVOperand. The head of the chain's IVOperand is only valid during
1829 /// chain collection, before LSR replaces IV users. During chain generation,
1830 /// IncExpr can be used to find the new IVOperand that computes the same
1831 /// expression.
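///
/// A rough sketch added for illustration (not part of the original comment):
/// for a source loop such as
///
///   for (unsigned i = 0; i != n; ++i) {
///     sum += p[i];      // head link: IncExpr is the SCEV of p[i]'s address
///     sum += p[i + 4];  // next link: IncExpr is the constant byte offset
///   }                   //            4 * sizeof(*p) from the previous link
///
/// each IVInc records the user instruction, the IV-derived operand it uses,
/// and the expression relating that operand to the previous link.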
1832 struct IVInc {
1833  Instruction *UserInst;
1834  Value* IVOperand;
1835  const SCEV *IncExpr;
1836 
1837  IVInc(Instruction *U, Value *O, const SCEV *E)
1838  : UserInst(U), IVOperand(O), IncExpr(E) {}
1839 };
1840 
1841 // The list of IV increments in program order. We typically add the head of a
1842 // chain without finding subsequent links.
1843 struct IVChain {
1844  SmallVector<IVInc, 1> Incs;
1845  const SCEV *ExprBase = nullptr;
1846 
1847  IVChain() = default;
1848  IVChain(const IVInc &Head, const SCEV *Base)
1849  : Incs(1, Head), ExprBase(Base) {}
1850 
1851  using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
1852 
1853  // Return the first increment in the chain.
1854  const_iterator begin() const {
1855  assert(!Incs.empty());
1856  return std::next(Incs.begin());
1857  }
1858  const_iterator end() const {
1859  return Incs.end();
1860  }
1861 
1862  // Returns true if this chain contains any increments.
1863  bool hasIncs() const { return Incs.size() >= 2; }
1864 
1865  // Add an IVInc to the end of this chain.
1866  void add(const IVInc &X) { Incs.push_back(X); }
1867 
1868  // Returns the last UserInst in the chain.
1869  Instruction *tailUserInst() const { return Incs.back().UserInst; }
1870 
1871  // Returns true if IncExpr can be profitably added to this chain.
1872  bool isProfitableIncrement(const SCEV *OperExpr,
1873  const SCEV *IncExpr,
1874  ScalarEvolution&);
1875 };
1876 
1877 /// Helper for CollectChains to track multiple IV increment uses. Distinguish
1878 /// between FarUsers that definitely cross IV increments and NearUsers that may
1879 /// be used between IV increments.
1880 struct ChainUsers {
1881  SmallPtrSet<Instruction*, 4> FarUsers;
1882  SmallPtrSet<Instruction*, 4> NearUsers;
1883 };
1884 
1885 /// This class holds state for the main loop strength reduction logic.
1886 class LSRInstance {
1887  IVUsers &IU;
1888  ScalarEvolution &SE;
1889  DominatorTree &DT;
1890  LoopInfo &LI;
1891  const TargetTransformInfo &TTI;
1892  Loop *const L;
1893  bool Changed = false;
1894 
1895  /// This is the insert position that the current loop's induction variable
1896  /// increment should be placed. In simple loops, this is the latch block's
1897  /// terminator. But in more complicated cases, this is a position which will
1898  /// dominate all the in-loop post-increment users.
1899  Instruction *IVIncInsertPos = nullptr;
1900 
1901  /// Interesting factors between use strides.
1902  ///
1903  /// We explicitly use a SetVector which contains a SmallSet, instead of the
1904  /// default, a SmallDenseSet, because we need to use the full range of
1905  /// int64_ts, and there's currently no good way of doing that with
1906  /// SmallDenseSet.
1907  SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
1908 
1909  /// Interesting use types, to facilitate truncation reuse.
1910  SmallSetVector<Type *, 4> Types;
1911 
1912  /// The list of interesting uses.
1913  SmallVector<LSRUse, 16> Uses;
1914 
1915  /// Track which uses use which register candidates.
1916  RegUseTracker RegUses;
1917 
1918  // Limit the number of chains to avoid quadratic behavior. We don't expect to
1919  // have more than a few IV increment chains in a loop. Missing a Chain falls
1920  // back to normal LSR behavior for those uses.
1921  static const unsigned MaxChains = 8;
1922 
1923  /// IV users can form a chain of IV increments.
1924  SmallVector<IVChain, 8> IVChainVec;
1925 
1926  /// IV users that belong to profitable IVChains.
1927  SmallPtrSet<Use*, MaxChains> IVIncSet;
1928 
1929  void OptimizeShadowIV();
1930  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
1931  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
1932  void OptimizeLoopTermCond();
1933 
1934  void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
1935  SmallVectorImpl<ChainUsers> &ChainUsersVec);
1936  void FinalizeChain(IVChain &Chain);
1937  void CollectChains();
1938  void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
1939  SmallVectorImpl<WeakTrackingVH> &DeadInsts);
1940 
1941  void CollectInterestingTypesAndFactors();
1942  void CollectFixupsAndInitialFormulae();
1943 
1944  // Support for sharing of LSRUses between LSRFixups.
1945  using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
1946  UseMapTy UseMap;
1947 
1948  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
1949  LSRUse::KindType Kind, MemAccessTy AccessTy);
1950 
1951  std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
1952  MemAccessTy AccessTy);
1953 
1954  void DeleteUse(LSRUse &LU, size_t LUIdx);
1955 
1956  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
1957 
1958  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1959  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1960  void CountRegisters(const Formula &F, size_t LUIdx);
1961  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
1962 
1963  void CollectLoopInvariantFixupsAndFormulae();
1964 
1965  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
1966  unsigned Depth = 0);
1967 
1968  void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
1969  const Formula &Base, unsigned Depth,
1970  size_t Idx, bool IsScaledReg = false);
1971  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
1972  void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
1973  const Formula &Base, size_t Idx,
1974  bool IsScaledReg = false);
1975  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1976  void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
1977  const Formula &Base,
1978  const SmallVectorImpl<int64_t> &Worklist,
1979  size_t Idx, bool IsScaledReg = false);
1980  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1981  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1982  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1983  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
1984  void GenerateCrossUseConstantOffsets();
1985  void GenerateAllReuseFormulae();
1986 
1987  void FilterOutUndesirableDedicatedRegisters();
1988 
1989  size_t EstimateSearchSpaceComplexity() const;
1990  void NarrowSearchSpaceByDetectingSupersets();
1991  void NarrowSearchSpaceByCollapsingUnrolledCode();
1992  void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
1993  void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
1994  void NarrowSearchSpaceByDeletingCostlyFormulas();
1995  void NarrowSearchSpaceByPickingWinnerRegs();
1996  void NarrowSearchSpaceUsingHeuristics();
1997 
1998  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
1999  Cost &SolutionCost,
2000  SmallVectorImpl<const Formula *> &Workspace,
2001  const Cost &CurCost,
2002  const SmallPtrSet<const SCEV *, 16> &CurRegs,
2003  DenseSet<const SCEV *> &VisitedRegs) const;
2004  void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2005 
2006  BasicBlock::iterator
2007  HoistInsertPosition(BasicBlock::iterator IP,
2008  const SmallVectorImpl<Instruction *> &Inputs) const;
2009  BasicBlock::iterator
2010  AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2011  const LSRFixup &LF,
2012  const LSRUse &LU,
2013  SCEVExpander &Rewriter) const;
2014 
2015  Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2016  BasicBlock::iterator IP, SCEVExpander &Rewriter,
2017  SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2018  void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2019  const Formula &F, SCEVExpander &Rewriter,
2020  SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2021  void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2022  SCEVExpander &Rewriter,
2023  SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2024  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2025 
2026 public:
2027  LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2028  LoopInfo &LI, const TargetTransformInfo &TTI);
2029 
2030  bool getChanged() const { return Changed; }
2031 
2032  void print_factors_and_types(raw_ostream &OS) const;
2033  void print_fixups(raw_ostream &OS) const;
2034  void print_uses(raw_ostream &OS) const;
2035  void print(raw_ostream &OS) const;
2036  void dump() const;
2037 };
2038 
2039 } // end anonymous namespace
2040 
2041 /// If IV is used in a int-to-float cast inside the loop then try to eliminate
2042 /// the cast operation.
2043 void LSRInstance::OptimizeShadowIV() {
2044  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2045  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2046  return;
2047 
2048  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2049  UI != E; /* empty */) {
2050  IVUsers::const_iterator CandidateUI = UI;
2051  ++UI;
2052  Instruction *ShadowUse = CandidateUI->getUser();
2053  Type *DestTy = nullptr;
2054  bool IsSigned = false;
2055 
2056  /* If the shadow use is an int->float cast then insert a second IV
2057  to eliminate this cast.
2058 
2059  for (unsigned i = 0; i < n; ++i)
2060  foo((double)i);
2061 
2062  is transformed into
2063 
2064  double d = 0.0;
2065  for (unsigned i = 0; i < n; ++i, ++d)
2066  foo(d);
2067  */
2068  if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2069  IsSigned = false;
2070  DestTy = UCast->getDestTy();
2071  }
2072  else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2073  IsSigned = true;
2074  DestTy = SCast->getDestTy();
2075  }
2076  if (!DestTy) continue;
2077 
2078  // If target does not support DestTy natively then do not apply
2079  // this transformation.
2080  if (!TTI.isTypeLegal(DestTy)) continue;
2081 
2082  PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2083  if (!PH) continue;
2084  if (PH->getNumIncomingValues() != 2) continue;
2085 
2086  // If the calculation in integers overflows, the result in FP type will
2087  // differ. So we can only do this transformation if we are guaranteed not
2088  // to deal with overflowing values.
2089  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2090  if (!AR) continue;
2091  if (IsSigned && !AR->hasNoSignedWrap()) continue;
2092  if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2093 
2094  Type *SrcTy = PH->getType();
2095  int Mantissa = DestTy->getFPMantissaWidth();
2096  if (Mantissa == -1) continue;
2097  if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2098  continue;
2099 
2100  unsigned Entry, Latch;
2101  if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2102  Entry = 0;
2103  Latch = 1;
2104  } else {
2105  Entry = 1;
2106  Latch = 0;
2107  }
2108 
2109  ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2110  if (!Init) continue;
2111  Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2112  (double)Init->getSExtValue() :
2113  (double)Init->getZExtValue());
2114 
2115  BinaryOperator *Incr =
2116  dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2117  if (!Incr) continue;
2118  if (Incr->getOpcode() != Instruction::Add
2119  && Incr->getOpcode() != Instruction::Sub)
2120  continue;
2121 
2122  /* Initialize new IV, double d = 0.0 in above example. */
2123  ConstantInt *C = nullptr;
2124  if (Incr->getOperand(0) == PH)
2125  C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2126  else if (Incr->getOperand(1) == PH)
2127  C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2128  else
2129  continue;
2130 
2131  if (!C) continue;
2132 
2133  // Ignore negative constants, as the code below doesn't handle them
2134  // correctly. TODO: Remove this restriction.
2135  if (!C->getValue().isStrictlyPositive()) continue;
2136 
2137  /* Add new PHINode. */
2138  PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
2139 
2140  /* create new increment. '++d' in above example. */
2141  Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2142  BinaryOperator *NewIncr =
2143  BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
2144  Instruction::FAdd : Instruction::FSub,
2145  NewPH, CFP, "IV.S.next.", Incr);
2146 
2147  NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2148  NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2149 
2150  /* Remove cast operation */
2151  ShadowUse->replaceAllUsesWith(NewPH);
2152  ShadowUse->eraseFromParent();
2153  Changed = true;
2154  break;
2155  }
2156 }
2157 
2158 /// If Cond has an operand that is an expression of an IV, set the IV user and
2159 /// stride information and return true, otherwise return false.
2160 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2161  for (IVStrideUse &U : IU)
2162  if (U.getUser() == Cond) {
2163  // NOTE: we could handle setcc instructions with multiple uses here, but
2164  // InstCombine does it as well for simple uses, and it's not clear that it
2165  // occurs enough in real life to be worth handling.
2166  CondUse = &U;
2167  return true;
2168  }
2169  return false;
2170 }
2171 
2172 /// Rewrite the loop's terminating condition if it uses a max computation.
2173 ///
2174 /// This is a narrow solution to a specific, but acute, problem. For loops
2175 /// like this:
2176 ///
2177 /// i = 0;
2178 /// do {
2179 /// p[i] = 0.0;
2180 /// } while (++i < n);
2181 ///
2182 /// the trip count isn't just 'n', because 'n' might not be positive. And
2183 /// unfortunately this can come up even for loops where the user didn't use
2184 /// a C do-while loop. For example, seemingly well-behaved top-test loops
2185 /// will commonly be lowered like this:
2186 ///
2187 /// if (n > 0) {
2188 /// i = 0;
2189 /// do {
2190 /// p[i] = 0.0;
2191 /// } while (++i < n);
2192 /// }
2193 ///
2194 /// and then it's possible for subsequent optimization to obscure the if
2195 /// test in such a way that indvars can't find it.
2196 ///
2197 /// When indvars can't find the if test in loops like this, it creates a
2198 /// max expression, which allows it to give the loop a canonical
2199 /// induction variable:
2200 ///
2201 /// i = 0;
2202 /// max = n < 1 ? 1 : n;
2203 /// do {
2204 /// p[i] = 0.0;
2205 /// } while (++i != max);
2206 ///
2207 /// Canonical induction variables are necessary because the loop passes
2208 /// are designed around them. The most obvious example of this is the
2209 /// LoopInfo analysis, which doesn't remember trip count values. It
2210 /// expects to be able to rediscover the trip count each time it is
2211 /// needed, and it does this using a simple analysis that only succeeds if
2212 /// the loop has a canonical induction variable.
2213 ///
2214 /// However, when it comes time to generate code, the maximum operation
2215 /// can be quite costly, especially if it's inside of an outer loop.
2216 ///
2217 /// This function solves this problem by detecting this type of loop and
2218 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2219 /// the instructions for the maximum computation.
2220 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2221  // Check that the loop matches the pattern we're looking for.
2222  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2223  Cond->getPredicate() != CmpInst::ICMP_NE)
2224  return Cond;
2225 
2226  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2227  if (!Sel || !Sel->hasOneUse()) return Cond;
2228 
2229  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2230  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2231  return Cond;
2232  const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2233 
2234  // Add one to the backedge-taken count to get the trip count.
2235  const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2236  if (IterationCount != SE.getSCEV(Sel)) return Cond;
2237 
2238  // Check for a max calculation that matches the pattern. There's no check
2239  // for ICMP_ULE here because the comparison would be with zero, which
2240  // isn't interesting.
2241  CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2242  const SCEVNAryExpr *Max = nullptr;
2243  if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2244  Pred = ICmpInst::ICMP_SLE;
2245  Max = S;
2246  } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2247  Pred = ICmpInst::ICMP_SLT;
2248  Max = S;
2249  } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2250  Pred = ICmpInst::ICMP_ULT;
2251  Max = U;
2252  } else {
2253  // No match; bail.
2254  return Cond;
2255  }
2256 
2257  // To handle a max with more than two operands, this optimization would
2258  // require additional checking and setup.
2259  if (Max->getNumOperands() != 2)
2260  return Cond;
2261 
2262  const SCEV *MaxLHS = Max->getOperand(0);
2263  const SCEV *MaxRHS = Max->getOperand(1);
2264 
2265  // ScalarEvolution canonicalizes constants to the left. For < and >, look
2266  // for a comparison with 1. For <= and >=, a comparison with zero.
2267  if (!MaxLHS ||
2268  (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2269  return Cond;
2270 
2271  // Check the relevant induction variable for conformance to
2272  // the pattern.
2273  const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2274  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
2275  if (!AR || !AR->isAffine() ||
2276  AR->getStart() != One ||
2277  AR->getStepRecurrence(SE) != One)
2278  return Cond;
2279 
2280  assert(AR->getLoop() == L &&
2281  "Loop condition operand is an addrec in a different loop!");
2282 
2283  // Check the right operand of the select, and remember it, as it will
2284  // be used in the new comparison instruction.
2285  Value *NewRHS = nullptr;
2286  if (ICmpInst::isTrueWhenEqual(Pred)) {
2287  // Look for n+1, and grab n.
2288  if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2289  if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2290  if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2291  NewRHS = BO->getOperand(0);
2292  if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2293  if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2294  if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2295  NewRHS = BO->getOperand(0);
2296  if (!NewRHS)
2297  return Cond;
2298  } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2299  NewRHS = Sel->getOperand(1);
2300  else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2301  NewRHS = Sel->getOperand(2);
2302  else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2303  NewRHS = SU->getValue();
2304  else
2305  // Max doesn't match expected pattern.
2306  return Cond;
2307 
2308  // Determine the new comparison opcode. It may be signed or unsigned,
2309  // and the original comparison may be either equality or inequality.
2310  if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2311  Pred = CmpInst::getInversePredicate(Pred);
2312 
2313  // Ok, everything looks ok to change the condition into an SLT or SGE and
2314  // delete the max calculation.
2315  ICmpInst *NewCond =
2316  new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
2317 
2318  // Delete the max calculation instructions.
2319  Cond->replaceAllUsesWith(NewCond);
2320  CondUse->setUser(NewCond);
2321  Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
2322  Cond->eraseFromParent();
2323  Sel->eraseFromParent();
2324  if (Cmp->use_empty())
2325  Cmp->eraseFromParent();
2326  return NewCond;
2327 }
2328 
2329 /// Change loop terminating condition to use the postinc iv when possible.
2330 void
2331 LSRInstance::OptimizeLoopTermCond() {
2332  SmallPtrSet<Instruction *, 4> PostIncs;
2333 
2334  // We need a different set of heuristics for rotated and non-rotated loops.
2335  // If a loop is rotated then the latch is also the backedge, so inserting
2336  // post-inc expressions just before the latch is ideal. To reduce live ranges
2337  // it also makes sense to rewrite terminating conditions to use post-inc
2338  // expressions.
2339  //
2340  // If the loop is not rotated then the latch is not a backedge; the latch
2341  // check is done in the loop head. Adding post-inc expressions before the
2342  // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2343  // in the loop body. In this case we do *not* want to use post-inc expressions
2344  // in the latch check, and we want to insert post-inc expressions before
2345  // the backedge.
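  //
  // As a shape-level illustration (added editorially, not from the original
  // comment), the two cases look roughly like:
  //
  //   do { body; } while (++i < n);    // rotated: the latch is the exiting
  //                                    // block, so a post-inc compare is ideal
  //   while (i < n) { body; ++i; }     // head-tested: the compare lives in the
  //                                    // header, not in the latch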
2346  BasicBlock *LatchBlock = L->getLoopLatch();
2347  SmallVector<BasicBlock*, 8> ExitingBlocks;
2348  L->getExitingBlocks(ExitingBlocks);
2349  if (llvm::all_of(ExitingBlocks, [&LatchBlock](const BasicBlock *BB) {
2350  return LatchBlock != BB;
2351  })) {
2352  // The backedge doesn't exit the loop; treat this as a head-tested loop.
2353  IVIncInsertPos = LatchBlock->getTerminator();
2354  return;
2355  }
2356 
2357  // Otherwise treat this as a rotated loop.
2358  for (BasicBlock *ExitingBlock : ExitingBlocks) {
2359  // Get the terminating condition for the loop if possible. If we
2360  // can, we want to change it to use a post-incremented version of its
2361  // induction variable, to allow coalescing the live ranges for the IV into
2362  // one register value.
2363 
2364  BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2365  if (!TermBr)
2366  continue;
2367  // FIXME: Overly conservative, termination condition could be an 'or' etc.
2368  if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2369  continue;
2370 
2371  // Search IVUsesByStride to find Cond's IVUse if there is one.
2372  IVStrideUse *CondUse = nullptr;
2373  ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2374  if (!FindIVUserForCond(Cond, CondUse))
2375  continue;
2376 
2377  // If the trip count is computed in terms of a max (due to ScalarEvolution
2378  // being unable to find a sufficient guard, for example), change the loop
2379  // comparison to use SLT or ULT instead of NE.
2380  // One consequence of doing this now is that it disrupts the count-down
2381  // optimization. That's not always a bad thing though, because in such
2382  // cases it may still be worthwhile to avoid a max.
2383  Cond = OptimizeMax(Cond, CondUse);
2384 
2385  // If this exiting block dominates the latch block, it may also use
2386  // the post-inc value if it won't be shared with other uses.
2387  // Check for dominance.
2388  if (!DT.dominates(ExitingBlock, LatchBlock))
2389  continue;
2390 
2391  // Conservatively avoid trying to use the post-inc value in non-latch
2392  // exits if there may be pre-inc users in intervening blocks.
2393  if (LatchBlock != ExitingBlock)
2394  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
2395  // Test if the use is reachable from the exiting block. This dominator
2396  // query is a conservative approximation of reachability.
2397  if (&*UI != CondUse &&
2398  !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
2399  // Conservatively assume there may be reuse if the quotient of their
2400  // strides could be a legal scale.
2401  const SCEV *A = IU.getStride(*CondUse, L);
2402  const SCEV *B = IU.getStride(*UI, L);
2403  if (!A || !B) continue;
2404  if (SE.getTypeSizeInBits(A->getType()) !=
2405  SE.getTypeSizeInBits(B->getType())) {
2406  if (SE.getTypeSizeInBits(A->getType()) >
2407  SE.getTypeSizeInBits(B->getType()))
2408  B = SE.getSignExtendExpr(B, A->getType());
2409  else
2410  A = SE.getSignExtendExpr(A, B->getType());
2411  }
2412  if (const SCEVConstant *D =
2413  dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2414  const ConstantInt *C = D->getValue();
2415  // Stride of one or negative one can have reuse with non-addresses.
2416  if (C->isOne() || C->isMinusOne())
2417  goto decline_post_inc;
2418  // Avoid weird situations.
2419  if (C->getValue().getMinSignedBits() >= 64 ||
2420  C->getValue().isMinSignedValue())
2421  goto decline_post_inc;
2422  // Check for possible scaled-address reuse.
2423  if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
2424  MemAccessTy AccessTy = getAccessType(
2425  TTI, UI->getUser(), UI->getOperandValToReplace());
2426  int64_t Scale = C->getSExtValue();
2427  if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2428  /*BaseOffset=*/0,
2429  /*HasBaseReg=*/false, Scale,
2430  AccessTy.AddrSpace))
2431  goto decline_post_inc;
2432  Scale = -Scale;
2433  if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2434  /*BaseOffset=*/0,
2435  /*HasBaseReg=*/false, Scale,
2436  AccessTy.AddrSpace))
2437  goto decline_post_inc;
2438  }
2439  }
2440  }
2441 
2442  LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2443  << *Cond << '\n');
2444 
2445  // It's possible for the setcc instruction to be anywhere in the loop, and
2446  // possible for it to have multiple users. If it is not immediately before
2447  // the exiting block branch, move it.
2448  if (&*++BasicBlock::iterator(Cond) != TermBr) {
2449  if (Cond->hasOneUse()) {
2450  Cond->moveBefore(TermBr);
2451  } else {
2452  // Clone the terminating condition and insert into the loopend.
2453  ICmpInst *OldCond = Cond;
2454  Cond = cast<ICmpInst>(Cond->clone());
2455  Cond->setName(L->getHeader()->getName() + ".termcond");
2456  ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond);
2457 
2458  // Clone the IVUse, as the old use still exists!
2459  CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2460  TermBr->replaceUsesOfWith(OldCond, Cond);
2461  }
2462  }
2463 
2464  // If we get to here, we know that we can transform the setcc instruction to
2465  // use the post-incremented version of the IV, allowing us to coalesce the
2466  // live ranges for the IV correctly.
2467  CondUse->transformToPostInc(L);
2468  Changed = true;
2469 
2470  PostIncs.insert(Cond);
2471  decline_post_inc:;
2472  }
2473 
2474  // Determine an insertion point for the loop induction variable increment. It
2475  // must dominate all the post-inc comparisons we just set up, and it must
2476  // dominate the loop latch edge.
2477  IVIncInsertPos = L->getLoopLatch()->getTerminator();
2478  for (Instruction *Inst : PostIncs) {
2479  BasicBlock *BB =
2480  DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
2481  Inst->getParent());
2482  if (BB == Inst->getParent())
2483  IVIncInsertPos = Inst;
2484  else if (BB != IVIncInsertPos->getParent())
2485  IVIncInsertPos = BB->getTerminator();
2486  }
2487 }
2488 
2489 /// Determine if the given use can accommodate a fixup at the given offset and
2490 /// other details. If so, update the use and return true.
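///
/// A worked example added for illustration, assuming a hypothetical target
/// that folds address immediates in [-255, 255]: if this use currently spans
/// [MinOffset = 0, MaxOffset = 64] and a new fixup needs offset 128, the test
/// below asks whether an immediate of 128 - 0 = 128 is still always foldable;
/// if it is, MaxOffset is widened to 128, otherwise the caller falls back to
/// creating a separate use.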
2491 bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
2492  bool HasBaseReg, LSRUse::KindType Kind,
2493  MemAccessTy AccessTy) {
2494  int64_t NewMinOffset = LU.MinOffset;
2495  int64_t NewMaxOffset = LU.MaxOffset;
2496  MemAccessTy NewAccessTy = AccessTy;
2497 
2498  // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2499  // something conservative, however this can pessimize in the case that one of
2500  // the uses will have all its uses outside the loop, for example.
2501  if (LU.Kind != Kind)
2502  return false;
2503 
2504  // Check for a mismatched access type, and fall back conservatively as needed.
2505  // TODO: Be less conservative when the type is similar and can use the same
2506  // addressing modes.
2507  if (Kind == LSRUse::Address) {
2508  if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2509  NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2510  AccessTy.AddrSpace);
2511  }
2512  }
2513 
2514  // Conservatively assume HasBaseReg is true for now.
2515  if (NewOffset < LU.MinOffset) {
2516  if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2517  LU.MaxOffset - NewOffset, HasBaseReg))
2518  return false;
2519  NewMinOffset = NewOffset;
2520  } else if (NewOffset > LU.MaxOffset) {
2521  if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2522  NewOffset - LU.MinOffset, HasBaseReg))
2523  return false;
2524  NewMaxOffset = NewOffset;
2525  }
2526 
2527  // Update the use.
2528  LU.MinOffset = NewMinOffset;
2529  LU.MaxOffset = NewMaxOffset;
2530  LU.AccessTy = NewAccessTy;
2531  return true;
2532 }
2533 
2534 /// Return an LSRUse index and an offset value for a fixup which needs the given
2535 /// expression, with the given kind and optional access type. Either reuse an
2536 /// existing use or create a new one, as needed.
2537 std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
2538  LSRUse::KindType Kind,
2539  MemAccessTy AccessTy) {
2540  const SCEV *Copy = Expr;
2541  int64_t Offset = ExtractImmediate(Expr, SE);
2542 
2543  // Basic uses can't accept any offset, for example.
2544  if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2545  Offset, /*HasBaseReg=*/ true)) {
2546  Expr = Copy;
2547  Offset = 0;
2548  }
2549 
2550  std::pair<UseMapTy::iterator, bool> P =
2551  UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
2552  if (!P.second) {
2553  // A use already existed with this base.
2554  size_t LUIdx = P.first->second;
2555  LSRUse &LU = Uses[LUIdx];
2556  if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2557  // Reuse this use.
2558  return std::make_pair(LUIdx, Offset);
2559  }
2560 
2561  // Create a new use.
2562  size_t LUIdx = Uses.size();
2563  P.first->second = LUIdx;
2564  Uses.push_back(LSRUse(Kind, AccessTy));
2565  LSRUse &LU = Uses[LUIdx];
2566 
2567  LU.MinOffset = Offset;
2568  LU.MaxOffset = Offset;
2569  return std::make_pair(LUIdx, Offset);
2570 }
2571 
2572 /// Delete the given use from the Uses list.
2573 void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2574  if (&LU != &Uses.back())
2575  std::swap(LU, Uses.back());
2576  Uses.pop_back();
2577 
2578  // Update RegUses.
2579  RegUses.swapAndDropUse(LUIdx, Uses.size());
2580 }
2581 
2582 /// Look for a use distinct from OrigLU which has a formula with the same
2583 /// registers as the given formula.
2584 LSRUse *
2585 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2586  const LSRUse &OrigLU) {
2587  // Search all uses for the formula. This could be more clever.
2588  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2589  LSRUse &LU = Uses[LUIdx];
2590  // Check whether this use is close enough to OrigLU, to see whether it's
2591  // worthwhile looking through its formulae.
2592  // Ignore ICmpZero uses because they may contain formulae generated by
2593  // GenerateICmpZeroScales, in which case adding fixup offsets may
2594  // be invalid.
2595  if (&LU != &OrigLU &&
2596  LU.Kind != LSRUse::ICmpZero &&
2597  LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2598  LU.WidestFixupType == OrigLU.WidestFixupType &&
2599  LU.HasFormulaWithSameRegs(OrigF)) {
2600  // Scan through this use's formulae.
2601  for (const Formula &F : LU.Formulae) {
2602  // Check to see if this formula has the same registers and symbols
2603  // as OrigF.
2604  if (F.BaseRegs == OrigF.BaseRegs &&
2605  F.ScaledReg == OrigF.ScaledReg &&
2606  F.BaseGV == OrigF.BaseGV &&
2607  F.Scale == OrigF.Scale &&
2608  F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2609  if (F.BaseOffset == 0)
2610  return &LU;
2611  // This is the formula where all the registers and symbols matched;
2612  // there aren't going to be any others. Since we declined it, we
2613  // can skip the rest of the formulae and proceed to the next LSRUse.
2614  break;
2615  }
2616  }
2617  }
2618  }
2619 
2620  // Nothing looked good.
2621  return nullptr;
2622 }
2623 
2624 void LSRInstance::CollectInterestingTypesAndFactors() {
2625  SmallSetVector<const SCEV *, 4> Strides;
2626 
2627  // Collect interesting types and strides.
2628  SmallVector<const SCEV *, 4> Worklist;
2629  for (const IVStrideUse &U : IU) {
2630  const SCEV *Expr = IU.getExpr(U);
2631 
2632  // Collect interesting types.
2633  Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2634 
2635  // Add strides for mentioned loops.
2636  Worklist.push_back(Expr);
2637  do {
2638  const SCEV *S = Worklist.pop_back_val();
2639  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2640  if (AR->getLoop() == L)
2641  Strides.insert(AR->getStepRecurrence(SE));
2642  Worklist.push_back(AR->getStart());
2643  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2644  Worklist.append(Add->op_begin(), Add->op_end());
2645  }
2646  } while (!Worklist.empty());
2647  }
2648 
2649  // Compute interesting factors from the set of interesting strides.
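  // For instance (an editorial example): strides of 4 and 8 in the same loop
  // contribute the factor 2, because one of the two exact signed divisions
  // (8 / 4) succeeds; the inexact division 4 / 8 yields no factor.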
2650  for (SmallSetVector<const SCEV *, 4>::const_iterator
2651  I = Strides.begin(), E = Strides.end(); I != E; ++I)
2652  for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2653  std::next(I); NewStrideIter != E; ++NewStrideIter) {
2654  const SCEV *OldStride = *I;
2655  const SCEV *NewStride = *NewStrideIter;
2656 
2657  if (SE.getTypeSizeInBits(OldStride->getType()) !=
2658  SE.getTypeSizeInBits(NewStride->getType())) {
2659  if (SE.getTypeSizeInBits(OldStride->getType()) >
2660  SE.getTypeSizeInBits(NewStride->getType()))
2661  NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2662  else
2663  OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2664  }
2665  if (const SCEVConstant *Factor =
2666  dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2667  SE, true))) {
2668  if (Factor->getAPInt().getMinSignedBits() <= 64)
2669  Factors.insert(Factor->getAPInt().getSExtValue());
2670  } else if (const SCEVConstant *Factor =
2671  dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2672  NewStride,
2673  SE, true))) {
2674  if (Factor->getAPInt().getMinSignedBits() <= 64)
2675  Factors.insert(Factor->getAPInt().getSExtValue());
2676  }
2677  }
2678 
2679  // If all uses use the same type, don't bother looking for truncation-based
2680  // reuse.
2681  if (Types.size() == 1)
2682  Types.clear();
2683 
2684  LLVM_DEBUG(print_factors_and_types(dbgs()));
2685 }
2686 
2687 /// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2688 /// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2689 /// IVStrideUses, we could partially skip this.
2690 static User::op_iterator
2691 findIVOperand(User::op_iterator OI, User::op_iterator OE,
2692  Loop *L, ScalarEvolution &SE) {
2693  for(; OI != OE; ++OI) {
2694  if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2695  if (!SE.isSCEVable(Oper->getType()))
2696  continue;
2697 
2698  if (const SCEVAddRecExpr *AR =
2699  dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2700  if (AR->getLoop() == L)
2701  break;
2702  }
2703  }
2704  }
2705  return OI;
2706 }
2707 
2708 /// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2709 /// a convenient helper.
2710 static Value *getWideOperand(Value *Oper) {
2711  if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2712  return Trunc->getOperand(0);
2713  return Oper;
2714 }
2715 
2716 /// Return true if we allow an IV chain to include both types.
2717 static bool isCompatibleIVType(Value *LVal, Value *RVal) {
2718  Type *LType = LVal->getType();
2719  Type *RType = RVal->getType();
2720  return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() &&
2721  // Different address spaces means (possibly)
2722  // different types of the pointer implementation,
2723  // e.g. i16 vs i32 so disallow that.
2724  (LType->getPointerAddressSpace() ==
2725  RType->getPointerAddressSpace()));
2726 }
2727 
2728 /// Return an approximation of this SCEV expression's "base", or NULL for any
2729 /// constant. Returning the expression itself is conservative. Returning a
2730 /// deeper subexpression is more precise and valid as long as it isn't less
2731 /// complex than another subexpression. For expressions involving multiple
2732 /// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2733 /// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2734 /// IVInc==b-a.
2735 ///
2736 /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2737 /// SCEVUnknown, we simply return the rightmost SCEV operand.
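///
/// For example (an editorial illustration): for {(4 + %p),+,8}<%L> the addrec
/// start is followed and the constant is skipped, yielding the SCEVUnknown %p;
/// for ((4 * %n) + %p) the scaled operand is skipped and %p is returned.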
2738 static const SCEV *getExprBase(const SCEV *S) {
2739  switch (S->getSCEVType()) {
2740  default: // including scUnknown.
2741  return S;
2742  case scConstant:
2743  return nullptr;
2744  case scTruncate:
2745  return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
2746  case scZeroExtend:
2747  return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
2748  case scSignExtend:
2749  return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
2750  case scAddExpr: {
2751  // Skip over scaled operands (scMulExpr) to follow add operands as long as
2752  // there's nothing more complex.
2753  // FIXME: not sure if we want to recognize negation.
2754  const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
2755  for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
2756  E(Add->op_begin()); I != E; ++I) {
2757  const SCEV *SubExpr = *I;
2758  if (SubExpr->getSCEVType() == scAddExpr)
2759  return getExprBase(SubExpr);
2760 
2761  if (SubExpr->getSCEVType() != scMulExpr)
2762  return SubExpr;
2763  }
2764  return S; // all operands are scaled, be conservative.
2765  }
2766  case scAddRecExpr:
2767  return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
2768  }
2769 }
2770 
2771 /// Return true if the chain increment is profitable to expand into a loop
2772 /// invariant value, which may require its own register. A profitable chain
2773 /// increment will be an offset relative to the same base. We allow such offsets
2774 /// to potentially be used as chain increment as long as it's not obviously
2775 /// expensive to expand using real instructions.
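///
/// For example (an editorial note): chaining p[i] and p[i + m] with a
/// loop-invariant m requires materializing the increment m * sizeof(*p) in
/// the preheader, which is accepted unless that expansion is judged high
/// cost; a candidate at a constant offset from the chain head is rejected
/// when the proposed increment is not itself a constant.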
2776 bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
2777  const SCEV *IncExpr,
2778  ScalarEvolution &SE) {
2779  // Aggressively form chains when -stress-ivchain.
2780  if (StressIVChain)
2781  return true;
2782 
2783  // Do not replace a constant offset from IV head with a nonconstant IV
2784  // increment.
2785  if (!isa<SCEVConstant>(IncExpr)) {
2786  const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
2787  if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
2788  return false;
2789  }
2790 
2791  SmallPtrSet<const SCEV*, 8> Processed;
2792  return !isHighCostExpansion(IncExpr, Processed, SE);
2793 }
2794 
2795 /// Return true if the number of registers needed for the chain is estimated to
2796 /// be less than the number required for the individual IV users. First prohibit
2797 /// any IV users that keep the IV live across increments (the Users set should
2798 /// be empty). Next count the number and type of increments in the chain.
2799 ///
2800 /// Chaining IVs can lead to considerable code bloat if ISEL doesn't
2801 /// effectively use postinc addressing modes. Only consider it profitable if the
2802 /// increments can be computed in fewer registers when chained.
2803 ///
2804 /// TODO: Consider IVInc free if it's already used in other chains.
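///
/// A worked example of the cost accounting below (added for illustration):
/// a chain whose links all increment by the same constant starts at cost 1,
/// gets -1 for having more than one constant increment, and a further -1 if
/// the chain is closed by the loop-header phi; only then does the cost drop
/// below zero and the chain count as profitable. Each non-reused variable
/// increment adds 1, so such chains must reuse increments to win.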
2805 static bool
2806 isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
2807  ScalarEvolution &SE, const TargetTransformInfo &TTI) {
2808  if (StressIVChain)
2809  return true;
2810 
2811  if (!Chain.hasIncs())
2812  return false;
2813 
2814  if (!Users.empty()) {
2815  LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
2816  for (Instruction *Inst
2817  : Users) { dbgs() << " " << *Inst << "\n"; });
2818  return false;
2819  }
2820  assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
2821 
2822  // The chain itself may require a register, so initialize cost to 1.
2823  int cost = 1;
2824 
2825  // A complete chain likely eliminates the need for keeping the original IV in
2826  // a register. LSR does not currently know how to form a complete chain unless
2827  // the header phi already exists.
2828  if (isa<PHINode>(Chain.tailUserInst())
2829  && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
2830  --cost;
2831  }
2832  const SCEV *LastIncExpr = nullptr;
2833  unsigned NumConstIncrements = 0;
2834  unsigned NumVarIncrements = 0;
2835  unsigned NumReusedIncrements = 0;
2836  for (const IVInc &Inc : Chain) {
2837  if (Inc.IncExpr->isZero())
2838  continue;
2839 
2840  // Incrementing by zero or some constant is neutral. We assume constants can
2841  // be folded into an addressing mode or an add's immediate operand.
2842  if (isa<SCEVConstant>(Inc.IncExpr)) {
2843  ++NumConstIncrements;
2844  continue;
2845  }
2846 
2847  if (Inc.IncExpr == LastIncExpr)
2848  ++NumReusedIncrements;
2849  else
2850  ++NumVarIncrements;
2851 
2852  LastIncExpr = Inc.IncExpr;
2853  }
2854  // An IV chain with a single increment is handled by LSR's postinc
2855  // uses. However, a chain with multiple increments requires keeping the IV's
2856  // value live longer than it needs to be if chained.
2857  if (NumConstIncrements > 1)
2858  --cost;
2859 
2860  // Materializing increment expressions in the preheader that didn't exist in
2861  // the original code may cost a register. For example, sign-extended array
2862  // indices can produce ridiculous increments like this:
2863  // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
2864  cost += NumVarIncrements;
2865 
2866  // Reusing variable increments likely saves a register to hold the multiple of
2867  // the stride.
2868  cost -= NumReusedIncrements;
2869 
2870  LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
2871  << "\n");
2872 
2873  return cost < 0;
2874 }
2875 
2876 /// Add this IV user to an existing chain or make it the head of a new chain.
2877 void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2878  SmallVectorImpl<ChainUsers> &ChainUsersVec) {
2879  // When IVs are used as types of varying widths, they are generally converted
2880  // to a wider type with some uses remaining narrow under a (free) trunc.
2881  Value *const NextIV = getWideOperand(IVOper);
2882  const SCEV *const OperExpr = SE.getSCEV(NextIV);
2883  const SCEV *const OperExprBase = getExprBase(OperExpr);
2884 
2885  // Visit all existing chains. Check if its IVOper can be computed as a
2886  // profitable loop invariant increment from the last link in the Chain.
2887  unsigned ChainIdx = 0, NChains = IVChainVec.size();
2888  const SCEV *LastIncExpr = nullptr;
2889  for (; ChainIdx < NChains; ++ChainIdx) {
2890  IVChain &Chain = IVChainVec[ChainIdx];
2891 
2892  // Prune the solution space aggressively by checking that both IV operands
2893  // are expressions that operate on the same unscaled SCEVUnknown. This
2894  // "base" will be canceled by the subsequent getMinusSCEV call. Checking
2895  // first avoids creating extra SCEV expressions.
2896  if (!StressIVChain && Chain.ExprBase != OperExprBase)
2897  continue;
2898 
2899  Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
2900  if (!isCompatibleIVType(PrevIV, NextIV))
2901  continue;
2902 
2903  // A phi node terminates a chain.
2904  if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
2905  continue;
2906 
2907  // The increment must be loop-invariant so it can be kept in a register.
2908  const SCEV *PrevExpr = SE.getSCEV(PrevIV);
2909  const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
2910  if (!SE.isLoopInvariant(IncExpr, L))
2911  continue;
2912 
2913  if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
2914  LastIncExpr = IncExpr;
2915  break;
2916  }
2917  }
2918  // If we haven't found a chain, create a new one, unless we hit the max. Don't
2919  // bother for phi nodes, because they must be last in the chain.
2920  if (ChainIdx == NChains) {
2921  if (isa<PHINode>(UserInst))
2922  return;
2923  if (NChains >= MaxChains && !StressIVChain) {
2924  LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
2925  return;
2926  }
2927  LastIncExpr = OperExpr;
2928  // IVUsers may have skipped over sign/zero extensions. We don't currently
2929  // attempt to form chains involving extensions unless they can be hoisted
2930  // into this loop's AddRec.
2931  if (!isa<SCEVAddRecExpr>(LastIncExpr))
2932  return;
2933  ++NChains;
2934  IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
2935  OperExprBase));
2936  ChainUsersVec.resize(NChains);
2937  LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
2938  << ") IV=" << *LastIncExpr << "\n");
2939  } else {
2940  LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
2941  << ") IV+" << *LastIncExpr << "\n");
2942  // Add this IV user to the end of the chain.
2943  IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
2944  }
2945  IVChain &Chain = IVChainVec[ChainIdx];
2946 
2947  SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
2948  // This chain's NearUsers become FarUsers.
2949  if (!LastIncExpr->isZero()) {
2950  ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
2951  NearUsers.end());
2952  NearUsers.clear();
2953  }
2954 
2955  // All other uses of IVOperand become near uses of the chain.
2956  // We currently ignore intermediate values within SCEV expressions, assuming
2957  // they will eventually be used by the current chain, or can be computed
2958  // from one of the chain increments. To be more precise we could
2959  // transitively follow its users and only add leaf IV users to the set.
2960  for (User *U : IVOper->users()) {
2961  Instruction *OtherUse = dyn_cast<Instruction>(U);
2962  if (!OtherUse)
2963  continue;
2964  // Uses in the chain will no longer be uses if the chain is formed.
2965  // Include the head of the chain in this iteration (not Chain.begin()).
2966  IVChain::const_iterator IncIter = Chain.Incs.begin();
2967  IVChain::const_iterator IncEnd = Chain.Incs.end();
2968  for( ; IncIter != IncEnd; ++IncIter) {
2969  if (IncIter->UserInst == OtherUse)
2970  break;
2971  }
2972  if (IncIter != IncEnd)
2973  continue;
2974 
2975  if (SE.isSCEVable(OtherUse->getType())
2976  && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
2977  && IU.isIVUserOrOperand(OtherUse)) {
2978  continue;
2979  }
2980  NearUsers.insert(OtherUse);
2981  }
2982 
2983  // Since this user is part of the chain, it's no longer considered a use
2984  // of the chain.
2985  ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
2986 }
2987 
2988 /// Populate the vector of Chains.
2989 ///
2990 /// This decreases ILP at the architecture level. Targets with ample registers,
2991 /// multiple memory ports, and no register renaming probably don't want
2992 /// this. However, such targets should probably disable LSR altogether.
2993 ///
2994 /// The job of LSR is to make a reasonable choice of induction variables across
2995 /// the loop. Subsequent passes can easily "unchain" computation exposing more
2996 /// ILP *within the loop* if the target wants it.
2997 ///
2998 /// Finding the best IV chain is potentially a scheduling problem. Since LSR
2999 /// will not reorder memory operations, it will recognize this as a chain, but
3000 /// will generate redundant IV increments. Ideally this would be corrected later
3001 /// by a smart scheduler:
3002 /// = A[i]
3003 /// = A[i+x]
3004 /// A[i] =
3005 /// A[i+x] =
3006 ///
3007 /// TODO: Walk the entire domtree within this loop, not just the path to the
3008 /// loop latch. This will discover chains on side paths, but requires
3009 /// maintaining multiple copies of the Chains state.
3010 void LSRInstance::CollectChains() {
3011  LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3012  SmallVector<ChainUsers, 8> ChainUsersVec;
3013 
3014  SmallVector<BasicBlock *,8> LatchPath;
3015  BasicBlock *LoopHeader = L->getHeader();
3016  for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3017  Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3018  LatchPath.push_back(Rung->getBlock());
3019  }
3020  LatchPath.push_back(LoopHeader);
3021 
3022  // Walk the instruction stream from the loop header to the loop latch.
3023  for (BasicBlock *BB : reverse(LatchPath)) {
3024  for (Instruction &I : *BB) {
3025  // Skip instructions that weren't seen by IVUsers analysis.
3026  if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3027  continue;
3028 
3029  // Ignore users that are part of a SCEV expression. This way we only
3030  // consider leaf IV Users. This effectively rediscovers a portion of
3031  // IVUsers analysis but in program order this time.
3032  if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3033  continue;
3034 
3035  // Remove this instruction from any NearUsers set it may be in.
3036  for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3037  ChainIdx < NChains; ++ChainIdx) {
3038  ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3039  }
3040  // Search for operands that can be chained.
3041  SmallPtrSet<Instruction*, 4> UniqueOperands;
3042  User::op_iterator IVOpEnd = I.op_end();
3043  User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3044  while (IVOpIter != IVOpEnd) {
3045  Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3046  if (UniqueOperands.insert(IVOpInst).second)
3047  ChainInstruction(&I, IVOpInst, ChainUsersVec);
3048  IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3049  }
3050  } // Continue walking down the instructions.
3051  } // Continue walking down the domtree.
3052  // Visit phi backedges to determine if the chain can generate the IV postinc.
3053  for (PHINode &PN : L->getHeader()->phis()) {
3054  if (!SE.isSCEVable(PN.getType()))
3055  continue;
3056 
3057  Instruction *IncV =
3058  dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3059  if (IncV)
3060  ChainInstruction(&PN, IncV, ChainUsersVec);
3061  }
3062  // Remove any unprofitable chains.
3063  unsigned ChainIdx = 0;
3064  for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3065  UsersIdx < NChains; ++UsersIdx) {
3066  if (!isProfitableChain(IVChainVec[UsersIdx],
3067  ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3068  continue;
3069  // Preserve the chain at UsersIdx.
3070  if (ChainIdx != UsersIdx)
3071  IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3072  FinalizeChain(IVChainVec[ChainIdx]);
3073  ++ChainIdx;
3074  }
3075  IVChainVec.resize(ChainIdx);
3076 }
3077 
3078 void LSRInstance::FinalizeChain(IVChain &Chain) {
3079  assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3080  LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3081 
3082  for (const IVInc &Inc : Chain) {
3083  LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3084  auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3085  assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3086  IVIncSet.insert(UseI);
3087  }
3088 }
3089 
3090 /// Return true if the IVInc can be folded into an addressing mode.
3091 static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3092  Value *Operand, const TargetTransformInfo &TTI) {
3093  const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3094  if (!IncConst || !isAddressUse(TTI, UserInst, Operand))
3095  return false;
3096 
3097  if (IncConst->getAPInt().getMinSignedBits() > 64)
3098  return false;
3099 
3100  MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3101  int64_t IncOffset = IncConst->getValue()->getSExtValue();
3102  if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3103  IncOffset, /*HasBaseReg=*/false))
3104  return false;
3105 
3106  return true;
3107 }
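
// Editorial note (not part of the upstream file): "foldable" here means the
// constant increment can live inside the user's addressing mode. For example,
// on a hypothetical target with reg+imm addressing, an 8-byte step such as
//   %p.next = getelementptr i8, i8* %p, i64 8
// can usually be absorbed as a [%p + 8] memory operand, so the chain does not
// need to materialize a fresh IV value for that user.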
3108 
3109 /// Generate an add or subtract for each IVInc in a chain to materialize the IV
3110 /// user's operand from the previous IV user's operand.
3111 void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
3112  SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3113  // Find the new IVOperand for the head of the chain. It may have been replaced
3114  // by LSR.
3115  const IVInc &Head = Chain.Incs[0];
3116  User::op_iterator IVOpEnd = Head.UserInst->op_end();
3117  // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3118  User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3119  IVOpEnd, L, SE);
3120  Value *IVSrc = nullptr;
3121  while (IVOpIter != IVOpEnd) {
3122  IVSrc = getWideOperand(*IVOpIter);
3123 
3124  // If this operand computes the expression that the chain needs, we may use
3125  // it. (Check this after setting IVSrc which is used below.)
3126  //
3127  // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3128  // narrow for the chain, so we can no longer use it. We do allow using a
3129  // wider phi, assuming the LSR checked for free truncation. In that case we
3130  // should already have a truncate on this operand such that
3131  // getSCEV(IVSrc) == IncExpr.
3132  if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3133  || SE.getSCEV(IVSrc) == Head.IncExpr) {
3134  break;
3135  }
3136  IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3137  }
3138  if (IVOpIter == IVOpEnd) {
3139  // Gracefully give up on this chain.
3140  LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3141  return;
3142  }
3143 
3144  LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3145  Type *IVTy = IVSrc->getType();
3146  Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3147  const SCEV *LeftOverExpr = nullptr;
3148  for (const IVInc &Inc : Chain) {
3149  Instruction *InsertPt = Inc.UserInst;
3150  if (isa<PHINode>(InsertPt))
3151  InsertPt = L->getLoopLatch()->getTerminator();
3152 
3153  // IVOper will replace the current IV User's operand. IVSrc is the IV
3154  // value currently held in a register.
3155  Value *IVOper = IVSrc;
3156  if (!Inc.IncExpr->isZero()) {
3157  // IncExpr was the result of subtraction of two narrow values, so must
3158  // be signed.
3159  const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3160  LeftOverExpr = LeftOverExpr ?
3161  SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3162  }
3163  if (LeftOverExpr && !LeftOverExpr->isZero()) {
3164  // Expand the IV increment.
3165  Rewriter.clearPostInc();
3166  Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3167  const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3168  SE.getUnknown(IncV));
3169  IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3170 
3171  // If an IV increment can't be folded, use it as the next IV value.
3172  if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3173  assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3174  IVSrc = IVOper;
3175  LeftOverExpr = nullptr;
3176  }
3177  }
3178  Type *OperTy = Inc.IVOperand->getType();
3179  if (IVTy != OperTy) {
3180  assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3181  "cannot extend a chained IV");
3182  IRBuilder<> Builder(InsertPt);
3183  IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3184  }
3185  Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3186  DeadInsts.emplace_back(Inc.IVOperand);
3187  }
3188  // If LSR created a new, wider phi, we may also replace its postinc. We only
3189  // do this if we also found a wide value for the head of the chain.
3190  if (isa<PHINode>(Chain.tailUserInst())) {
3191  for (PHINode &Phi : L->getHeader()->phis()) {
3192  if (!isCompatibleIVType(&Phi, IVSrc))
3193  continue;
3194  Instruction *PostIncV = dyn_cast<Instruction>(
3195  Phi.getIncomingValueForBlock(L->getLoopLatch()));
3196  if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3197  continue;
3198  Value *IVOper = IVSrc;
3199  Type *PostIncTy = PostIncV->getType();
3200  if (IVTy != PostIncTy) {
3201  assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3202  IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3203  Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3204  IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3205  }
3206  Phi.replaceUsesOfWith(PostIncV, IVOper);
3207  DeadInsts.emplace_back(PostIncV);
3208  }
3209  }
3210 }
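
// Editorial sketch (not from the upstream file): when the per-user increments
// cannot be folded into the users' addressing modes, the rewritten body looks
// roughly like
//   %iv1 = add i64 %iv0, %step   ; operand for the second user in the chain
//   %iv2 = add i64 %iv1, %step   ; operand for the third user
// i.e. each user's operand is materialized from the previous user's operand,
// which is the point of keeping the users grouped as a chain.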
3211 
3212 void LSRInstance::CollectFixupsAndInitialFormulae() {
3213  for (const IVStrideUse &U : IU) {
3214  Instruction *UserInst = U.getUser();
3215  // Skip IV users that are part of profitable IV Chains.
3216  User::op_iterator UseI =
3217  find(UserInst->operands(), U.getOperandValToReplace());
3218  assert(UseI != UserInst->op_end() && "cannot find IV operand");
3219  if (IVIncSet.count(UseI)) {
3220  LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3221  continue;
3222  }
3223 
3224  LSRUse::KindType Kind = LSRUse::Basic;
3225  MemAccessTy AccessTy;
3226  if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3227  Kind = LSRUse::Address;
3228  AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3229  }
3230 
3231  const SCEV *S = IU.getExpr(U);
3232  PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3233 
3234  // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3235  // (N - i == 0), and this allows (N - i) to be the expression that we work
3236  // with rather than just N or i, so we can consider the register
3237  // requirements for both N and i at the same time. Limiting this code to
3238  // equality icmps is not a problem because all interesting loops use
3239  // equality icmps, thanks to IndVarSimplify.
3240  if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst))
3241  if (CI->isEquality()) {
3242  // Swap the operands if needed to put the OperandValToReplace on the
3243  // left, for consistency.
3244  Value *NV = CI->getOperand(1);
3245  if (NV == U.getOperandValToReplace()) {
3246  CI->setOperand(1, CI->getOperand(0));
3247  CI->setOperand(0, NV);
3248  NV = CI->getOperand(1);
3249  Changed = true;
3250  }
3251 
3252  // x == y --> x - y == 0
3253  const SCEV *N = SE.getSCEV(NV);
3254  if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
3255  // S is normalized, so normalize N before folding it into S
3256  // to keep the result normalized.
3257  N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3258  Kind = LSRUse::ICmpZero;
3259  S = SE.getMinusSCEV(N, S);
3260  }
3261 
3262  // -1 and the negations of all interesting strides (except the negation
3263  // of -1) are now also interesting.
3264  for (size_t i = 0, e = Factors.size(); i != e; ++i)
3265  if (Factors[i] != -1)
3266  Factors.insert(-(uint64_t)Factors[i]);
3267  Factors.insert(-1);
3268  }
3269 
3270  // Get or create an LSRUse.
3271  std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
3272  size_t LUIdx = P.first;
3273  int64_t Offset = P.second;
3274  LSRUse &LU = Uses[LUIdx];
3275 
3276  // Record the fixup.
3277  LSRFixup &LF = LU.getNewFixup();
3278  LF.UserInst = UserInst;
3279  LF.OperandValToReplace = U.getOperandValToReplace();
3280  LF.PostIncLoops = TmpPostIncLoops;
3281  LF.Offset = Offset;
3282  LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3283 
3284  if (!LU.WidestFixupType ||
3285  SE.getTypeSizeInBits(LU.WidestFixupType) <
3286  SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3287  LU.WidestFixupType = LF.OperandValToReplace->getType();
3288 
3289  // If this is the first use of this LSRUse, give it a formula.
3290  if (LU.Formulae.empty()) {
3291  InsertInitialFormula(S, LU, LUIdx);
3292  CountRegisters(LU.Formulae.back(), LUIdx);
3293  }
3294  }
3295 
3296  LLVM_DEBUG(print_fixups(dbgs()));
3297 }
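
// Editorial example (assumed IR, not from the upstream file) of the ICmpZero
// rewrite described above: for a latch condition
//   %cond = icmp eq i64 %i.next, %n
// the fixup is recorded with Kind ICmpZero and the expression (%n - SCEV of
// the replaced operand), so the solver weighs the register requirements of %n
// and the induction variable together rather than separately.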
3298 
3299 /// Insert a formula for the given expression into the given use, separating out
3300 /// loop-variant portions from loop-invariant and loop-computable portions.
3301 void
3302 LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
3303  // Mark uses whose expressions cannot be expanded.
3304  if (!isSafeToExpand(S, SE))
3305  LU.RigidFormula = true;
3306 
3307  Formula F;
3308  F.initialMatch(S, L, SE);
3309  bool Inserted = InsertFormula(LU, LUIdx, F);
3310  assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3311 }
3312 
3313 /// Insert a simple single-register formula for the given expression into the
3314 /// given use.
3315 void
3316 LSRInstance::InsertSupplementalFormula(const SCEV *S,
3317  LSRUse &LU, size_t LUIdx) {
3318  Formula F;
3319  F.BaseRegs.push_back(S);
3320  F.HasBaseReg = true;
3321  bool Inserted = InsertFormula(LU, LUIdx, F);
3322  assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3323 }
3324 
3325 /// Note which registers are used by the given formula, updating RegUses.
3326 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3327  if (F.ScaledReg)
3328  RegUses.countRegister(F.ScaledReg, LUIdx);
3329  for (const SCEV *BaseReg : F.BaseRegs)
3330  RegUses.countRegister(BaseReg, LUIdx);
3331 }
3332 
3333 /// If the given formula has not yet been inserted, add it to the list, and
3334 /// return true. Return false otherwise.
3335 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3336  // Do not insert formula that we will not be able to expand.
3337  assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3338  "Formula is illegal");
3339 
3340  if (!LU.InsertFormula(F, *L))
3341  return false;
3342 
3343  CountRegisters(F, LUIdx);
3344  return true;
3345 }
3346 
3347 /// Check for other uses of loop-invariant values which we're tracking. These
3348 /// other uses will pin these values in registers, making them less profitable
3349 /// for elimination.
3350 /// TODO: This currently misses non-constant addrec step registers.
3351 /// TODO: Should this give more weight to users inside the loop?
3352 void
3353 LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3354  SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3355  SmallPtrSet<const SCEV *, 32> Visited;
3356 
3357  while (!Worklist.empty()) {
3358  const SCEV *S = Worklist.pop_back_val();
3359 
3360  // Don't process the same SCEV twice
3361  if (!Visited.insert(S).second)
3362  continue;
3363 
3364  if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3365  Worklist.append(N->op_begin(), N->op_end());
3366  else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
3367  Worklist.push_back(C->getOperand());
3368  else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3369  Worklist.push_back(D->getLHS());
3370  Worklist.push_back(D->getRHS());
3371  } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3372  const Value *V = US->getValue();
3373  if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3374  // Look for instructions defined outside the loop.
3375  if (L->contains(Inst)) continue;
3376  } else if (isa<UndefValue>(V))
3377  // Undef doesn't have a live range, so it doesn't matter.
3378  continue;
3379  for (const Use &U : V->uses()) {
3380  const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3381  // Ignore non-instructions.
3382  if (!UserInst)
3383  continue;
3384  // Ignore instructions in other functions (as can happen with
3385  // Constants).
3386  if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3387  continue;
3388  // Ignore instructions not dominated by the loop.
3389  const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3390  UserInst->getParent() :
3391  cast<PHINode>(UserInst)->getIncomingBlock(
3392  PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3393  if (!DT.dominates(L->getHeader(), UseBB))
3394  continue;
3395  // Don't bother if the instruction is in a BB which ends in an EHPad.
3396  if (UseBB->getTerminator()->isEHPad())
3397  continue;
3398  // Don't bother rewriting PHIs in catchswitch blocks.
3399  if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3400  continue;
3401  // Ignore uses which are part of other SCEV expressions, to avoid
3402  // analyzing them multiple times.
3403  if (SE.isSCEVable(UserInst->getType())) {
3404  const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3405  // If the user is a no-op, look through to its uses.
3406  if (!isa<SCEVUnknown>(UserS))
3407  continue;
3408  if (UserS == US) {
3409  Worklist.push_back(
3410  SE.getUnknown(const_cast<Instruction *>(UserInst)));
3411  continue;
3412  }
3413  }
3414  // Ignore icmp instructions which are already being analyzed.
3415  if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3416  unsigned OtherIdx = !U.getOperandNo();
3417  Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
3418  if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3419  continue;
3420  }
3421 
3422  std::pair<size_t, int64_t> P = getUse(
3423  S, LSRUse::Basic, MemAccessTy());
3424  size_t LUIdx = P.first;
3425  int64_t Offset = P.second;
3426  LSRUse &LU = Uses[LUIdx];
3427  LSRFixup &LF = LU.getNewFixup();
3428  LF.UserInst = const_cast<Instruction *>(UserInst);
3429  LF.OperandValToReplace = U;
3430  LF.Offset = Offset;
3431  LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3432  if (!LU.WidestFixupType ||
3433  SE.getTypeSizeInBits(LU.WidestFixupType) <
3434  SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3435  LU.WidestFixupType = LF.OperandValToReplace->getType();
3436  InsertSupplementalFormula(US, LU, LUIdx);
3437  CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3438  break;
3439  }
3440  }
3441  }
3442 }
3443 
3444 /// Split S into subexpressions which can be pulled out into separate
3445 /// registers. If C is non-null, multiply each subexpression by C.
3446 ///
3447 /// Return remainder expression after factoring the subexpressions captured by
3448 /// Ops. If Ops is complete, return NULL.
3449 static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3450  SmallVectorImpl<const SCEV *> &Ops,
3451  const Loop *L,
3452  ScalarEvolution &SE,
3453  unsigned Depth = 0) {
3454  // Arbitrarily cap recursion to protect compile time.
3455  if (Depth >= 3)
3456  return S;
3457 
3458  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3459  // Break out add operands.
3460  for (const SCEV *S : Add->operands()) {
3461  const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3462  if (Remainder)
3463  Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3464  }
3465  return nullptr;
3466  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
3467  // Split a non-zero base out of an addrec.
3468  if (AR->getStart()->isZero() || !AR->isAffine())
3469  return S;
3470 
3471  const SCEV *Remainder = CollectSubexprs(AR->getStart(),
3472  C, Ops, L, SE, Depth+1);
3473  // Split the non-zero AddRec unless it is part of a nested recurrence that
3474  // does not pertain to this loop.
3475  if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
3476  Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3477  Remainder = nullptr;
3478  }
3479  if (Remainder != AR->getStart()) {
3480  if (!Remainder)
3481  Remainder = SE.getConstant(AR->getType(), 0);
3482  return SE.getAddRecExpr(Remainder,
3483  AR->getStepRecurrence(SE),
3484  AR->getLoop(),
3485  //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3486  SCEV::FlagAnyWrap);
3487  }
3488  } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
3489  // Break (C * (a + b + c)) into C*a + C*b + C*c.
3490  if (Mul->getNumOperands() != 2)
3491  return S;
3492  if (const SCEVConstant *Op0 =
3493  dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
3494  C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3495  const SCEV *Remainder =
3496  CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
3497  if (Remainder)
3498  Ops.push_back(SE.getMulExpr(C, Remainder));
3499  return nullptr;
3500  }
3501  }
3502  return S;
3503 }
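
// Editorial worked example (hedged): calling CollectSubexprs on
// {(%a + 4),+,8}<%L> pushes %a and 4 (the operands of the non-zero start) onto
// Ops and returns the residual addrec {0,+,8}<%L>; the caller can then decide
// to keep each piece in its own register or fold the constant away.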
3504 
3505 /// Return true if the SCEV represents a value that may end up as a
3506 /// post-increment operation.
3507 static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
3508  LSRUse &LU, const SCEV *S, const Loop *L,
3509  ScalarEvolution &SE) {
3510  if (LU.Kind != LSRUse::Address ||
3511  !LU.AccessTy.getType()->isIntOrIntVectorTy())
3512  return false;
3513  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
3514  if (!AR)
3515  return false;
3516  const SCEV *LoopStep = AR->getStepRecurrence(SE);
3517  if (!isa<SCEVConstant>(LoopStep))
3518  return false;
3519  if (LU.AccessTy.getType()->getScalarSizeInBits() !=
3520  LoopStep->getType()->getScalarSizeInBits())
3521  return false;
3522  // Check if a post-indexed load/store can be used.
3523  if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
3524  TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
3525  const SCEV *LoopStart = AR->getStart();
3526  if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
3527  return true;
3528  }
3529  return false;
3530 }
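
// Editorial note: "post-increment" here refers to post-indexed memory access,
// e.g. AArch64's "ldr w0, [x1], #4", which loads through x1 and then advances
// x1 by 4 in one instruction. When the target reports such forms as legal,
// breaking the addrec base apart (see GenerateReassociationsImpl below) would
// only hide that opportunity, which is why callers check this predicate first.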
3531 
3532 /// Helper function for LSRInstance::GenerateReassociations.
3533 void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3534  const Formula &Base,
3535  unsigned Depth, size_t Idx,
3536  bool IsScaledReg) {
3537  const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3538  // Don't generate reassociations for the base register of a value that
3539  // may generate a post-increment operator. The reason is that the
3540  // reassociations cause extra base+register formula to be created,
3541  // and possibly chosen, but the post-increment is more efficient.
3542  if (TTI.shouldFavorPostInc() && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3543  return;
3544  SmallVector<const SCEV *, 8> AddOps;
3545  const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3546  if (Remainder)
3547  AddOps.push_back(Remainder);
3548 
3549  if (AddOps.size() == 1)
3550  return;
3551 
3552  for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3553  JE = AddOps.end();
3554  J != JE; ++J) {
3555  // Loop-variant "unknown" values are uninteresting; we won't be able to
3556  // do anything meaningful with them.
3557  if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3558  continue;
3559 
3560  // Don't pull a constant into a register if the constant could be folded
3561  // into an immediate field.
3562  if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3563  LU.AccessTy, *J, Base.getNumRegs() > 1))
3564  continue;
3565 
3566  // Collect all operands except *J.
3567  SmallVector<const SCEV *, 8> InnerAddOps(
3568  ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
3569  InnerAddOps.append(std::next(J),
3570  ((const SmallVector<const SCEV *, 8> &)AddOps).end());
3571 
3572  // Don't leave just a constant behind in a register if the constant could
3573  // be folded into an immediate field.
3574  if (InnerAddOps.size() == 1 &&
3575  isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3576  LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3577  continue;
3578 
3579  const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3580  if (InnerSum->isZero())
3581  continue;
3582  Formula F = Base;
3583 
3584  // Add the remaining pieces of the add back into the new formula.
3585  const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3586  if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3587  TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3588  InnerSumSC->getValue()->getZExtValue())) {
3589  F.UnfoldedOffset =
3590  (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
3591  if (IsScaledReg)
3592  F.ScaledReg = nullptr;
3593  else
3594  F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3595  } else if (IsScaledReg)
3596  F.ScaledReg = InnerSum;
3597  else
3598  F.BaseRegs[Idx] = InnerSum;
3599 
3600  // Add J as its own register, or an unfolded immediate.
3601  const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3602  if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3603  TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3604  SC->getValue()->getZExtValue()))
3605  F.UnfoldedOffset =
3606  (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
3607  else
3608  F.BaseRegs.push_back(*J);
3609  // We may have changed the number of register in base regs, adjust the
3610  // formula accordingly.
3611  F.canonicalize(*L);
3612 
3613  if (InsertFormula(LU, LUIdx, F))
3614  // If that formula hadn't been seen before, recurse to find more like
3615  // it.
3616  // Add a check on Log16(AddOps.size()), the same as Log2_32(AddOps.size()) >> 2,
3617  // because Depth alone is not enough to bound compile time.
3618  // This means that every time AddOps.size() is greater than 16^x we will add
3619  // x to Depth.
3620  GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
3621  Depth + 1 + (Log2_32(AddOps.size()) >> 2));
3622  }
3623 }
3624 
3625 /// Split out subexpressions from adds and the bases of addrecs.
3626 void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
3627  Formula Base, unsigned Depth) {
3628  assert(Base.isCanonical(*L) && "Input must be in the canonical form");
3629  // Arbitrarily cap recursion to protect compile time.
3630  if (Depth >= 3)
3631  return;
3632 
3633  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3634  GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
3635 
3636  if (Base.Scale == 1)
3637  GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
3638  /* Idx */ -1, /* IsScaledReg */ true);
3639 }
3640 
3641 /// Generate a formula consisting of all of the loop-dominating registers added
3642 /// into a single register.
3643 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
3644  Formula Base) {
3645  // This method is only interesting on a plurality of registers.
3646  if (Base.BaseRegs.size() + (Base.Scale == 1) +
3647  (Base.UnfoldedOffset != 0) <= 1)
3648  return;
3649 
3650  // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
3651  // processing the formula.
3652  Base.unscale();
3653  SmallVector<const SCEV *, 4> Ops;
3654  Formula NewBase = Base;
3655  NewBase.BaseRegs.clear();
3656  Type *CombinedIntegerType = nullptr;
3657  for (const SCEV *BaseReg : Base.BaseRegs) {
3658  if (SE.properlyDominates(BaseReg, L->getHeader()) &&
3659  !SE.hasComputableLoopEvolution(BaseReg, L)) {
3660  if (!CombinedIntegerType)
3661  CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
3662  Ops.push_back(BaseReg);
3663  }
3664  else
3665  NewBase.BaseRegs.push_back(BaseReg);
3666  }
3667 
3668  // If no register is relevant, we're done.
3669  if (Ops.size() == 0)
3670  return;
3671 
3672  // Utility function for generating the required variants of the combined
3673  // registers.
3674  auto GenerateFormula = [&](const SCEV *Sum) {
3675  Formula F = NewBase;
3676 
3677  // TODO: If Sum is zero, it probably means ScalarEvolution missed an
3678  // opportunity to fold something. For now, just ignore such cases
3679  // rather than proceed with zero in a register.
3680  if (Sum->isZero())
3681  return;
3682 
3683  F.BaseRegs.push_back(Sum);
3684  F.canonicalize(*L);
3685  (void)InsertFormula(LU, LUIdx, F);
3686  };
3687 
3688  // If we collected at least two registers, generate a formula combining them.
3689  if (Ops.size() > 1) {
3690  SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
3691  GenerateFormula(SE.getAddExpr(OpsCopy));
3692  }
3693 
3694  // If we have an unfolded offset, generate a formula combining it with the
3695  // registers collected.
3696  if (NewBase.UnfoldedOffset) {
3697  assert(CombinedIntegerType && "Missing a type for the unfolded offset");
3698  Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset,
3699  true));
3700  NewBase.UnfoldedOffset = 0;
3701  GenerateFormula(SE.getAddExpr(Ops));
3702  }
3703 }
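
// Editorial sketch: given a formula whose base registers are the
// loop-invariant values %a and %b plus the addrec {0,+,4}<%L>, this produces
// the extra candidate reg((%a + %b)) + reg({0,+,4}<%L>), i.e. one combined
// loop-invariant register (computable outside the loop) instead of two live
// across the body.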
3704 
3705 /// Helper function for LSRInstance::GenerateSymbolicOffsets.
3706 void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
3707  const Formula &Base, size_t Idx,
3708  bool IsScaledReg) {
3709  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3710  GlobalValue *GV = ExtractSymbol(G, SE);
3711  if (G->isZero() || !GV)
3712  return;
3713  Formula F = Base;
3714  F.BaseGV = GV;
3715  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
3716  return;
3717  if (IsScaledReg)
3718  F.ScaledReg = G;
3719  else
3720  F.BaseRegs[Idx] = G;
3721  (void)InsertFormula(LU, LUIdx, F);
3722 }
3723 
3724 /// Generate reuse formulae using symbolic offsets.
3725 void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
3726  Formula Base) {
3727  // We can't add a symbolic offset if the address already contains one.
3728  if (Base.BaseGV) return;
3729 
3730  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3731  GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
3732  if (Base.Scale == 1)
3733  GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
3734  /* IsScaledReg */ true);
3735 }
3736 
3737 /// Helper function for LSRInstance::GenerateConstantOffsets.
3738 void LSRInstance::GenerateConstantOffsetsImpl(
3739  LSRUse &LU, unsigned LUIdx, const Formula &Base,
3740  const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
3741  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3742  for (int64_t Offset : Worklist) {
3743  Formula F = Base;
3744  F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
3745  if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
3746  LU.AccessTy, F)) {
3747  // Add the offset to the base register.
3748  const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
3749  // If it cancelled out, drop the base register, otherwise update it.
3750  if (NewG->isZero()) {
3751  if (IsScaledReg) {
3752  F.Scale = 0;
3753  F.ScaledReg = nullptr;
3754  } else
3755  F.deleteBaseReg(F.BaseRegs[Idx]);
3756  F.canonicalize(*L);
3757  } else if (IsScaledReg)
3758  F.ScaledReg = NewG;
3759  else
3760  F.BaseRegs[Idx] = NewG;
3761 
3762  (void)InsertFormula(LU, LUIdx, F);
3763  }
3764  }
3765 
3766  int64_t Imm = ExtractImmediate(G, SE);
3767  if (G->isZero() || Imm == 0)
3768  return;
3769  Formula F = Base;
3770  F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
3771  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
3772  return;
3773  if (IsScaledReg)
3774  F.ScaledReg = G;
3775  else
3776  F.BaseRegs[Idx] = G;
3777  (void)InsertFormula(LU, LUIdx, F);
3778 }
3779 
3780 /// Generate reuse formulae using constant offsets.
3781 void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
3782  Formula Base) {
3783  // TODO: For now, just add the min and max offset, because it usually isn't
3784  // worthwhile looking at everything in between.
3785  SmallVector<int64_t, 2> Worklist;
3786  Worklist.push_back(LU.MinOffset);
3787  if (LU.MaxOffset != LU.MinOffset)
3788  Worklist.push_back(LU.MaxOffset);
3789 
3790  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3791  GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
3792  if (Base.Scale == 1)
3793  GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
3794  /* IsScaledReg */ true);
3795 }
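
// Editorial example (hedged): if a use's fixups span offsets 0 and 64 over the
// register {%a,+,4}<%L>, this also proposes the shifted register
// {(%a + 64),+,4}<%L> with 64 subtracted from the formula's immediate, so that
// whichever end of the offset range the target folds best can be chosen during
// solving.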
3796 
3797 /// For ICmpZero, check to see if we can scale up the comparison. For example, x
3798 /// == y -> x*c == y*c.
3799 void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
3800  Formula Base) {
3801  if (LU.Kind != LSRUse::ICmpZero) return;
3802 
3803  // Determine the integer type for the base formula.
3804  Type *IntTy = Base.getType();
3805  if (!IntTy) return;
3806  if (SE.getTypeSizeInBits(IntTy) > 64) return;
3807 
3808  // Don't do this if there is more than one offset.
3809  if (LU.MinOffset != LU.MaxOffset) return;
3810 
3811  // Check if the transformation is valid. It is illegal to multiply a pointer.
3812  if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
3813  return;
3814  for (const SCEV *BaseReg : Base.BaseRegs)
3815  if (BaseReg->getType()->isPointerTy())
3816  return;
3817  assert(!Base.BaseGV && "ICmpZero use is not legal!");
3818 
3819  // Check each interesting stride.
3820  for (int64_t Factor : Factors) {
3821  // Check that the multiplication doesn't overflow.
3822  if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1)
3823  continue;
3824  int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
3825  if (NewBaseOffset / Factor != Base.BaseOffset)
3826  continue;
3827  // If the offset will be truncated at this use, check that it is in bounds.
3828  if (!IntTy->isPointerTy() &&
3829  !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
3830  continue;
3831 
3832  // Check that multiplying with the use offset doesn't overflow.
3833  int64_t Offset = LU.MinOffset;
3834  if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
3835  continue;
3836  Offset = (uint64_t)Offset * Factor;
3837  if (Offset / Factor != LU.MinOffset)
3838  continue;
3839  // If the offset will be truncated at this use, check that it is in bounds.
3840  if (!IntTy->isPointerTy() &&
3841  !ConstantInt::isValueValidForType(IntTy, Offset))
3842  continue;
3843 
3844  Formula F = Base;
3845  F.BaseOffset = NewBaseOffset;
3846 
3847  // Check that this scale is legal.
3848  if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
3849  continue;
3850 
3851  // Compensate for the use having MinOffset built into it.
3852  F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
3853 
3854  const SCEV *FactorS = SE.getConstant(IntTy, Factor);
3855 
3856  // Check that multiplying with each base register doesn't overflow.
3857  for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
3858  F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
3859  if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
3860  goto next;
3861  }
3862 
3863  // Check that multiplying with the scaled register doesn't overflow.
3864  if (F.ScaledReg) {
3865  F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
3866  if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
3867  continue;
3868  }
3869 
3870  // Check that multiplying with the unfolded offset doesn't overflow.
3871  if (F.UnfoldedOffset != 0) {
3872  if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() &&
3873  Factor == -1)
3874  continue;
3875  F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
3876  if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
3877  continue;
3878  // If the offset will be truncated, check that it is in bounds.
3879  if (!IntTy->isPointerTy() &&
3880  !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
3881  continue;
3882  }
3883 
3884  // If we make it here and it's legal, add it.
3885  (void)InsertFormula(LU, LUIdx, F);
3886  next:;
3887  }
3888 }
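
// Editorial example: with 4 in Factors, an ICmpZero use comparing an IV that
// steps by 1 against %n can be rescaled so that both sides are multiplied by
// 4; that is useful when an address use already keeps a register striding by
// 4, since the exit test can then reuse that register instead of keeping a
// second IV alive.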
3889 
3890 /// Generate stride factor reuse formulae by making use of scaled-offset address
3891 /// modes, for example.
3892 void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
3893  // Determine the integer type for the base formula.
3894  Type *IntTy = Base.getType();
3895  if (!IntTy) return;
3896 
3897  // If this Formula already has a scaled register, we can't add another one.
3898  // Try to unscale the formula to generate a better scale.
3899  if (Base.Scale != 0 && !Base.unscale())
3900  return;
3901 
3902  assert(Base.Scale == 0 && "unscale did not do its job!");
3903 
3904  // Check each interesting stride.
3905  for (int64_t Factor : Factors) {
3906  Base.Scale = Factor;
3907  Base.HasBaseReg = Base.BaseRegs.size() > 1;
3908  // Check whether this scale is going to be legal.
3909  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
3910  Base)) {
3911  // As a special case, handle out-of-loop Basic users specially.
3912  // TODO: Reconsider this special case.
3913  if (LU.Kind == LSRUse::Basic &&
3914  isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
3915  LU.AccessTy, Base) &&
3916  LU.AllFixupsOutsideLoop)
3917  LU.Kind = LSRUse::Special;
3918  else
3919  continue;
3920  }
3921  // For an ICmpZero, negating a solitary base register won't lead to
3922  // new solutions.
3923  if (LU.Kind == LSRUse::ICmpZero &&
3924  !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
3925  continue;
3926  // For each addrec base reg, if its loop is current loop, apply the scale.
3927  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
3928  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
3929  if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
3930  const SCEV *FactorS = SE.getConstant(IntTy, Factor);
3931  if (FactorS->isZero())
3932  continue;
3933  // Divide out the factor, ignoring high bits, since we'll be
3934  // scaling the value back up in the end.
3935  if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
3936  // TODO: This could be optimized to avoid all the copying.
3937  Formula F = Base;
3938  F.ScaledReg = Quotient;
3939  F.deleteBaseReg(F.BaseRegs[i]);
3940  // The canonical representation of 1*reg is reg, which is already in
3941  // Base. In that case, do not try to insert the formula, it will be
3942  // rejected anyway.
3943  if (F.Scale == 1 && (F.BaseRegs.empty() ||
3944  (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
3945  continue;
3946  // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
3947  // non canonical Formula with ScaledReg's loop not being L.
3948  if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
3949  F.canonicalize(*L);
3950  (void)InsertFormula(LU, LUIdx, F);
3951  }
3952  }
3953  }
3954  }
3955 }
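
// Editorial example: for a base register {0,+,4}<%L> and Factor 4, the exact
// quotient is {0,+,1}<%L>, so the formula can be re-expressed as
// 4 * reg({0,+,1}<%L>); a target with scaled-index addressing (base + 4*index)
// may be able to absorb that multiplication for free.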
3956 
3957 /// Generate reuse formulae from different IV types.
3958 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
3959  // Don't bother truncating symbolic values.
3960  if (Base.BaseGV) return;
3961 
3962  // Determine the integer type for the base formula.
3963  Type *DstTy = Base.getType();
3964  if (!DstTy) return;
3965  DstTy = SE.getEffectiveSCEVType(DstTy);
3966 
3967  for (Type *SrcTy : Types) {
3968  if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
3969  Formula F = Base;
3970 
3971  if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
3972  for (const SCEV *&BaseReg : F.BaseRegs)
3973  BaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
3974 
3975  // TODO: This assumes we've done basic processing on all uses and
3976  // have an idea what the register usage is.
3977  if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
3978  continue;
3979 
3980  F.canonicalize(*L);
3981  (void)InsertFormula(LU, LUIdx, F);
3982  }
3983  }
3984 }
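
// Editorial note: on a target where truncating i64 to i32 is free, this lets a
// use whose formula is written over i32 registers be re-expressed over an
// existing i64 IV's registers (via any-extend), so both uses can share the
// wider induction variable and the narrow use pays only a free truncate.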
3985 
3986 namespace {
3987 
3988 /// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
3989 /// modifications so that the search phase doesn't have to worry about the data
3990 /// structures moving underneath it.
3991 struct WorkItem {
3992  size_t LUIdx;
3993  int64_t Imm;
3994  const SCEV *OrigReg;
3995 
3996  WorkItem(size_t LI, int64_t I, const SCEV *R)
3997  : LUIdx(LI), Imm(I), OrigReg(R) {}
3998 
3999  void print(raw_ostream &OS) const;
4000  void dump() const;
4001 };
4002 
4003 } // end anonymous namespace
4004 
4005 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4006 void WorkItem::print(raw_ostream &OS) const {
4007  OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4008  << " , add offset " << Imm;
4009 }
4010 
4011 LLVM_DUMP_METHOD void WorkItem::dump() const {
4012  print(errs()); errs() << '\n';
4013 }
4014 #endif
4015 
4016 /// Look for registers which are a constant distance apart and try to form reuse
4017 /// opportunities between them.
4018 void LSRInstance::GenerateCrossUseConstantOffsets() {
4019  // Group the registers by their value without any added constant offset.
4020  using ImmMapTy = std::map<int64_t, const SCEV *>;
4021 
4022  DenseMap<const SCEV *, ImmMapTy> Map;
4023  DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4024  SmallVector<const SCEV *, 8> Sequence;
4025  for (const SCEV *Use : RegUses) {
4026  const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4027  int64_t Imm = ExtractImmediate(Reg, SE);
4028  auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
4029  if (Pair.second)
4030  Sequence.push_back(Reg);
4031  Pair.first->second.insert(std::make_pair(Imm, Use));
4032  UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4033  }
4034 
4035  // Now examine each set of registers with the same base value. Build up
4036  // a list of work to do and do the work in a separate step so that we're
4037  // not adding formulae and register counts while we're searching.
4038  SmallVector<WorkItem, 32> WorkItems;
4039  SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
4040  for (const SCEV *Reg : Sequence) {
4041  const ImmMapTy &Imms = Map.find(Reg)->second;
4042 
4043  // It's not worthwhile looking for reuse if there's only one offset.
4044  if (Imms.size() == 1)
4045  continue;
4046 
4047  LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4048  for (const auto &Entry
4049  : Imms) dbgs()
4050  << ' ' << Entry.first;
4051  dbgs() << '\n');
4052 
4053  // Examine each offset.
4054  for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4055  J != JE; ++J) {
4056  const SCEV *OrigReg = J->second;
4057 
4058  int64_t JImm = J->first;
4059  const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4060 
4061  if (!isa<SCEVConstant>(OrigReg) &&
4062  UsedByIndicesMap[Reg].count() == 1) {
4063  LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4064  << '\n');
4065  continue;
4066  }
4067 
4068  // Conservatively examine offsets between this orig reg a few selected
4069  // other orig regs.
4070  ImmMapTy::const_iterator OtherImms[] = {
4071  Imms.begin(), std::prev(Imms.end()),
4072  Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) /
4073  2)
4074  };
4075  for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
4076  ImmMapTy::const_iterator M = OtherImms[i];
4077  if (M == J || M == JE) continue;
4078 
4079  // Compute the difference between the two.
4080  int64_t Imm = (uint64_t)JImm - M->first;
4081  for (unsigned LUIdx : UsedByIndices.set_bits())
4082  // Make a memo of this use, offset, and register tuple.
4083  if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4084  WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4085  }
4086  }
4087  }
4088 
4089  Map.clear();
4090  Sequence.clear();
4091  UsedByIndicesMap.clear();
4092  UniqueItems.clear();
4093 
4094  // Now iterate through the worklist and add new formulae.
4095  for (const WorkItem &WI : WorkItems) {
4096  size_t LUIdx = WI.LUIdx;
4097  LSRUse &LU = Uses[LUIdx];
4098  int64_t Imm = WI.Imm;
4099  const SCEV *OrigReg = WI.OrigReg;
4100 
4101  Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4102  const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
4103  unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4104 
4105  // TODO: Use a more targeted data structure.
4106  for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4107  Formula F = LU.Formulae[L];
4108  // FIXME: The code for the scaled and unscaled registers looks
4109  // very similar but slightly different. Investigate if they
4110  // could be merged. That way, we would not have to unscale the
4111  // Formula.
4112  F.unscale();
4113  // Use the immediate in the scaled register.
4114  if (F.ScaledReg == OrigReg) {
4115  int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
4116  // Don't create 50 + reg(-50).
4117  if (F.referencesReg(SE.getSCEV(
4118  ConstantInt::get(IntTy, -(uint64_t)Offset))))
4119  continue;
4120  Formula NewF = F;
4121  NewF.BaseOffset = Offset;
4122  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4123  NewF))
4124  continue;
4125  NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4126 
4127  // If the new scale is a constant in a register, and adding the constant
4128  // value to the immediate would produce a value closer to zero than the
4129  // immediate itself, then the formula isn't worthwhile.
4130  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
4131  if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
4132  (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4133  .ule(std::abs(NewF.BaseOffset)))
4134  continue;
4135 
4136  // OK, looks good.
4137  NewF.canonicalize(*this->L);
4138  (void)InsertFormula(LU, LUIdx, NewF);
4139  } else {
4140  // Use the immediate in a base register.
4141  for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4142  const SCEV *BaseReg = F.BaseRegs[N];
4143  if (BaseReg != OrigReg)
4144  continue;
4145  Formula NewF = F;
4146  NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
4147  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4148  LU.Kind, LU.AccessTy, NewF)) {
4149  if (TTI.shouldFavorPostInc() &&
4150  mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4151  continue;
4152  if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
4153  continue;
4154  NewF = F;
4155  NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
4156  }
4157  NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4158 
4159  // If the new formula has a constant in a register, and adding the
4160  // constant value to the immediate would produce a value closer to
4161  // zero than the immediate itself, then the formula isn't worthwhile.
4162  for (const SCEV *NewReg : NewF.BaseRegs)
4163  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
4164  if ((C->getAPInt() + NewF.BaseOffset)
4165  .abs()
4166  .slt(std::abs(NewF.BaseOffset)) &&
4167  (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >=
4168  countTrailingZeros<uint64_t>(NewF.BaseOffset))
4169  goto skip_formula;
4170 
4171  // Ok, looks good.
4172  NewF.canonicalize(*this->L);
4173  (void)InsertFormula(LU, LUIdx, NewF);
4174  break;
4175  skip_formula:;
4176  }
4177  }
4178  }
4179  }
4180 }
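
// Editorial worked example: if use 0 references reg((%a + 4)) and use 1
// references reg(%a), the two registers share the base %a and differ by 4, so
// a WorkItem with Imm = -4 is queued for use 1; the loop above then gives
// use 1 a formula written over (%a + 4) with -4 folded into its immediate,
// letting both uses share a single register.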
4181 
4182 /// Generate formulae for each use.
4183 void
4184 LSRInstance::GenerateAllReuseFormulae() {
4185  // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4186  // queries are more precise.
4187  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4188  LSRUse &LU = Uses[LUIdx];
4189  for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4190  GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4191  for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4192  GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4193  }
4194  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4195  LSRUse &LU = Uses[LUIdx];
4196  for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4197  GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4198  for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4199  GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4200  for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4201  GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4202  for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4203  GenerateScales(LU, LUIdx, LU.Formulae[i]);
4204  }
4205  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4206  LSRUse &LU = Uses[LUIdx];
4207  for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4208  GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4209  }
4210 
4211  GenerateCrossUseConstantOffsets();
4212 
4213  LLVM_DEBUG(dbgs() << "\n"
4214  "After generating reuse formulae:\n";
4215  print_uses(dbgs()));
4216 }
4217 
4218 /// If there are multiple formulae with the same set of registers used
4219 /// by other uses, pick the best one and delete the others.
4220 void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4221  DenseSet<const SCEV *> VisitedRegs;
4222  SmallPtrSet<const SCEV *, 16> Regs;
4223  SmallPtrSet<const SCEV *, 16> LoserRegs;
4224 #ifndef NDEBUG
4225  bool ChangedFormulae = false;
4226 #endif
4227 
4228  // Collect the best formula for each unique set of shared registers. This
4229  // is reset for each use.
4230  using BestFormulaeTy =
4231  DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
4232 
4233  BestFormulaeTy BestFormulae;
4234 
4235  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4236  LSRUse &LU = Uses[LUIdx];
4237  LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4238  dbgs() << '\n');
4239 
4240  bool Any = false;
4241  for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4242  FIdx != NumForms; ++FIdx) {
4243  Formula &F = LU.Formulae[FIdx];
4244 
4245  // Some formulas are instant losers. For example, they may depend on
4246  // nonexistent AddRecs from other loops. These need to be filtered
4247  // immediately, otherwise heuristics could choose them over others leading
4248  // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4249  // avoids the need to recompute this information across formulae using the
4250  // same bad AddRec. Passing LoserRegs is also essential unless we remove
4251  // the corresponding bad register from the Regs set.
4252  Cost CostF;
4253  Regs.clear();
4254  CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, SE, DT, LU, &LoserRegs);
4255  if (CostF.isLoser()) {
4256  // During initial formula generation, undesirable formulae are generated
4257  // by uses within other loops that have some non-trivial address mode or
4258  // use the postinc form of the IV. LSR needs to provide these formulae
4259  // as the basis of rediscovering the desired formula that uses an AddRec
4260  // corresponding to the existing phi. Once all formulae have been
4261  // generated, these initial losers may be pruned.
4262  LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4263  dbgs() << "\n");
4264  }
4265  else {
4266  SmallVector<const SCEV *, 4> Key;
4267  for (const SCEV *Reg : F.BaseRegs) {
4268  if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4269  Key.push_back(Reg);
4270  }
4271  if (F.ScaledReg &&
4272  RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4273  Key.push_back(F.ScaledReg);
4274  // Unstable sort by host order ok, because this is only used for
4275  // uniquifying.
4276  llvm::sort(Key);
4277 
4278  std::pair<BestFormulaeTy::const_iterator, bool> P =
4279  BestFormulae.insert(std::make_pair(Key, FIdx));
4280  if (P.second)
4281  continue;
4282 
4283  Formula &Best = LU.Formulae[P.first->second];
4284 
4285  Cost CostBest;
4286  Regs.clear();
4287  CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
4288  if (CostF.isLess(CostBest, TTI))
4289  std::swap(F, Best);
4290  LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4291  dbgs() << "\n"
4292  " in favor of formula ";
4293  Best.print(dbgs()); dbgs() << '\n');
4294  }
4295 #ifndef NDEBUG
4296  ChangedFormulae = true;
4297 #endif
4298  LU.DeleteFormula(F);
4299  --FIdx;
4300  --NumForms;
4301  Any = true;
4302  }
4303 
4304  // Now that we've filtered out some formulae, recompute the Regs set.
4305  if (Any)
4306  LU.RecomputeRegs(LUIdx, RegUses);
4307 
4308  // Reset this to prepare for the next use.
4309  BestFormulae.clear();
4310  }
4311 
4312  LLVM_DEBUG(if (ChangedFormulae) {
4313  dbgs() << "\n"
4314  "After filtering out undesirable candidates:\n";
4315  print_uses(dbgs());
4316  });
4317 }
4318 
4319 /// Estimate the worst-case number of solutions the solver might have to
4320 /// consider. It almost never considers this many solutions because it prunes the
4321 /// search space, but the pruning isn't always sufficient.
4322 size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4323  size_t Power = 1;
4324  for (const LSRUse &LU : Uses) {
4325  size_t FSize = LU.Formulae.size();
4326  if (FSize >= ComplexityLimit) {
4327  Power = ComplexityLimit;
4328  break;
4329  }
4330  Power *= FSize;
4331  if (Power >= ComplexityLimit)
4332  break;
4333  }
4334  return Power;
4335 }
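
// Editorial arithmetic example: three uses with 5, 7 and 4 formulae give a
// worst case of 5 * 7 * 4 = 140 candidate assignments; once the running
// product (or any single use's formula count) reaches ComplexityLimit the walk
// stops early, since callers only compare the estimate against that limit.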
4336 
4337 /// When one formula uses a superset of the registers of another formula, it
4338 /// won't help reduce register pressure (though it may not necessarily hurt
4339 /// register pressure); remove it to simplify the system.
4340 void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4341  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4342  LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4343 
4344  LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4345  "which use a superset of registers used by other "
4346  "formulae.\n");
4347 
4348  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4349  LSRUse &LU = Uses[LUIdx];
4350  bool Any = false;
4351  for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4352  Formula &F = LU.Formulae[i];
4353  // Look for a formula with a constant or GV in a register. If the use
4354  // also has a formula with that same value in an immediate field,
4355  // delete the one that uses a register.
4356  for (SmallVectorImpl<const SCEV *>::const_iterator
4357  I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4358  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4359  Formula NewF = F;
4360  NewF.BaseOffset += C->getValue()->getSExtValue();
4361  NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4362  (I - F.BaseRegs.begin()));
4363  if (LU.HasFormulaWithSameRegs(NewF)) {
4364  LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4365  dbgs() << '\n');
4366  LU.DeleteFormula(F);
4367  --i;
4368  --e;
4369  Any = true;
4370  break;
4371  }
4372  } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4373  if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4374  if (!F.BaseGV) {
4375  Formula NewF = F;
4376  NewF.BaseGV = GV;
4377  NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4378  (I - F.BaseRegs.begin()));
4379  if (LU.HasFormulaWithSameRegs(NewF)) {
4380  LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4381  dbgs() << '\n');
4382  LU.DeleteFormula(F);
4383  --i;
4384  --e;
4385  Any = true;
4386  break;
4387  }
4388  }
4389  }
4390  }
4391  }
4392  if (Any)
4393  LU.RecomputeRegs(LUIdx, RegUses);
4394  }
4395 
4396  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4397  }
4398 }
4399 
4400 /// When there are many registers for expressions like A, A+1, A+2, etc.,
4401 /// allocate a single register for them.
4402 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4403  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4404  return;
4405 
4406  LLVM_DEBUG(
4407  dbgs() << "The search space is too complex.\n"
4408  "Narrowing the search space by assuming that uses separated "
4409  "by a constant offset will use the same registers.\n");
4410 
4411  // This is especially useful for unrolled loops.
4412 
4413  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4414  LSRUse &LU = Uses[LUIdx];
4415  for (const Formula &F : LU.Formulae) {
4416  if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
4417  continue;
4418 
4419  LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4420  if (!LUThatHas)
4421  continue;
4422 
4423  if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4424  LU.Kind, LU.AccessTy))
4425  continue;
4426 
4427  LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4428 
4429  LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4430 
4431  // Transfer the fixups of LU to LUThatHas.
4432  for (LSRFixup &Fixup : LU.Fixups) {
4433  Fixup.Offset += F.BaseOffset;
4434  LUThatHas->pushFixup(Fixup);
4435  LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4436  }
4437 
4438  // Delete formulae from the new use which are no longer legal.
4439  bool Any = false;
4440  for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4441  Formula &F = LUThatHas->Formulae[i];
4442  if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4443  LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4444  LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
4445  LUThatHas->DeleteFormula(F);
4446  --i;
4447  --e;
4448  Any = true;
4449  }
4450  }
4451 
4452  if (Any)
4453  LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4454 
4455  // Delete the old use.
4456  DeleteUse(LU, LUIdx);
4457  --LUIdx;
4458  --NumUses;
4459  break;
4460  }
4461  }
4462 
4463  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4464 }
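
// Editorial example: after unrolling by four, the accesses A[i], A[i+1],
// A[i+2], A[i+3] appear as separate uses whose formulae differ only by a small
// constant BaseOffset; folding each offset into the fixups of one surviving
// use lets the solver allocate a single register for all of them.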
4465 
4466 /// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
4467 /// we've done more filtering, as it may be able to find more formulae to
4468 /// eliminate.
4469 void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
4470  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4471  LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4472 
4473  LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
4474  "undesirable dedicated registers.\n");
4475 
4476  FilterOutUndesirableDedicatedRegisters();
4477 
4478  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4479  }
4480 }
4481 
4482 /// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
4483 /// pick the best one and delete the others.
4484 /// This narrowing heuristic keeps as many formulae with different
4485 /// Scale and ScaledReg pairs as possible while narrowing the search space.
4486 /// The benefit is that it is more likely to find a better solution
4487 /// from a formula set with more Scale and ScaledReg variations than from
4488 /// a formula set with the same Scale and ScaledReg. The winner-picking
4489 /// reg heuristic will often keep the formulae with the same Scale and
4490 /// ScaledReg and filter out the others, and we want to avoid that if possible.
4491 void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
4492  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4493  return;
4494 
4495  LLVM_DEBUG(
4496  dbgs() << "The search space is too complex.\n"
4497  "Narrowing the search space by choosing the best Formula "
4498  "from the Formulae with the same Scale and ScaledReg.\n");
4499 
4500  // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
4501  using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
4502 
4503  BestFormulaeTy BestFormulae;
4504 #ifndef NDEBUG
4505  bool ChangedFormulae = false;
4506 #endif
4507  DenseSet<const SCEV *> VisitedRegs;
4508  SmallPtrSet<const SCEV *, 16> Regs;
4509 
4510  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4511  LSRUse &LU = Uses[LUIdx];
4512  LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4513  dbgs() << '\n');
4514 
4515  // Return true if Formula FA is better than Formula FB.
4516  auto IsBetterThan = [&](Formula &FA, Formula &FB) {
4517  // First, try to choose the Formula with fewer new registers.
4518  // For a register used by the current Formula, the more that register
4519  // is shared among LSRUses, the less it increases the register
4520  // counter of the formula.
4521  size_t FARegNum = 0;
4522  for (const SCEV *Reg : FA.BaseRegs) {
4523  const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
4524  FARegNum += (NumUses - UsedByIndices.count() + 1);
4525  }
4526  size_t FBRegNum = 0;
4527  for (const SCEV *Reg : FB.BaseRegs) {
4528  const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
4529  FBRegNum += (NumUses - UsedByIndices.count() + 1);
4530  }
4531  if (FARegNum != FBRegNum)
4532  return FARegNum < FBRegNum;
4533 
4534  // If the new register numbers are the same, choose the Formula with
4535  // the lower Cost.
4536  Cost CostFA, CostFB;
4537  Regs.clear();
4538  CostFA.RateFormula(TTI, FA, Regs, VisitedRegs, L, SE, DT, LU);
4539  Regs.clear();
4540  CostFB.RateFormula(TTI, FB, Regs, VisitedRegs, L, SE, DT, LU);
4541  return CostFA.isLess(CostFB, TTI);
4542  };
4543 
4544  bool Any = false;
4545  for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
4546  ++FIdx) {
4547  Formula &F = LU.Formulae[FIdx];
4548  if (!F.ScaledReg)
4549  continue;
4550  auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
4551  if (P.second)
4552  continue;
4553 
4554  Formula &Best = LU.Formulae[P.first->second];
4555  if (IsBetterThan(F, Best))
4556  std::swap(F, Best);
4557  LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4558  dbgs() << "\n"
4559  " in favor of formula ";
4560  Best.print(dbgs()); dbgs() << '\n');
4561 #ifndef NDEBUG
4562  ChangedFormulae = true;
4563 #endif
4564  LU.DeleteFormula(F);
4565  --FIdx;
4566  --NumForms;
4567  Any = true;
4568  }
4569  if (Any)
4570  LU.RecomputeRegs(LUIdx, RegUses);
4571 
4572  // Reset this to prepare for the next use.
4573  BestFormulae.clear();
4574  }
4575 
4576  LLVM_DEBUG(if (ChangedFormulae) {
4577  dbgs() << "\n"
4578  "After filtering out undesirable candidates:\n";
4579  print_uses(dbgs());
4580  });
4581 }
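
// (Added illustrative sketch; not part of the original LoopStrengthReduce.cpp.)
// The filtering loop above keeps exactly one formula per (ScaledReg, Scale)
// key, swapping in whichever candidate IsBetterThan prefers. The same
// "keep the best entry per key" pattern, reduced to integer keys and scores
// (lower score wins), is sketched below; keepBestPerKey and its parameters are
// hypothetical names, not LSR API.
static void keepBestPerKey(SmallVectorImpl<std::pair<unsigned, unsigned>> &Cands) {
  // Maps a key to the index of the best candidate seen so far.
  DenseMap<unsigned, size_t> BestIdx;
  for (size_t I = 0; I != Cands.size(); ++I) {
    auto Ins = BestIdx.insert({Cands[I].first, I});
    if (Ins.second)
      continue;                             // First candidate with this key.
    size_t Keep = Ins.first->second;
    if (Cands[I].second < Cands[Keep].second)
      std::swap(Cands[I], Cands[Keep]);     // New winner; the loser is now at I.
    Cands.erase(Cands.begin() + I);         // Drop the loser, like DeleteFormula.
    --I;
  }
}
// (End of added sketch.)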
4582 
4583 /// This function deletes formulas with a high expected register count.
4584 /// Assuming we don't know the value of each formula (all clearly
4585 /// inefficient ones have already been deleted), compute for each register
4586 /// the probability of it not being selected.
4587 /// For example,
4588 /// Use1:
4589 /// reg(a) + reg({0,+,1})
4590 /// reg(a) + reg({-1,+,1}) + 1
4591 /// reg({a,+,1})
4592 /// Use2:
4593 /// reg(b) + reg({0,+,1})
4594 /// reg(b) + reg({-1,+,1}) + 1
4595 /// reg({b,+,1})
4596 /// Use3:
4597 /// reg(c) + reg(b) + reg({0,+,1})
4598 /// reg(c) + reg({b,+,1})
4599 ///
4600 /// Probability of not selecting
4601 /// Use1 Use2 Use3
4602 /// reg(a) (1/3) * 1 * 1
4603 /// reg(b) 1 * (1/3) * (1/2)
4604 /// reg({0,+,1}) (2/3) * (2/3) * (1/2)
4605 /// reg({-1,+,1}) (2/3) * (2/3) * 1
4606 /// reg({a,+,1}) (2/3) * 1 * 1
4607 /// reg({b,+,1}) 1 * (2/3) * (2/3)
4608 /// reg(c) 1 * 1 * 0
4609 ///
4610 /// Now compute the expected register count for each formula.
4611 /// Note that for each use we exclude the probability of not selecting for
4612 /// that use. For example, for Use1 the probability for reg(a) is just 1 * 1
4613 /// (excluding the probability 1/3 of not selecting for Use1).
4614 /// Use1:
4615 /// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
4616 /// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
4617 /// reg({a,+,1}) 1
4618 /// Use2:
4619 /// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
4620 /// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
4621 /// reg({b,+,1}) 2/3
4622 /// Use3:
4623 /// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
4624 /// reg(c) + reg({b,+,1}) 1 + 2/3
4625 void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
4626  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4627  return;
4628  // Ok, we have too many formulae on our hands to conveniently handle.
4629  // Use a rough heuristic to thin out the list.
4630 
4631  // Set of Regs which will definitely be used in the final solution, i.e.
4632  // used in each formula of a solution (in the example above this is reg(c)).
4633  // We can skip them in the calculations.
4634  SmallPtrSet<const SCEV *, 4> UniqRegs;
4635  LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4636 
4637  // Map each register to its probability of not being selected.
4638  DenseMap<const SCEV *, float> RegNumMap;
4639  for (const SCEV *Reg : RegUses) {
4640  if (UniqRegs.count(Reg))
4641  continue;
4642  float PNotSel = 1;
4643  for (const LSRUse &LU : Uses) {
4644  if (!LU.Regs.count(Reg))
4645  continue;
4646  float P = LU.getNotSelectedProbability(Reg);
4647  if (P != 0.0)
4648  PNotSel *= P;
4649  else
4650  UniqRegs.insert(Reg);
4651  }
4652  RegNumMap.insert(std::make_pair(Reg, PNotSel));
4653  }
4654 
4655  LLVM_DEBUG(
4656  dbgs() << "Narrowing the search space by deleting costly formulas\n");
4657 
4658  // Delete formulas whose expected register count is high.
4659  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4660  LSRUse &LU = Uses[LUIdx];
4661  // If there is nothing to delete, continue.
4662  if (LU.Formulae.size() < 2)
4663  continue;
4664  // This is a temporary solution to test performance. Float should be
4665  // replaced with a rounding-independent type (based on integers) to avoid
4666  // different results for different target builds.
4667  float FMinRegNum = LU.Formulae[0].getNumRegs();
4668  float FMinARegNum = LU.Formulae[0].getNumRegs();
4669  size_t MinIdx = 0;
4670  for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4671  Formula &F = LU.Formulae[i];
4672  float FRegNum = 0;
4673  float FARegNum = 0;
4674  for (const SCEV *BaseReg : F.BaseRegs) {
4675  if (UniqRegs.count(BaseReg))
4676  continue;
4677  FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
4678  if (isa<SCEVAddRecExpr>(BaseReg))
4679  FARegNum +=
4680  RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
4681  }
4682  if (const SCEV *ScaledReg = F.ScaledReg) {
4683  if (!UniqRegs.count(ScaledReg)) {
4684  FRegNum +=
4685  RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
4686  if (isa<SCEVAddRecExpr>(ScaledReg))
4687  FARegNum +=
4688  RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
4689  }
4690  }
4691  if (FMinRegNum > FRegNum ||
4692  (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
4693  FMinRegNum = FRegNum;
4694  FMinARegNum = FARegNum;
4695  MinIdx = i;
4696  }
4697  }
4698  LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
4699  dbgs() << " with min reg num " << FMinRegNum << '\n');
4700  if (MinIdx != 0)
4701  std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
4702  while (LU.Formulae.size() != 1) {
4703  LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
4704  dbgs() << '\n');
4705  LU.Formulae.pop_back();
4706  }
4707  LU.RecomputeRegs(LUIdx, RegUses);
4708  assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
4709  Formula &F = LU.Formulae[0];
4710  LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
4711  // When we choose the formula, the regs become unique.
4712  UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
4713  if (F.ScaledReg)
4714  UniqRegs.insert(F.ScaledReg);
4715  }
4716  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4717 }
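
// (Added illustrative sketch; not part of the original LoopStrengthReduce.cpp.)
// The worked example in the comment block above can be reproduced with plain
// integers standing in for registers. A register's probability of not being
// selected by a use is the fraction of that use's formulae which do not
// mention it; a formula's expected register count sums, for each of its
// registers, the product of those probabilities over all *other* uses (the
// current use is excluded because the formula, if chosen, definitely uses the
// register). SketchFormula, SketchUse, probNotSelected and expectedRegCount
// are hypothetical names used only for this sketch, and each per-formula
// register list is assumed to be duplicate-free.
using SketchFormula = SmallVector<unsigned, 4>;   // Register ids of one formula.
using SketchUse = SmallVector<SketchFormula, 4>;  // Candidate formulae of one use.

static float probNotSelected(const SketchUse &U, unsigned Reg) {
  unsigned NotUsing = 0;
  for (const SketchFormula &F : U)
    if (!is_contained(F, Reg))
      ++NotUsing;
  return float(NotUsing) / float(U.size());
}

static float expectedRegCount(const SmallVectorImpl<SketchUse> &AllUses,
                              size_t UseIdx, const SketchFormula &F) {
  float Expected = 0;
  for (unsigned Reg : F) {
    float P = 1;
    for (size_t I = 0, E = AllUses.size(); I != E; ++I)
      if (I != UseIdx)
        P *= probNotSelected(AllUses[I], Reg);
    Expected += P;   // E.g. reg({0,+,1}) in Use1 above: (2/3) * (1/2) = 1/3.
  }
  return Expected;
}
// (End of added sketch.)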
4718 
4719 /// Pick a register which seems likely to be profitable, and then in any use
4720 /// which has any reference to that register, delete all formulae which do not
4721 /// reference that register.
4722 void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
4723  // With all other options exhausted, loop until the system is simple
4724  // enough to handle.
4725  SmallPtrSet<const SCEV *, 4> Taken;
4726  while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4727  // Ok, we have too many formulae on our hands to conveniently handle.
4728  // Use a rough heuristic to thin out the list.
4729  LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4730 
4731  // Pick the register which is used by the most LSRUses, which is likely
4732  // to be a good reuse register candidate.
4733  const SCEV *Best = nullptr;
4734  unsigned BestNum = 0;
4735  for (const SCEV *Reg : RegUses) {
4736  if (Taken.count(Reg))
4737  continue;
4738  if (!Best) {
4739  Best = Reg;
4740  BestNum = RegUses.getUsedByIndices(Reg).count();
4741  } else {
4742  unsigned Count = RegUses.getUsedByIndices(Reg).count();
4743  if (Count > BestNum) {
4744  Best = Reg;
4745  BestNum = Count;
4746  }
4747  }
4748  }
4749 
4750  LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
4751  << " will yield profitable reuse.\n");
4752  Taken.insert(Best);
4753 
4754  // In any use with formulae which reference this register, delete formulae
4755  // which don't reference it.
4756  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4757  LSRUse &LU = Uses[LUIdx];
4758  if (!LU.Regs.count(Best)) continue;
4759 
4760  bool Any = false;
4761  for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4762  Formula &F = LU.Formulae[i];
4763  if (!F.referencesReg(Best)) {
4764  LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
4765  LU.DeleteFormula(F);
4766  --e;
4767  --i;
4768  Any = true;
4769  assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
4770  continue;
4771  }
4772  }
4773 
4774  if (Any)
4775  LU.RecomputeRegs(LUIdx, RegUses);
4776  }
4777 
4778  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4779  }
4780 }
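
// (Added illustrative sketch; not part of the original LoopStrengthReduce.cpp.)
// The heart of the heuristic above is "pick the register shared by the most
// uses, then keep only formulae that mention it". With registers as integer
// ids and each use reduced to the (duplicate-free) set of registers its
// formulae mention, the selection step looks like the sketch below
// (pickMostSharedReg is a hypothetical name).
static unsigned pickMostSharedReg(
    const SmallVectorImpl<SmallVector<unsigned, 4>> &RegsPerUse) {
  DenseMap<unsigned, unsigned> NumUses;   // Register id -> #uses mentioning it.
  for (const SmallVector<unsigned, 4> &Regs : RegsPerUse)
    for (unsigned Reg : Regs)
      ++NumUses[Reg];
  unsigned Best = 0, BestCount = 0;
  for (const auto &KV : NumUses)
    if (KV.second > BestCount) {
      Best = KV.first;
      BestCount = KV.second;
    }
  return Best;   // Every formula not referencing Best is then deleted.
}
// (End of added sketch.)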
4781 
4782 /// If there are an extraordinary number of formulae to choose from, use some
4783 /// rough heuristics to prune down the number of formulae. This keeps the main
4784 /// solver from taking an extraordinary amount of time in some worst-case
4785 /// scenarios.
4786 void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
4787  NarrowSearchSpaceByDetectingSupersets();
4788  NarrowSearchSpaceByCollapsingUnrolledCode();
4789  NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
4790  if (FilterSameScaledReg)
4791  NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
4792  if (LSRExpNarrow)
4793  NarrowSearchSpaceByDeletingCostlyFormulas();
4794  else
4795  NarrowSearchSpaceByPickingWinnerRegs();
4796 }
4797 
4798 /// This is the recursive solver.
4799 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
4800  Cost &SolutionCost,
4801  SmallVectorImpl<const Formula *> &Workspace,
4802  const Cost &CurCost,
4803  const SmallPtrSet<const SCEV *, 16> &CurRegs,
4804  DenseSet<const SCEV *> &VisitedRegs) const {
4805  // Some ideas:
4806  // - prune more:
4807  // - use more aggressive filtering
4808  // - sort the formula so that the most profitable solutions are found first
4809  // - sort the uses too
4810  // - search faster:
4811  // - don't compute a cost, and then compare. compare while computing a cost
4812  // and bail early.
4813  // - track register sets with SmallBitVector
4814 
4815  const LSRUse &LU = Uses[Workspace.size()];
4816 
4817  // If this use references any register that's already a part of the
4818  // in-progress solution, consider it a requirement that a formula must
4819  // reference that register in order to be considered. This prunes out
4820  // unprofitable searching.
4821  SmallSetVector<const SCEV *, 4> ReqRegs;
4822  for (const SCEV *S : CurRegs)
4823  if (LU.Regs.count(S))
4824  ReqRegs.insert(S);
4825 
4826  SmallPtrSet<const SCEV *, 16> NewRegs;
4827  Cost NewCost;
4828  for (const Formula &F : LU.Formulae) {
4829  // Ignore formulae which may not be ideal in terms of register reuse of
4830  // ReqRegs. The formula should use all required registers before
4831  // introducing new ones.
4832  int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
4833  for (const SCEV *Reg : ReqRegs) {
4834  if ((F.ScaledReg && F.ScaledReg == Reg) ||
4835  is_contained(F.BaseRegs, Reg)) {
4836  --NumReqRegsToFind;
4837  if (NumReqRegsToFind == 0)
4838  break;
4839  }
4840  }
4841  if (NumReqRegsToFind != 0) {
4842  // If none of the formulae satisfied the required registers, then we could
4843  // clear ReqRegs and try again. Currently, we simply give up in this case.
4844  continue;
4845  }
4846 
4847  // Evaluate the cost of the current formula. If it's already worse than
4848  // the current best, prune the search at that point.
4849  NewCost = CurCost;
4850  NewRegs = CurRegs;
4851  NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU);
4852  if (NewCost.isLess(SolutionCost, TTI)) {
4853  Workspace.push_back(&F);
4854  if (Workspace.size() != Uses.size()) {
4855  SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
4856  NewRegs, VisitedRegs);
4857  if (F.getNumRegs() == 1 && Workspace.size() == 1)
4858  VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
4859  } else {
4860  LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
4861  dbgs() << ".\n Regs:"; for (const SCEV *S
4862  : NewRegs) dbgs()
4863  << ' ' << *S;
4864  dbgs() << '\n');
4865 
4866  SolutionCost = NewCost;
4867  Solution = Workspace;
4868  }
4869  Workspace.pop_back();
4870  }
4871  }
4872 }
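
// (Added illustrative sketch; not part of the original LoopStrengthReduce.cpp.)
// SolveRecurse is a branch-and-bound search: pick one formula per use, keep a
// running cost, and prune any branch whose partial cost already reaches the
// best complete solution found so far. Stripped of the register-reuse
// bookkeeping, the recursion looks like the sketch below. solveSketch and its
// parameters are hypothetical; the caller starts BestCost at the "lose" value
// (e.g. UINT_MAX), just as Solve() starts with SolutionCost.Lose().
static void solveSketch(const SmallVectorImpl<SmallVector<unsigned, 4>> &Choices,
                        size_t Depth, unsigned CurCost,
                        SmallVectorImpl<unsigned> &Workspace,
                        unsigned &BestCost,
                        SmallVectorImpl<unsigned> &BestSolution) {
  if (Depth == Choices.size()) {          // One choice made for every use.
    BestCost = CurCost;                   // CurCost < BestCost is guaranteed here.
    BestSolution.assign(Workspace.begin(), Workspace.end());
    return;
  }
  for (unsigned C : Choices[Depth]) {
    if (CurCost + C >= BestCost)          // Prune: cannot beat the best so far.
      continue;
    Workspace.push_back(C);
    solveSketch(Choices, Depth + 1, CurCost + C, Workspace, BestCost,
                BestSolution);
    Workspace.pop_back();
  }
}
// (End of added sketch.)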
4873 
4874 /// Choose one formula from each use. Return the results in the given Solution
4875 /// vector.
4876 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
4877  SmallVector<const Formula *, 8> Workspace;
4878  Cost SolutionCost;
4879  SolutionCost.Lose();
4880  Cost CurCost;
4881  SmallPtrSet<const SCEV *, 16> CurRegs;
4882  DenseSet<const SCEV *> VisitedRegs;
4883  Workspace.reserve(Uses.size());
4884 
4885  // SolveRecurse does all the work.
4886  SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
4887  CurRegs, VisitedRegs);
4888  if (Solution.empty()) {
4889  LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
4890  return;
4891  }
4892 
4893  // Ok, we've now made all our decisions.
4894  LLVM_DEBUG(dbgs() << "\n"
4895  "The chosen solution requires ";
4896  SolutionCost.print(dbgs()); dbgs() << ":\n";
4897  for (size_t i = 0, e = Uses.size(); i != e; ++i) {
4898  dbgs() << " ";
4899  Uses[i].print(dbgs());
4900  dbgs() << "\n"
4901  " ";
4902  Solution[i]->print(dbgs());
4903  dbgs() << '\n';
4904  });
4905 
4906  assert(Solution.size() == Uses.size() && "Malformed solution!");
4907 }
4908 
4909 /// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far
4910 /// we can go while still being dominated by the input positions. This helps
4911 /// canonicalize the insert position, which encourages sharing.
4912 BasicBlock::iterator
4913 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
4914  const SmallVectorImpl<Instruction *> &Inputs)
4915  const {
4916  Instruction *Tentative = &*IP;
4917  while (true) {
4918  bool AllDominate = true;
4919  Instruction *BetterPos = nullptr;
4920  // Don't bother attempting to insert before a catchswitch; its basic block
4921  // cannot have other non-PHI instructions.
4922  if (isa<CatchSwitchInst>(Tentative))
4923  return IP;
4924 
4925  for (Instruction *Inst : Inputs) {
4926  if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
4927  AllDominate = false;
4928  break;
4929  }
4930  // Attempt to find an insert position in the middle of the block,
4931  // instead of at the end, so that it can be used for other expansions.
4932  if (Tentative->getParent() == Inst->getParent() &&
4933  (!BetterPos || !DT.dominates(Inst, BetterPos)))
4934  BetterPos = &*std::next(BasicBlock::iterator(Inst));
4935  }
4936  if (!AllDominate)
4937  break;
4938  if (BetterPos)
4939  IP = BetterPos->getIterator();
4940  else
4941  IP = Tentative->getIterator();
4942 
4943  const Loop *IPLoop = LI.getLoopFor(IP->getParent());
4944  unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
4945 
4946  BasicBlock *IDom;
4947  for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
4948  if (!Rung) return IP;
4949  Rung = Rung->getIDom();
4950  if (!Rung) return IP;
4951  IDom = Rung->getBlock();
4952 
4953  // Don't climb into a loop though.
4954  const Loop *IDomLoop = LI.getLoopFor(IDom);
4955  unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
4956  if (IDomDepth <= IPLoopDepth &&
4957  (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
4958  break;
4959  }
4960 
4961  Tentative = IDom->getTerminator();
4962  }
4963 
4964  return IP;
4965 }
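
// (Added illustrative sketch; not part of the original LoopStrengthReduce.cpp.)
// At block granularity, and ignoring the loop-depth restriction, the function
// above just walks up the immediate-dominator chain for as long as every input
// still dominates the candidate position. With blocks as integer ids and the
// dominator tree reduced to an idom map (the entry block is its own idom),
// that walk is sketched below; dominatesSketch and hoistSketch are
// hypothetical names, and every block id is assumed to be present in the map.
static bool dominatesSketch(const DenseMap<unsigned, unsigned> &IDom,
                            unsigned A, unsigned B) {
  // A dominates B iff walking B's idom chain (which ends at the entry) reaches A.
  while (B != A && B != IDom.lookup(B))
    B = IDom.lookup(B);
  return B == A;
}

static unsigned hoistSketch(const DenseMap<unsigned, unsigned> &IDom, unsigned IP,
                            const SmallVectorImpl<unsigned> &Inputs) {
  while (IP != IDom.lookup(IP)) {
    unsigned Parent = IDom.lookup(IP);
    bool AllDominate = true;
    for (unsigned In : Inputs)
      if (!dominatesSketch(IDom, In, Parent)) {
        AllDominate = false;
        break;
      }
    if (!AllDominate)
      break;
    IP = Parent;   // Every input still dominates the parent; keep climbing.
  }
  return IP;
}
// (End of added sketch.)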
4966 
4967 /// Determine an input position which will be dominated by the operands and
4968 /// which will dominate the result.
4969 BasicBlock::iterator
4970 LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
4971  const LSRFixup &LF,
4972  const LSRUse &LU,
4973  SCEVExpander &Rewriter) const {
4974  // Collect some instructions which must be dominated by the
4975  // expanding replacement. These must be dominated by any operands that
4976  // will be required in the expansion.
4977  SmallVector<Instruction *, 4> Inputs;
4978  if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
4979  Inputs.push_back(I);
4980  if (LU.Kind == LSRUse::ICmpZero)
4981  if (Instruction *I =
4982  dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
4983  Inputs.push_back(I);
4984  if (LF.PostIncLoops.count(L)) {
4985  if (LF.isUseFullyOutsideLoop(L))
4986  Inputs.push_back(L->getLoopLatch()->getTerminator());
4987  else
4988  Inputs.push_back(IVIncInsertPos);
4989  }
4990  // The expansion must also be dominated by the increment positions of any
4991  // loops for which it is using post-inc mode.
4992  for (const Loop *PIL : LF.PostIncLoops) {
4993  if (PIL == L) continue;
4994 
4995  // Be dominated by the loop exit.
4996  SmallVector<BasicBlock *, 4> ExitingBlocks;
4997  PIL->getExitingBlocks(ExitingBlocks);
4998  if (!ExitingBlocks.empty()) {
4999  BasicBlock *BB = ExitingBlocks[0];
5000  for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5001  BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5002  Inputs.push_back(BB->getTerminator());
5003  }
5004  }
5005 
5006  assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
5007  && !isa<DbgInfoIntrinsic>(LowestIP) &&
5008  "Insertion point must be a normal instruction");
5009 
5010  // Then, climb up the immediate dominator tree as far as we can go while
5011  // still being dominated by the input positions.
5012  BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5013 
5014  // Don't insert instructions before PHI nodes.
5015  while (isa<PHINode>(IP)) ++IP;
5016 
5017  // Ignore landingpad instructions.
5018  while (IP->isEHPad()) ++IP;
5019 
5020  // Ignore debug intrinsics.
5021  while (isa<DbgInfoIntrinsic>(IP)) ++IP;
5022 
5023  // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5024  // IP consistent across expansions and allows the previously inserted
5025  // instructions to be reused by subsequent expansion.
5026  while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5027  ++IP;
5028 
5029  return IP;
5030 }
5031 
5032 /// Emit instructions for the leading candidate expression for this LSRUse (this
5033 /// is called "expanding").
5034 Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5035  const Formula &F, BasicBlock::iterator IP,
5036  SCEVExpander &Rewriter,
5037  SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5038  if (LU.RigidFormula)
5039  return LF.OperandValToReplace;
5040 
5041  // Determine an input position which will be dominated by the operands and
5042  // which will dominate the result.
5043  IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
5044  Rewriter.setInsertPoint(&*IP);
5045 
5046  // Inform the Rewriter if we have a post-increment use, so that it can
5047  // perform an advantageous expansion.
5048  Rewriter.setPostInc(LF.PostIncLoops);
5049 
5050  // This is the type that the user actually needs.
5051  Type *OpTy = LF.OperandValToReplace->getType();
5052  // This will be the type that we'll initially expand to.
5053  Type *Ty = F.getType();
5054  if (!Ty)
5055  // No type known; just expand directly to the ultimate type.
5056  Ty = OpTy;
5057  else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5058  // Expand directly to the ultimate type if it's the right size.
5059  Ty = OpTy;
5060  // This is the type to do integer arithmetic in.
5061  Type *IntTy = SE.getEffectiveSCEVType(Ty);
5062 
5063  // Build up a list of operands to add together to form the full base.
5064  SmallVector<const SCEV *, 8> Ops;
5065 
5066  // Expand the BaseRegs portion.
5067  for (const SCEV *Reg : F.BaseRegs) {
5068  assert(!Reg->isZero() && "Zero allocated in a base register!");
5069 
5070  // If we're expanding for a post-inc user, make the post-inc adjustment.
5071  Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5072  Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5073  }
5074 
5075  // Expand the ScaledReg portion.
5076  Value *ICmpScaledV = nullptr;
5077  if (F.Scale != 0) {
5078  const SCEV *ScaledS = F.ScaledReg;
5079 
5080  // If we're expanding for a post-inc user, make the post-inc adjustment.
5081  PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5082  ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5083 
5084  if (LU.Kind == LSRUse::ICmpZero) {
5085  // Expand ScaledReg as if it were part of the base regs.
5086  if (F.Scale == 1)
5087  Ops.push_back(
5088  SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5089  else {
5090  // An interesting way of "folding" with an icmp is to use a negated
5091  // scale, which we'll implement by inserting it into the other operand
5092  // of the icmp.
5093  assert(F.Scale == -1 &&
5094  "The only scale supported by ICmpZero uses is -1!");
5095  ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5096  }
5097  } else {
5098  // Otherwise just expand the scaled register and an explicit scale,
5099  // which is expected to be matched as part of the address.
5100 
5101  // Flush the operand list to suppress SCEVExpander hoisting address modes,
5102  // unless the addressing mode will not be folded.
5103  if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5104  isAMCompletelyFolded(TTI, LU, F)) {
5105  Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5106  Ops.clear();
5107  Ops.push_back(SE.getUnknown(FullV));
5108  }
5109  ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5110  if (F.Scale != 1)
5111  ScaledS =
5112  SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5113  Ops.push_back(ScaledS);
5114  }
5115  }
5116 
5117  // Expand the GV portion.
5118  if (F.BaseGV) {
5119  // Flush the operand list to suppress SCEVExpander hoisting.
5120  if (!Ops.empty()) {
5121  Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5122  Ops.clear();
5123  Ops.push_back(SE.getUnknown(FullV));
5124  }
5125  Ops.push_back(SE.getUnknown(F.BaseGV));
5126  }
5127 
5128  // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5129  // unfolded offsets. LSR assumes they both live next to their uses.
5130  if (!Ops.empty()) {
5131  Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5132  Ops.clear();
5133  Ops.push_back(SE.getUnknown(FullV));
5134  }
5135 
5136  // Expand the immediate portion.
5137  int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
5138  if (Offset != 0) {
5139  if (LU.Kind == LSRUse::ICmpZero) {
5140  // The other interesting way of "folding" with an ICmpZero is to use a
5141  // negated immediate.
5142  if (!ICmpScaledV)
5143  ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
5144  else {
5145  Ops.push_back(SE.getUnknown(ICmpScaledV));
5146  ICmpScaledV = ConstantInt::get(IntTy, Offset);
5147  }
5148  } else {
5149  // Just add the immediate values. These again are expected to be matched
5150  // as part of the address.
5151  Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
5152  }
5153  }
5154 
5155  // Expand the unfolded offset portion.
5156  int64_t UnfoldedOffset = F.UnfoldedOffset;
5157  if (UnfoldedOffset != 0) {
5158  // Just add the immediate values.
5159  Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
5160  UnfoldedOffset)));
5161  }
5162 
5163  // Emit instructions summing all the operands.
5164  const SCEV *FullS = Ops.empty() ?
5165  SE.getConstant(IntTy, 0) :
5166  SE.getAddExpr(Ops);
5167  Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5168 
5169  // We're done expanding now, so reset the rewriter.
5170  Rewriter.clearPostInc();
5171 
5172  // An ICmpZero Formula represents an ICmp which we're handling as a
5173  // comparison against zero. Now that we've expanded an expression for that
5174  // form, update the ICmp's other operand.
5175  if (LU.Kind == LSRUse::ICmpZero) {
5176  ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
5177  DeadInsts.emplace_back(CI->getOperand(1));
5178  assert(!F.BaseGV && "ICmp does not support folding a global value and "
5179  "a scale at the same time!");
5180  if (F.Scale == -1) {
5181  if (ICmpScaledV->getType() != OpTy) {
5182  Instruction *Cast =
5183  CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
5184  OpTy, false),
5185  ICmpScaledV, OpTy, "tmp", CI);
5186  ICmpScaledV = Cast;
5187  }
5188  CI->setOperand(1, ICmpScaledV);
5189  } else {
5190  // A scale of 1 means that the scale has been expanded as part of the
5191  // base regs.
5192  assert((F.Scale == 0 || F.Scale == 1) &&
5193  "ICmp does not support folding a global value and "
5194  "a scale at the same time!");
5195  Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
5196  -(uint64_t)Offset);
5197  if (C->getType() != OpTy)
5198  C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
5199  OpTy, false),
5200  C, OpTy);
5201 
5202  CI->setOperand(1, C);
5203  }
5204  }
5205 
5206  return FullV;
5207 }
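
// (Added illustrative sketch; not part of the original LoopStrengthReduce.cpp.)
// Setting aside SCEV expansion, post-increment adjustment and the ICmpZero
// special cases, the value Expand() materialises for a use is simply
//   sum(BaseRegs) + Scale * ScaledReg + BaseGV + FoldedOffset + UnfoldedOffset,
// where FoldedOffset stands for F.BaseOffset plus the fixup's Offset. With
// plain integers standing in for the expanded registers and the global's
// address, that is (expandedValueSketch is a hypothetical name):
static int64_t expandedValueSketch(const SmallVectorImpl<int64_t> &BaseRegVals,
                                   int64_t Scale, int64_t ScaledRegVal,
                                   int64_t BaseGVAddr, int64_t FoldedOffset,
                                   int64_t UnfoldedOffset) {
  int64_t V = 0;
  for (int64_t R : BaseRegVals)
    V += R;                           // The BaseRegs portion.
  if (Scale != 0)
    V += Scale * ScaledRegVal;        // Scale == 1 just adds the register.
  return V + BaseGVAddr + FoldedOffset + UnfoldedOffset;
}
// (End of added sketch.)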
5208 
5209 /// Helper for Rewrite. PHI nodes are special because the use of their operands
5210 /// effectively happens in their predecessor blocks, so the expression may need
5211 /// to be expanded in multiple places.
5212 void LSRInstance::RewriteForPHI(
5213  PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
5214  SCEVExpander &Rewriter, SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5215  DenseMap<BasicBlock *, Value *> Inserted;
5216  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5217  if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5218  BasicBlock *BB = PN->getIncomingBlock(i);
5219 
5220  // If this is a critical edge, split the edge so that we do not insert
5221  // the code on all predecessor/successor paths. We do this unless this
5222  // is the canonical backedge for this loop, which complicates post-inc
5223  // users.
5224  if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
5225  !isa<IndirectBrInst>(BB->getTerminator()) &&
5226  !isa<CatchSwitchInst>(BB->getTerminator())) {
5227  BasicBlock *Parent = PN->getParent();
5228  Loop *PNLoop = LI.getLoopFor(Parent);
5229  if (!PNLoop || Parent != PNLoop->getHeader()) {
5230  // Split the critical edge.
5231  BasicBlock *NewBB = nullptr;
5232  if (!Parent->isLandingPad()) {
5233  NewBB = SplitCriticalEdge(BB, Parent,
5234  CriticalEdgeSplittingOptions(&DT, &LI)
5235  .setMergeIdenticalEdges()
5236  .setDontDeleteUselessPHIs());
5237  } else {
5238  SmallVector<BasicBlock *, 2> NewBBs;
5239  SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
5240  NewBB = NewBBs[0];
5241  }
5242  // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5243  // phi predecessors are identical. The simple thing to do is skip
5244  // splitting in this case rather than complicate the API.
5245  if (NewBB) {
5246  // If PN is outside of the loop and BB is in the loop, we want to
5247  // move the block to be immediately before the PHI block, not
5248  // immediately after BB.
5249  if (L->contains(BB) && !L->contains(PN))
5250  NewBB->moveBefore(PN->getParent());
5251 
5252  // Splitting the edge can reduce the number of PHI entries we have.
5253  e = PN->getNumIncomingValues();
5254  BB = NewBB;
5255  i = PN->getBasicBlockIndex(BB);
5256  }
5257  }
5258  }
5259 
5260  std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5261  Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
5262  if (!Pair.second)
5263  PN->setIncomingValue(i, Pair.first->second);
5264  else {
5265  Value *FullV = Expand(LU, LF, F, BB->getTerminator()->getIterator(),
5266  Rewriter, DeadInsts);
5267 
5268  // If this is reuse-by-noop-cast, insert the noop cast.
5269  Type *OpTy = LF.OperandValToReplace->getType();
5270  if (FullV->getType() != OpTy)
5271  FullV =
5272  CastInst::Create(CastInst::getCastOpcode(FullV, false,
5273  OpTy, false),
5274  FullV, LF.OperandValToReplace->getType(),
5275  "tmp", BB->getTerminator());
5276 
5277  PN->setIncomingValue(i, FullV);
5278  Pair.first->second = FullV;
5279  }
5280  }
5281 }
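
// (Added illustrative sketch; not part of the original LoopStrengthReduce.cpp.)
// The Inserted map above ensures the expression is expanded at most once per
// predecessor block, and every later incoming entry for the same block reuses
// the cached value. That caching pattern, with block and value ids reduced to
// integers, is sketched below (expandOncePerBlock is a hypothetical name;
// ExpandFn stands in for the call to Expand()).
static int64_t expandOncePerBlock(DenseMap<unsigned, int64_t> &Inserted,
                                  unsigned BlockId,
                                  function_ref<int64_t()> ExpandFn) {
  auto Ins = Inserted.insert({BlockId, 0});
  if (Ins.second)
    Ins.first->second = ExpandFn();   // First time this block is seen: expand.
  return Ins.first->second;           // Otherwise reuse the cached value.
}
// (End of added sketch.)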
5282 
5283 /// Emit instructions for the leading candidate expression for this LSRUse (this
5284 /// is called "expanding"), and update the UserInst to reference the newly
5285 /// expanded value.
5286 void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
5287  const Formula &F, SCEVExpander &Rewriter,
5288  SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5289  // First, find an insertion point that dominates UserInst. For PHI nodes,
5290  // find the nearest block which dominates all the relevant uses.
5291  if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
5292  RewriteForPHI(PN, LU, LF, F, Rewriter, DeadInsts);
5293  } else {
5294  Value *FullV =
5295  Expand(LU, LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
5296 
5297  // If this is reuse-by-noop-cast, insert the noop cast.
5298  Type *OpTy = LF.OperandValToReplace->getType();
5299  if (FullV->getType() != OpTy) {
5300  Instruction *Cast =
5301  CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
5302  FullV, OpTy, "tmp", LF.UserInst);
5303  FullV = Cast;
5304  }
5305 
5306  // Update the user. ICmpZero is handled specially here (for now) because
5307  // Expand may have updated one of the operands of the icmp already, and
5308  // its new value may happen to be equal to LF.OperandValToReplace, in
5309  // which case doing replaceUsesOfWith leads to replacing both operands
5310  // with the same value. TODO: Reorganize this.
5311  if (LU.Kind == LSRUse::ICmpZero)
5312  LF.UserInst->setOperand(0, FullV);
5313  else
5314  LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
5315  }
5316 
5317  DeadInsts.emplace_back(LF.OperandValToReplace);
5318 }
5319 
5320 /// Rewrite all the fixup locations with new values, following the chosen
5321 /// solution.
5322 void LSRInstance::ImplementSolution(
5323  const SmallVectorImpl<const Formula *> &Solution) {
5324  // Keep track of instructions we may have made dead, so that
5325  // we can remove them after we are done working.
5326  SmallVector<WeakTrackingVH, 16> DeadInsts;
5327 
5328  SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(),
5329  "lsr");
5330 #ifndef NDEBUG
5331  Rewriter.setDebugType(DEBUG_TYPE);
5332 #endif
5333  Rewriter.disableCanonicalMode();
5334  Rewriter.enableLSRMode();
5335  Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
5336 
5337  // Mark phi nodes that terminate chains so the expander tries to reuse them.
5338  for (const IVChain &Chain : IVChainVec) {
5339  if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
5340  Rewriter.setChainedPhi(PN);
5341  }
5342 
5343  // Expand the new value definitions and update the users.
5344  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
5345  for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
5346  Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], Rewriter, DeadInsts);
5347  Changed = true;
5348  }
5349 
5350  for (const IVChain &Chain : IVChainVec) {
5351  GenerateIVChain(Chain, Rewriter, DeadInsts);
5352  Changed = true;
5353  }
5354  // Clean up after ourselves. This must be done before deleting any
5355  // instructions.
5356  Rewriter.clear();
5357 
5358  Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
5359 }
5360 
5361 LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
5362  DominatorTree &DT, LoopInfo &LI,
5363  const TargetTransformInfo &TTI)
5364  : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L) {
5365  // If LoopSimplify form is not available, stay out of trouble.
5366  if (!L->isLoopSimplifyForm())
5367  return;
5368 
5369  // If there's no interesting work to be done, bail early.
5370  if (IU.empty()) return;
5371 
5372  // If there's too much analysis to be done, bail early. We won't be able to
5373  // model the problem anyway.
5374  unsigned NumUsers = 0;
5375  for (const IVStrideUse &U : IU) {
5376  if (++NumUsers > MaxIVUsers) {
5377  (void)U;
5378  LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
5379  << "\n");
5380  return;
5381  }
5382  // Bail out if we have a PHI on an EHPad that gets a value from a
5383  // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
5384  // no good place to stick any instructions.
5385  if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
5386  auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
5387  if (isa<FuncletPadInst>(FirstNonPHI) ||
5388  isa<CatchSwitchInst>(FirstNonPHI))
5389  for (BasicBlock *PredBB : PN->blocks())
5390  if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
5391  return;
5392  }
5393  }
5394 
5395 #ifndef NDEBUG
5396  // All dominating loops must have preheaders, or SCEVExpander may not be able
5397  // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
5398  //
5399  // IVUsers analysis should only create users that are dominated by simple loop
5400  // headers. Since this loop should dominate all of its users, its user list
5401  // should be empty if this loop itself is not within a simple loop nest.
5402  for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
5403  Rung; Rung = Rung->getIDom()) {
5404  BasicBlock *BB = Rung->getBlock();
5405  const Loop *DomLoop = LI.getLoopFor(BB);
5406  if (DomLoop && DomLoop->getHeader() == BB) {
5407  assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
5408  }
5409  }
5410 #endif // NDEBUG
5411 
5412  LLVM_DEBUG(dbgs() << "\nLSR on loop ";
5413  L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
5414  dbgs() << ":\n");
5415 
5416  // First, perform some low-level loop optimizations.
5417  OptimizeShadowIV();
5418  OptimizeLoopTermCond();
5419 
5420  // If loop preparation eliminates all interesting IV users, bail.
5421  if (IU.empty()) return;
5422 
5423  // Skip nested loops until we can model them better with formulae.
5424  if (!L->empty()) {
5425  LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
5426  return;
5427  }
5428 
5429  // Start collecting data and preparing for the solver.
5430  CollectChains();
5431  CollectInterestingTypesAndFactors();
5432  CollectFixupsAndInitialFormulae();
5433  CollectLoopInvariantFixupsAndFormulae();
5434 
5435  if (Uses.empty())
5436  return;
5437 
5438  LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
5439  print_uses(dbgs()));
5440 
5441  // Now use the reuse data to generate a bunch of interesting ways
5442  // to formulate the values needed for the uses.
5443  GenerateAllReuseFormulae();
5444 
5445  FilterOutUndesirableDedicatedRegisters();
5446  NarrowSearchSpaceUsingHeuristics();
5447 
5448  SmallVector<const Formula *, 8> Solution;
5449  Solve(Solution);
5450 
5451  // Release memory that is no longer needed.
5452  Factors.clear();
5453  Types.clear();
5454  RegUses.clear();
5455 
5456  if (Solution.empty())
5457  return;
5458 
5459 #ifndef NDEBUG
5460  // Formulae should be legal.
5461  for (const LSRUse &LU : Uses) {
5462  for (const Formula &F : LU.Formulae)
5463  assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
5464  F) && "Illegal formula generated!");
5465  };
5466 #endif
5467 
5468  // Now that we've decided what we want, make it so.
5469  ImplementSolution(Solution);
5470 }
5471 
5472 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5473 void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
5474  if (Factors.empty() && Types.empty()) return;
5475 
5476  OS << "LSR has identified the following interesting factors and types: ";
5477  bool First = true;
5478 
5479  for (int64_t Factor : Factors) {
5480  if (!First) OS << ", ";
5481  First = false;
5482  OS << '*' << Factor;
5483  }
5484 
5485  for (Type *Ty : Types) {
5486  if (!First) OS << ", ";
5487  First = false;
5488  OS << '(' << *Ty << ')';
5489  }
5490  OS << '\n';
5491 }
5492 
5493 void LSRInstance::print_fixups(raw_ostream &OS) const {
5494  OS << "LSR is examining the following fixup sites:\n";
5495  for (const LSRUse &LU : Uses)
5496  for (const LSRFixup &LF : LU.Fixups) {
5497  dbgs() << " ";
5498  LF.print(OS);
5499  OS << '\n';
5500  }
5501 }
5502 
5503 void LSRInstance::print_uses(raw_ostream &OS) const {
5504  OS << "LSR is examining the following uses:\n";
5505  for (const LSRUse &LU : Uses) {
5506  dbgs() << " ";
5507  LU.print(OS);
5508  OS << '\n';
5509  for (const Formula &F : LU.Formulae) {
5510  OS << " ";
5511  F.print(OS);
5512  OS << '\n';
5513  }
5514  }
5515 }
5516 
5517 void LSRInstance::print(raw_ostream &OS) const {
5518  print_factors_and_types(OS);
5519  print_fixups(OS);
5520  print_uses(OS);
5521 }
5522 
5523 LLVM_DUMP_METHOD void LSRInstance::dump() const {
5524  print(errs()); errs() << '\n';
5525 }
5526 #endif
5527 
5528 namespace {
5529 
5530 class LoopStrengthReduce : public LoopPass {
5531 public:
5532  static char ID; // Pass ID, replacement for typeid
5533 
5534  LoopStrengthReduce();
5535 
5536 private:
5537  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
5538  void getAnalysisUsage(AnalysisUsage &AU) const override;
5539 };
5540 
5541 } // end anonymous namespace
5542 
5543 LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
5544  initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
5545 }
5546 
5547 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
5548  // We split critical edges, so we change the CFG. However, we do update
5549  // many analyses if they are around.
5550  AU.addPreservedID(LoopSimplifyID);
5551 
5552  AU.addRequired<LoopInfoWrapperPass>();
5553  AU.addPreserved<LoopInfoWrapperPass>();
5554  AU.addRequiredID(LoopSimplifyID);
5555  AU.addRequired<DominatorTreeWrapperPass>();
5556  AU.addPreserved<DominatorTreeWrapperPass>();
5557  AU.addRequired<ScalarEvolutionWrapperPass>();
5558  AU.addPreserved<ScalarEvolutionWrapperPass>();
5559  // Requiring LoopSimplify a second time here prevents IVUsers from running
5560  // twice, since LoopSimplify was invalidated by running ScalarEvolution.
5561  AU.addRequiredID(LoopSimplifyID);
5562  AU.addRequired<IVUsersWrapperPass>();
5563  AU.addPreserved<IVUsersWrapperPass>();
5564  AU.addRequired<TargetTransformInfoWrapperPass>();
5565 }
5566 
5567 static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
5568  DominatorTree &DT, LoopInfo &LI,
5569  const TargetTransformInfo &TTI) {
5570  bool Changed = false;
5571 
5572  // Run the main LSR transformation.
5573  Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged();
5574 
5575  // Remove any extra phis created by processing inner loops.
5576  Changed |= DeleteDeadPHIs(L->getHeader());
5577  if (EnablePhiElim && L->isLoopSimplifyForm()) {
5578  SmallVector<WeakTrackingVH, 16> DeadInsts;
5579  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
5580  SCEVExpander Rewriter(SE, DL, "lsr");
5581 #ifndef NDEBUG
5582  Rewriter.setDebugType(DEBUG_TYPE);
5583 #endif
5584  unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
5585  if (numFolded) {
5586  Changed = true;
5587  DeleteTriviallyDeadInstructions(DeadInsts);
5588  DeleteDeadPHIs(L->getHeader());
5589  }
5590  }
5591  return Changed;
5592 }
5593 
5594 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
5595  if (skipLoop(L))
5596  return false;
5597 
5598  auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
5599  auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
5600  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
5601  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
5602  const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
5603  *L->getHeader()->getParent());
5604  return ReduceLoopStrength(L, IU, SE, DT, LI, TTI);
5605 }
5606 
5607 PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
5608  LoopStandardAnalysisResults &AR,
5609  LPMUpdater &) {
5610  if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
5611  AR.DT, AR.LI, AR.TTI))
5612  return PreservedAnalyses::all();
5613 
5614  return getLoopPassPreservedAnalyses();
5615 }
5616 
5617 char LoopStrengthReduce::ID = 0;
5618 
5619 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
5620  "Loop Strength Reduction", false, false)
5621 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
5622 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
5623 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
5624 INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
5625 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
5626 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
5627 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
5628  "Loop Strength Reduction", false, false)
5629 
5630 Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
Pass interface - Implemented by all &#39;passes&#39;.
Definition: Pass.h:81
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V...
Definition: Constants.cpp:1267
uint64_t CallInst * C
bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks &#39;this&#39; from the containing basic block and deletes it.
Definition: Instruction.cpp:68
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:259
Definition: Any.h:27
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
iterator_range< use_iterator > uses()
Definition: Value.h:355
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
raw_ostream & errs()
This returns a reference to a raw_ostream for standard error.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:225
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:250
DiagnosticInfoOptimizationBase::Argument NV
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:328
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
Pass * createLoopStrengthReducePass()
This is a &#39;bitvector&#39; (really, a variable-sized bit array), optimized for the case when the array is ...
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:770
const SCEV * getConstant(ConstantInt *V)
This class represents lattice values for constants.
Definition: AllocatorList.h:24
bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:78
Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition: APInt.cpp:1591
an instruction that atomically checks whether a specified value is in a memory location, and, if it is, stores a new value there.
Definition: Instructions.h:529
const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Normalize S to be post-increment for all loops present in Loops.
Implements a dense probed hash-table based set.
Definition: DenseSet.h:250
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:92
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
This provides a very simple, boring adaptor for a begin and end iterator into a range type...
The main scalar evolution driver.
bool isZero() const
Return true if the expression is a constant zero.
unsigned Reg
unsigned getNumberOfRegisters(bool Vector) const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:174
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space...
Definition: Type.cpp:630
unsigned less than
Definition: InstrTypes.h:671
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if elements that makes up the given SCEV properly dominate the specified basic block...
void initializeLoopStrengthReducePass(PassRegistry &)
uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
void setDebugType(const char *s)
NodeT * findNearestCommonDominator(NodeT *A, NodeT *B) const
findNearestCommonDominator - Find nearest common dominator basic block for basic block A and B...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1186
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
static void dump(StringRef Title, SpillInfo const &Spills)
Definition: CoroFrame.cpp:299
static bool isCanonical(const MDString *S)
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
F(f)
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:503
An instruction for reading from memory.
Definition: Instructions.h:168
bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
an instruction that atomically reads a memory location, combines it with another value, and then stores the result back.
Definition: Instructions.h:692
Value * getCondition() const
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:138
iv Induction Variable Users
Definition: IVUsers.cpp:52
This defines the Use class.
void reserve(size_type N)
Definition: SmallVector.h:376
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:93
INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) INITIALIZE_PASS_END(LoopStrengthReduce
This is the base class for unary cast operator classes.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:344
SI optimize exec mask operations pre RA
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:221
#define DEBUG_TYPE
AnalysisUsage & addRequired()
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:134
bool isLegalICmpImmediate(int64_t Imm) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:51
Hexagon Hardware Loops
This class represents the LLVM &#39;select&#39; instruction.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE, etc.
Definition: InstrTypes.h:745
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
const DataLayout & getDataLayout() const
Get the data layout for the module&#39;s target platform.
Definition: Module.cpp:371
Option class for critical edge splitting.
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:690
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
void clearPostInc()
Disable all post-inc expansion.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:56
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:42
LLVMContext & getContext() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:743
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:285
#define LLVM_DUMP_METHOD
Definition: Compiler.h:74
This node represents multiplication of some number of SCEVs.
ELFYAML::ELF_STO Other
Definition: ELFYAML.cpp:784
This file implements a class to represent arbitrary precision integral constant values and operations...
const APInt & getAPInt() const
BlockT * getHeader() const
Definition: LoopInfo.h:100
ConstantInt * getValue() const
auto reverse(ContainerTy &&C, typename std::enable_if< has_rbegin< ContainerTy >::value >::type *=nullptr) -> decltype(make_range(C.rbegin(), C.rend()))
Definition: STLExtras.h:267
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:201
Instruction * clone() const
Create a copy of &#39;this&#39; instruction that is identical in all ways except the following: ...
Key
PAL metadata keys.
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it&#39;s free to truncate a value of type Ty1 to type Ty2.
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:142
void clear()
Definition: SmallSet.h:219
static bool isEqual(const Function &Caller, const Function &Callee)
This node represents a polynomial recurrence on the trip count of the specified loop.
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:83
bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr)
Examine each PHI in the given block and delete it if it is dead.
iterator_range< const_set_bits_iterator > set_bits() const
This header provides classes for managing per-loop analyses.
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:138
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:126
BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions())
If this edge is a critical edge, insert a new node to split the critical edge.
AnalysisUsage & addPreservedID(const void *ID)
An instruction for storing to memory.
Definition: Instructions.h:321
op_iterator op_begin() const
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:151
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:209
int getFPMantissaWidth() const
Return the width of the mantissa of this type.
Definition: Type.cpp:134
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:429
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:145
This class represents a truncation of integer types.
const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
unsigned getNumSuccessors() const
Return the number of successors that this instruction has.
Value * getOperand(unsigned i) const
Definition: User.h:170
Class to represent pointers.
Definition: DerivedTypes.h:467
auto count(R &&Range, const E &Element) -> typename std::iterator_traits< decltype(adl_begin(Range))>::difference_type
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1252
void replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
static bool DeleteTriviallyDeadInstructions(SmallVectorImpl< WeakTrackingVH > &DeadInsts)
If any of the instructions in the specified set are trivially dead, delete them and see if this makes...
void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock *> Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock *> &NewBBs, DominatorTree *DT=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
void clear()
Erase the contents of the InsertedExpressions map so that users trying to expand the same expression ...
bool empty() const
Definition: IVUsers.h:149
NodeT * getBlock() const
static bool isCompatibleIVType(Value *LVal, Value *RVal)
Return true if we allow an IV chain to include both types.
#define P(N)
bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:423
const SCEV * getOperand(unsigned i) const
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
bool isAllOnesValue() const
Determine if all bits are set.
Definition: APInt.h:396
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:190
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:149
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Wrapper pass for TargetTransformInfo.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:154
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition: IVUsers.h:51
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
bool isStrictlyPositive() const
Determine if this APInt Value is positive.
Definition: APInt.h:391
std::size_t countTrailingZeros(T Val, ZeroBehavior ZB=ZB_Width)
Count number of 0&#39;s from the least significant bit to the most stopping at the first 1...
Definition: MathExtras.h:120
static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, int64_t MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
PointerIntPair - This class implements a pair of a pointer and small integer.
This class represents a binary unsigned division operation.
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:69
void setInsertPoint(Instruction *IP)
Set the current insertion point.
Conditional or Unconditional Branch instruction.
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction *> &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
DomTreeNodeBase * getIDom() const
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This is an important base class in LLVM.
Definition: Constant.h:42
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:92
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
Definition: SmallVector.h:129
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
This file contains the declarations for the subclasses of Constant, which represent the different fla...
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:224
static const unsigned UnknownAddressSpace
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:371
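A minimal sketch of the usual deduplication idiom built on this insert: the second member of the returned pair is false when the pointer was already present. The function and parameter names are illustrative:
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/IR/Value.h"

  static void visitEachOnce(llvm::ArrayRef<const llvm::Value *> Vals) {
    llvm::SmallPtrSet<const llvm::Value *, 8> Visited;
    for (const llvm::Value *V : Vals) {
      if (!Visited.insert(V).second)
        continue;              // duplicate pointer; already handled
      // ... process V exactly once here ...
    }
  }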
const SCEV * getAddExpr(SmallVectorImpl< const SCEV *> &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target...
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:443
Represent the analysis usage information of a pass.
op_iterator op_end()
Definition: User.h:232
static Type * getVoidTy(LLVMContext &C)
Definition: Type.cpp:161
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg)
This instruction compares its operands according to the predicate given to the constructor.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:646
Value * expandCodeFor(const SCEV *SH, Type *Ty, Instruction *I)
Insert code to directly compute the specified SCEV expression into the program.
static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a constant integer value, return that integer value, and mutate S to point to a new SCEV with that value excluded.
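To illustrate the contract of this local helper, a comment-only sketch (the concrete SCEVs are hypothetical examples, not taken from this file):
  // Assuming S was obtained from ScalarEvolution, e.g. S == (4 + %x):
  //   int64_t Imm = ExtractImmediate(S, SE);
  // leaves Imm == 4 and mutates S to %x. For an addrec such as
  // {4,+,1}<%loop>, Imm == 4 and S becomes {0,+,1}<%loop>. If there is
  // no such leading constant addend, the result is 0 and S is unchanged.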
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values...
op_range operands()
Definition: User.h:238
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
self_iterator getIterator()
Definition: ilist_node.h:82
iterator_range< block_iterator > blocks()
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
auto find_if(R &&Range, UnaryPredicate P) -> decltype(adl_begin(Range))
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1214
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS. Minus is represented in SCEV as A+B*-1.
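A minimal sketch of the A+B*-1 representation noted above, assuming a valid ScalarEvolution &SE, two SCEVs of the same integer type, and the default no-wrap flags:
  #include "llvm/Analysis/ScalarEvolution.h"

  static const llvm::SCEV *minusSketch(llvm::ScalarEvolution &SE,
                                       const llvm::SCEV *LHS,
                                       const llvm::SCEV *RHS) {
    // LHS - RHS is modeled as LHS + (-1 * RHS); this folds to the same
    // canonical SCEV that SE.getMinusSCEV(LHS, RHS) would return.
    const llvm::SCEV *MinusOne =
        SE.getConstant(RHS->getType(), -1, /*isSigned=*/true);
    return SE.getAddExpr(LHS, SE.getMulExpr(MinusOne, RHS));
  }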
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:319
void getExitingBlocks(SmallVectorImpl< BlockT *> &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
Definition: LoopInfoImpl.h:35
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV *> &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
iterator erase(const_iterator CI)
Definition: SmallVector.h:445
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:160
static unsigned getIncomingValueNumForOperand(unsigned i)
size_t size() const
Definition: SmallVector.h:53
static wasm::ValType getType(const TargetRegisterClass *RC)
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
auto find(R &&Range, const T &Val) -> decltype(adl_begin(Range))
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1207
const SCEV * getMulExpr(SmallVectorImpl< const SCEV *> &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
Definition: AsmWriter.cpp:4225
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
unsigned first
char & LoopSimplifyID
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1116
void setChainedPhi(PHINode *PN)
PowerPC TLS Dynamic Call Fixup
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
bool isLandingPad() const
Return true if this basic block is a landing pad.
Definition: BasicBlock.cpp:468
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:240
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
Definition: LoopInfo.h:110
unsigned getSCEVType() const
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:298
Iterator for intrusive lists based on ilist_node.
This is the shared class of boolean and integer constants.
Definition: Constants.h:84
Type * getType() const
Return the LLVM type of this SCEV expression.
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2) const
Return true if the LSR cost of C1 is lower than the LSR cost of C2.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type...
Definition: Type.cpp:130
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:847
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:249
Module.h This file contains the declarations for the Module class.
constexpr size_t array_lengthof(T(&)[N])
Find the length of an array.
Definition: STLExtras.h:1044
const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
const DataFlowGraph & G
Definition: RDFGraph.cpp:211
signed less than
Definition: InstrTypes.h:675
void setIVIncInsertPos(const Loop *L, Instruction *Pos)
Set the current IV increment loop and position.
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper...
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:381
CHAIN = SC CHAIN, Imm128 - System call.
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:622
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.cpp:636
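A minimal sketch tying this to IntegerType::get above; the helper name is illustrative and only a live LLVMContext is assumed:
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/LLVMContext.h"

  static llvm::ConstantInt *makeMinusFiveI32(llvm::LLVMContext &Ctx) {
    // Build the i32 type, then a constant -5 of that type (sign-extended).
    llvm::IntegerType *I32 = llvm::IntegerType::get(Ctx, 32);
    return llvm::ConstantInt::getSigned(I32, -5);
  }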
size_t size() const
Returns the number of bits in this bitvector.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static Constant * get(Type *Ty, double V)
This returns a ConstantFP, or a vector containing a splat of a ConstantFP, for the specified value in...
Definition: Constants.cpp:685
unsigned getNumIncomingValues() const
Return the number of incoming edges.
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
bool LSRWithInstrQueries() const
Return true if the loop strength reduce pass should make Instruction* based TTI queries to isLegalAdd...
void setOperand(unsigned i, Value *Val)
Definition: User.h:175
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:133
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:539
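A quick illustrative sketch of the floor behavior (not part of this file):
  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  static void log2FloorSketch() {
    assert(llvm::Log2_32(1) == 0);   // floor(log2(1))   == 0
    assert(llvm::Log2_32(255) == 7); // floor(log2(255)) == 7
    assert(llvm::Log2_32(256) == 8); // exact power of two
  }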
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:941
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< const SCEV *> &Good, SmallVectorImpl< const SCEV *> &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
bool isTrueWhenEqual() const
This is just a convenience.
Definition: InstrTypes.h:841
signed less or equal
Definition: InstrTypes.h:676
Class for arbitrary precision integers.
Definition: APInt.h:70
This node represents an addition of some number of SCEVs.
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), Instruction *InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
This class represents a signed maximum selection.
typename SuperClass::iterator iterator
Definition: SmallVector.h:327
iterator_range< user_iterator > users()
Definition: Value.h:400
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:90
This class uses information about analyzed scalars to rewrite expressions in canonical form...
static Constant * getCast(unsigned ops, Constant *C, Type *Ty, bool OnlyIfReduced=false)
Convenience function for getting a Cast operation.
Definition: Constants.cpp:1530
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:479
static void clear(coro::Shape &Shape)
Definition: Coroutines.cpp:212
loop Loop Strength Reduction
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:478
void append(in_iter in_start, in_iter in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:394
Virtual Register Rewriter
Definition: VirtRegMap.cpp:222
bool operator!=(uint64_t V1, const APInt &V2)
Definition: APInt.h:1969
static CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", Instruction *InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:721
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
bool isLoopSimplifyForm() const
Return true if the Loop is in the form that the LoopSimplify pass transforms loops to...
Definition: LoopInfo.cpp:193
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
Definition: SmallVector.h:133
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:311
iterator begin() const
Definition: SmallPtrSet.h:397
void emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:652
This class represents an analyzed expression in the program.
Analysis pass that exposes the IVUsers for a loop.
Definition: IVUsers.h:189
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:56
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:465
APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition: APInt.cpp:1683
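A minimal sketch of the sign convention: like C's % operator, the remainder takes the sign of the dividend. The values below are illustrative:
  #include "llvm/ADT/APInt.h"
  #include <cassert>

  static void sremSketch() {
    llvm::APInt A(/*numBits=*/8, /*val=*/-7, /*isSigned=*/true);
    llvm::APInt B(/*numBits=*/8, /*val=*/3, /*isSigned=*/true);
    // -7 srem 3 == -1 (truncated division; the remainder keeps the dividend's sign).
    assert(A.srem(B).getSExtValue() == -1);
  }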
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:214
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition: IVUsers.h:57
static Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
#define I(x, y, z)
Definition: MD5.cpp:58
#define N
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1213
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition: PtrState.h:41
This class represents a cast from unsigned integer to floating point.
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
This class represents an unsigned maximum selection.
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:325
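A minimal sketch combining this range with dyn_cast and getIncomingValue from the nearby entries; the function name is illustrative, and well-formed IR (every phi has at least one incoming value) is assumed:
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/Support/Casting.h"

  // Count the phis in BB whose first incoming value is the constant integer 0.
  static unsigned countZeroStartPhis(llvm::BasicBlock &BB) {
    unsigned N = 0;
    for (llvm::PHINode &PN : BB.phis())
      if (auto *CI = llvm::dyn_cast<llvm::ConstantInt>(PN.getIncomingValue(0)))
        if (CI->isZero())
          ++N;
    return N;
  }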
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:92
bool isUnconditional() const
iterator end() const
Definition: SmallPtrSet.h:402
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition: IVUsers.cpp:418
loop reduce
unsigned Insns
TODO: Some of these could be merged.
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
bool empty() const
Definition: LoopInfo.h:146
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
unsigned replaceCongruentIVs(Loop *L, const DominatorTree *DT, SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetTransformInfo *TTI=nullptr)
Replace congruent phis with their most canonical representative.
const unsigned Kind
const SCEV * getBackedgeTakenCount(const Loop *L)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
This class represents a cast from signed integer to floating point.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
unsigned getMinSignedBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1552
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction has no side ef...
Definition: Local.cpp:349
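A minimal cleanup sketch built on this predicate, assuming a TargetLibraryInfo pointer that may be null; the function name is illustrative:
  #include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/Transforms/Utils/Local.h"

  static void eraseTriviallyDead(llvm::BasicBlock &BB,
                                 const llvm::TargetLibraryInfo *TLI) {
    for (auto It = BB.begin(); It != BB.end();) {
      llvm::Instruction *I = &*It++;   // advance before a possible erase
      if (llvm::isInstructionTriviallyDead(I, TLI))
        I->eraseFromParent();          // safe: the result has no uses
    }
  }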
LLVM Value Representation.
Definition: Value.h:73
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
static GlobalValue * ExtractSymbol(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
A vector that has set insertion semantics.
Definition: SetVector.h:41
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
Definition: Instruction.cpp:87
IVStrideUse - Keep track of one use of a strided induction variable.
Definition: IVUsers.h:38
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:46
void disableCanonicalMode()
Disable the behavior of expanding expressions in canonical form rather than in a more literal form...
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
Definition: Instruction.h:573
const SCEV * getUnknown(Value *V)
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:970
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.
bool hasOneUse() const
Return true if there is exactly one user of this value.
Definition: Value.h:413
A container for analyses that lazily runs them and caches their results.
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:260
This pass exposes codegen information to IR-level passes.
bool operator==(uint64_t V1, const APInt &V2)
Definition: APInt.h:1967
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:157
bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop. ...
This node is a base class providing common functionality for n'ary operators.
This header defines various interfaces for pass management in LLVM.
void setIncomingValue(unsigned i, Value *V)
#define LLVM_DEBUG(X)
Definition: Debug.h:123
for(unsigned i=Desc.getNumOperands(), e=OldMI.getNumOperands();i !=e;++i)
void setPostInc(const PostIncLoopSet &L)
Enable post-inc expansion for addrecs referring to the given loops.
bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE)
Return true if the given expression is safe to expand in the sense that all materialized values are s...
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI)
Information about a load/store intrinsic defined by the target.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.cpp:121
bool use_empty() const
Definition: Value.h:323
size_type count() const
Returns the number of bits which are set.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV *> &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:44
const BasicBlock * getParent() const
Definition: Instruction.h:67
This class represents a constant integer value.
const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
void resize(size_type N)
Definition: SmallVector.h:351
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1245
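Finally, a minimal sketch of this wrapper; the container contents are illustrative:
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include <cassert>

  static void isContainedSketch() {
    llvm::SmallVector<int, 4> V = {1, 2, 3};
    assert(llvm::is_contained(V, 2));  // wraps std::find over the whole range
    assert(!llvm::is_contained(V, 7)); // absent element
  }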