LLVM  8.0.1
LoadStoreVectorizer.cpp
1 //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass merges loads/stores to/from sequential memory addresses into vector
11 // loads/stores. Although there's nothing GPU-specific in here, this pass is
12 // motivated by the microarchitectural quirks of nVidia and AMD GPUs.
13 //
14 // (For simplicity below we talk about loads only, but everything also applies
15 // to stores.)
16 //
17 // This pass is intended to be run late in the pipeline, after other
18 // vectorization opportunities have been exploited. So the assumption here is
19 // that immediately following our new vector load we'll need to extract out the
20 // individual elements of the load, so we can operate on them individually.
21 //
22 // On CPUs this transformation is usually not beneficial, because extracting the
23 // elements of a vector register is expensive on most architectures. It's
24 // usually better just to load each element individually into its own scalar
25 // register.
26 //
27 // However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
28 // "vector load" loads directly into a series of scalar registers. In effect,
29 // extracting the elements of the vector is free. It's therefore always
30 // beneficial to vectorize a sequence of loads on these architectures.
31 //
32 // Vectorizing (perhaps a better name might be "coalescing") loads can have
33 // large performance impacts on GPU kernels, and opportunities for vectorizing
34 // are common in GPU code. This pass tries very hard to find such
35 // opportunities; its runtime is quadratic in the number of loads in a BB.
36 //
37 // Some CPU architectures, such as ARM, have instructions that load into
38 // multiple scalar registers, similar to a GPU vectorized load. In theory ARM
39 // could use this pass (with some modifications), but currently it implements
40 // its own pass to do something similar to what we do here.
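// Illustrative example (annotation added to this listing, not part of the
// original source): given four loads from adjacent addresses, e.g.
//
//   %p1 = getelementptr inbounds i32, i32* %p, i64 1
//   %p2 = getelementptr inbounds i32, i32* %p, i64 2
//   %p3 = getelementptr inbounds i32, i32* %p, i64 3
//   %a = load i32, i32* %p
//   %b = load i32, i32* %p1
//   %c = load i32, i32* %p2
//   %d = load i32, i32* %p3
//
// the pass bitcasts %p to <4 x i32>*, emits a single <4 x i32> load, and
// replaces %a..%d with extractelement instructions on the wide load.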
41 
42 #include "llvm/ADT/APInt.h"
43 #include "llvm/ADT/ArrayRef.h"
44 #include "llvm/ADT/MapVector.h"
45 #include "llvm/ADT/PostOrderIterator.h"
46 #include "llvm/ADT/STLExtras.h"
47 #include "llvm/ADT/SmallPtrSet.h"
48 #include "llvm/ADT/SmallVector.h"
49 #include "llvm/ADT/Statistic.h"
50 #include "llvm/ADT/iterator_range.h"
51 #include "llvm/Analysis/AliasAnalysis.h"
52 #include "llvm/Analysis/MemoryLocation.h"
53 #include "llvm/Analysis/OrderedBasicBlock.h"
54 #include "llvm/Analysis/ScalarEvolution.h"
55 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
56 #include "llvm/Analysis/TargetTransformInfo.h"
57 #include "llvm/Analysis/ValueTracking.h"
58 #include "llvm/Analysis/VectorUtils.h"
59 #include "llvm/IR/Attributes.h"
60 #include "llvm/IR/BasicBlock.h"
61 #include "llvm/IR/Constants.h"
62 #include "llvm/IR/DataLayout.h"
63 #include "llvm/IR/DerivedTypes.h"
64 #include "llvm/IR/Dominators.h"
65 #include "llvm/IR/Function.h"
66 #include "llvm/IR/IRBuilder.h"
67 #include "llvm/IR/InstrTypes.h"
68 #include "llvm/IR/Instruction.h"
69 #include "llvm/IR/Instructions.h"
70 #include "llvm/IR/IntrinsicInst.h"
71 #include "llvm/IR/Module.h"
72 #include "llvm/IR/Type.h"
73 #include "llvm/IR/User.h"
74 #include "llvm/IR/Value.h"
75 #include "llvm/Pass.h"
76 #include "llvm/Support/Casting.h"
77 #include "llvm/Support/Debug.h"
78 #include "llvm/Support/KnownBits.h"
83 #include <algorithm>
84 #include <cassert>
85 #include <cstdlib>
86 #include <tuple>
87 #include <utility>
88 
89 using namespace llvm;
90 
91 #define DEBUG_TYPE "load-store-vectorizer"
92 
93 STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
94 STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
95 
96 // FIXME: Assuming stack alignment of 4 is always good enough
97 static const unsigned StackAdjustedAlignment = 4;
98 
99 namespace {
100 
101 /// ChainID is an arbitrary token that is allowed to be different only for the
102 /// accesses that are guaranteed to be considered non-consecutive by
103 /// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
104 /// together and reducing the number of instructions the main search operates on
105 /// at a time, i.e. this is to reduce compile time and nothing else as the main
106 /// search has O(n^2) time complexity. The underlying type of ChainID should not
107 /// be relied upon.
108 using ChainID = const Value *;
109 using InstrList = SmallVector<Instruction *, 8>;
110 using InstrListMap = MapVector<ChainID, InstrList>;
111 
112 class Vectorizer {
113  Function &F;
114  AliasAnalysis &AA;
115  DominatorTree &DT;
116  ScalarEvolution &SE;
117  TargetTransformInfo &TTI;
118  const DataLayout &DL;
119  IRBuilder<> Builder;
120 
121 public:
122  Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
123              ScalarEvolution &SE, TargetTransformInfo &TTI)
124      : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
125  DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
126 
127  bool run();
128 
129 private:
130  unsigned getPointerAddressSpace(Value *I);
131 
132  unsigned getAlignment(LoadInst *LI) const {
133  unsigned Align = LI->getAlignment();
134  if (Align != 0)
135  return Align;
136 
137  return DL.getABITypeAlignment(LI->getType());
138  }
139 
140  unsigned getAlignment(StoreInst *SI) const {
141  unsigned Align = SI->getAlignment();
142  if (Align != 0)
143  return Align;
144 
145  return DL.getABITypeAlignment(SI->getValueOperand()->getType());
146  }
147 
148  static const unsigned MaxDepth = 3;
149 
150  bool isConsecutiveAccess(Value *A, Value *B);
151  bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
152  unsigned Depth = 0) const;
153  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
154  unsigned Depth) const;
155  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
156  unsigned Depth) const;
157 
158  /// After vectorization, reorder the instructions that I depends on
159  /// (the instructions defining its operands), to ensure they dominate I.
160  void reorder(Instruction *I);
161 
162  /// Returns the first and the last instructions in Chain.
163  std::pair<BasicBlock::iterator, BasicBlock::iterator>
164  getBoundaryInstrs(ArrayRef<Instruction *> Chain);
165 
166  /// Erases the original instructions after vectorizing.
167  void eraseInstructions(ArrayRef<Instruction *> Chain);
168 
169  /// "Legalize" the vector type that would be produced by combining \p
170  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
171  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
172  /// expected to have more than 4 elements.
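 /// For example (illustrative), a chain of seven 16-bit elements (14 bytes
 /// total) is split into a six-element, 12-byte piece and a one-element,
 /// 2-byte piece.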
173  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
174  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
175 
176  /// Finds the largest prefix of Chain that's vectorizable, checking for
177  /// intervening instructions which may affect the memory accessed by the
178  /// instructions within Chain.
179  ///
180  /// The elements of \p Chain must be all loads or all stores and must be in
181  /// address order.
182  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
183 
184  /// Collects load and store instructions to vectorize.
185  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
186 
187  /// Processes the collected instructions, the \p Map. The values of \p Map
188  /// should be all loads or all stores.
189  bool vectorizeChains(InstrListMap &Map);
190 
191  /// Finds the load/stores to consecutive memory addresses and vectorizes them.
192  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
193 
194  /// Vectorizes the load instructions in Chain.
195  bool
196  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
197  SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
198 
199  /// Vectorizes the store instructions in Chain.
200  bool
201  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
202  SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
203 
204  /// Check if this load/store access is a misaligned access.
205  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
206  unsigned Alignment);
207 };
208 
209 class LoadStoreVectorizerLegacyPass : public FunctionPass {
210 public:
211  static char ID;
212 
213  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
214    initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
215  }
216 
217  bool runOnFunction(Function &F) override;
218 
219  StringRef getPassName() const override {
220  return "GPU Load and Store Vectorizer";
221  }
222 
223  void getAnalysisUsage(AnalysisUsage &AU) const override {
224    AU.addRequired<AAResultsWrapperPass>();
225    AU.addRequired<ScalarEvolutionWrapperPass>();
226    AU.addRequired<DominatorTreeWrapperPass>();
227    AU.addRequired<TargetTransformInfoWrapperPass>();
228    AU.setPreservesCFG();
229  }
230 };
231 
232 } // end anonymous namespace
233 
234 char LoadStoreVectorizerLegacyPass::ID = 0;
235 
236 INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
237  "Vectorize load and Store instructions", false, false)
238 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
239 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
240 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
241 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
242 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
243 INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
244  "Vectorize load and store instructions", false, false)
245 
246 Pass *llvm::createLoadStoreVectorizerPass() {
247   return new LoadStoreVectorizerLegacyPass();
248 }
249 
250 bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
251   // Don't vectorize when the attribute NoImplicitFloat is used.
252  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
253  return false;
254 
255  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
256  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
257  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
258  TargetTransformInfo &TTI =
259  getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
260 
261  Vectorizer V(F, AA, DT, SE, TTI);
262  return V.run();
263 }
264 
265 PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
266   // Don't vectorize when the attribute NoImplicitFloat is used.
267   if (F.hasFnAttribute(Attribute::NoImplicitFloat))
268     return PreservedAnalyses::all();
269 
270   AliasAnalysis &AA = AM.getResult<AAManager>(F);
271   DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
272   ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
273   TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
274 
275   Vectorizer V(F, AA, DT, SE, TTI);
276   bool Changed = V.run();
277   PreservedAnalyses PA;
278   PA.preserveSet<CFGAnalyses>();
279   return Changed ? PA : PreservedAnalyses::all();
280 }
281 
282 // The real propagateMetadata expects a SmallVector<Value*>, but we deal in
283 // vectors of Instructions.
284 static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
285   SmallVector<Value *, 8> VL(IL.begin(), IL.end());
286  propagateMetadata(I, VL);
287 }
288 
289 // Vectorizer Implementation
290 bool Vectorizer::run() {
291  bool Changed = false;
292 
293  // Scan the blocks in the function in post order.
294  for (BasicBlock *BB : post_order(&F)) {
295  InstrListMap LoadRefs, StoreRefs;
296  std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
297  Changed |= vectorizeChains(LoadRefs);
298  Changed |= vectorizeChains(StoreRefs);
299  }
300 
301  return Changed;
302 }
303 
304 unsigned Vectorizer::getPointerAddressSpace(Value *I) {
305  if (LoadInst *L = dyn_cast<LoadInst>(I))
306  return L->getPointerAddressSpace();
307  if (StoreInst *S = dyn_cast<StoreInst>(I))
308  return S->getPointerAddressSpace();
309  return -1;
310 }
311 
312 // FIXME: Merge with llvm::isConsecutiveAccess
313 bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
314   Value *PtrA = getLoadStorePointerOperand(A);
315   Value *PtrB = getLoadStorePointerOperand(B);
316   unsigned ASA = getPointerAddressSpace(A);
317  unsigned ASB = getPointerAddressSpace(B);
318 
319  // Check that the address spaces match and that the pointers are valid.
320  if (!PtrA || !PtrB || (ASA != ASB))
321  return false;
322 
323  // Make sure that A and B are different pointers of the same size type.
324  Type *PtrATy = PtrA->getType()->getPointerElementType();
325  Type *PtrBTy = PtrB->getType()->getPointerElementType();
326  if (PtrA == PtrB ||
327  PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
328  DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
329  DL.getTypeStoreSize(PtrATy->getScalarType()) !=
330  DL.getTypeStoreSize(PtrBTy->getScalarType()))
331  return false;
332 
333  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
334  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
335 
336  return areConsecutivePointers(PtrA, PtrB, Size);
337 }
338 
339 bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
340  const APInt &PtrDelta,
341  unsigned Depth) const {
342  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
343  APInt OffsetA(PtrBitWidth, 0);
344  APInt OffsetB(PtrBitWidth, 0);
345  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
346  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
347 
348  APInt OffsetDelta = OffsetB - OffsetA;
349 
350  // Check if they are based on the same pointer. That makes the offsets
351  // sufficient.
352  if (PtrA == PtrB)
353  return OffsetDelta == PtrDelta;
354 
355  // Compute the necessary base pointer delta to have the necessary final delta
356  // equal to the pointer delta requested.
357  APInt BaseDelta = PtrDelta - OffsetDelta;
358 
359  // Compute the distance with SCEV between the base pointers.
360  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
361  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
362  const SCEV *C = SE.getConstant(BaseDelta);
363  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
364  if (X == PtrSCEVB)
365  return true;
366 
367  // The above check will not catch the cases where one of the pointers is
368  // factorized but the other one is not, such as (C + (S * (A + B))) vs
369  // (AS + BS). Get the minus scev. That will allow re-combining the expresions
370  // and getting the simplified difference.
371  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
372  if (C == Dist)
373  return true;
374 
375  // Sometimes even this doesn't work, because SCEV can't always see through
376  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
377  // things the hard way.
378  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
379 }
380 
381 bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
382  APInt PtrDelta,
383  unsigned Depth) const {
384  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
385  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
386  if (!GEPA || !GEPB)
387  return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
388 
389  // Look through GEPs after checking they're the same except for the last
390  // index.
391  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
392  GEPA->getPointerOperand() != GEPB->getPointerOperand())
393  return false;
394  gep_type_iterator GTIA = gep_type_begin(GEPA);
395  gep_type_iterator GTIB = gep_type_begin(GEPB);
396  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
397  if (GTIA.getOperand() != GTIB.getOperand())
398  return false;
399  ++GTIA;
400  ++GTIB;
401  }
402 
403  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
404  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
405  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
406  OpA->getType() != OpB->getType())
407  return false;
408 
409  if (PtrDelta.isNegative()) {
410  if (PtrDelta.isMinSignedValue())
411  return false;
412  PtrDelta.negate();
413  std::swap(OpA, OpB);
414  }
415  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
416  if (PtrDelta.urem(Stride) != 0)
417  return false;
418  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
419  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
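  // At this point IdxDiff is the pointer distance expressed in units of the
  // indexed element type, i.e. how much larger the last GEP index of PtrB must
  // be than that of PtrA for the pointers to be exactly PtrDelta apart.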
420 
421  // Only look through a ZExt/SExt.
422  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
423  return false;
424 
425  bool Signed = isa<SExtInst>(OpA);
426 
427  // At this point A could be a function parameter, i.e. not an instruction
428  Value *ValA = OpA->getOperand(0);
429  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
430  if (!OpB || ValA->getType() != OpB->getType())
431  return false;
432 
433  // Now we need to prove that adding IdxDiff to ValA won't overflow.
434  bool Safe = false;
435  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
436  // ValA, we're okay.
437  if (OpB->getOpcode() == Instruction::Add &&
438  isa<ConstantInt>(OpB->getOperand(1)) &&
439  IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
440  if (Signed)
441  Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
442  else
443  Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
444  }
445 
446  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
447 
448  // Second attempt:
449  // If all set bits of IdxDiff or any higher order bit other than the sign bit
450  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
451  // overflow of any sort.
452  if (!Safe) {
453  OpA = dyn_cast<Instruction>(ValA);
454  if (!OpA)
455  return false;
456  KnownBits Known(BitWidth);
457  computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
458  APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
459  if (Signed)
460  BitsAllowedToBeSet.clearBit(BitWidth - 1);
461  if (BitsAllowedToBeSet.ult(IdxDiff))
462  return false;
463  }
464 
465  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
466  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
467  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
468  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
469  return X == OffsetSCEVB;
470 }
471 
472 bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
473  const APInt &PtrDelta,
474  unsigned Depth) const {
475  if (Depth++ == MaxDepth)
476  return false;
477 
478  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
479  if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
480  return SelectA->getCondition() == SelectB->getCondition() &&
481  areConsecutivePointers(SelectA->getTrueValue(),
482  SelectB->getTrueValue(), PtrDelta, Depth) &&
483  areConsecutivePointers(SelectA->getFalseValue(),
484  SelectB->getFalseValue(), PtrDelta, Depth);
485  }
486  }
487  return false;
488 }
489 
490 void Vectorizer::reorder(Instruction *I) {
491  OrderedBasicBlock OBB(I->getParent());
492  SmallPtrSet<Instruction *, 16> InstructionsToMove;
493   SmallVector<Instruction *, 16> Worklist;
494 
495  Worklist.push_back(I);
496  while (!Worklist.empty()) {
497  Instruction *IW = Worklist.pop_back_val();
498  int NumOperands = IW->getNumOperands();
499  for (int i = 0; i < NumOperands; i++) {
500       Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
501       if (!IM || IM->getOpcode() == Instruction::PHI)
502  continue;
503 
504  // If IM is in another BB, no need to move it, because this pass only
505  // vectorizes instructions within one BB.
506  if (IM->getParent() != I->getParent())
507  continue;
508 
509  if (!OBB.dominates(IM, I)) {
510  InstructionsToMove.insert(IM);
511  Worklist.push_back(IM);
512  }
513  }
514  }
515 
516  // All instructions to move should follow I. Start from I, not from begin().
517  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
518  ++BBI) {
519  if (!InstructionsToMove.count(&*BBI))
520  continue;
521  Instruction *IM = &*BBI;
522  --BBI;
523  IM->removeFromParent();
524  IM->insertBefore(I);
525  }
526 }
527 
528 std::pair<BasicBlock::iterator, BasicBlock::iterator>
529 Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
530  Instruction *C0 = Chain[0];
531  BasicBlock::iterator FirstInstr = C0->getIterator();
532  BasicBlock::iterator LastInstr = C0->getIterator();
533 
534  BasicBlock *BB = C0->getParent();
535  unsigned NumFound = 0;
536  for (Instruction &I : *BB) {
537  if (!is_contained(Chain, &I))
538  continue;
539 
540  ++NumFound;
541  if (NumFound == 1) {
542  FirstInstr = I.getIterator();
543  }
544  if (NumFound == Chain.size()) {
545  LastInstr = I.getIterator();
546  break;
547  }
548  }
549 
550  // Range is [first, last).
551  return std::make_pair(FirstInstr, ++LastInstr);
552 }
553 
554 void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
555   SmallVector<Instruction *, 16> Instrs;
556   for (Instruction *I : Chain) {
557  Value *PtrOperand = getLoadStorePointerOperand(I);
558  assert(PtrOperand && "Instruction must have a pointer operand.");
559  Instrs.push_back(I);
560  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
561  Instrs.push_back(GEP);
562  }
563 
564  // Erase instructions.
565  for (Instruction *I : Instrs)
566  if (I->use_empty())
567  I->eraseFromParent();
568 }
569 
570 std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
571 Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
572  unsigned ElementSizeBits) {
573  unsigned ElementSizeBytes = ElementSizeBits / 8;
574  unsigned SizeBytes = ElementSizeBytes * Chain.size();
575  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
576  if (NumLeft == Chain.size()) {
577  if ((NumLeft & 1) == 0)
578  NumLeft /= 2; // Split even in half
579  else
580  --NumLeft; // Split off last element
581  } else if (NumLeft == 0)
582  NumLeft = 1;
583  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
584 }
585 
586 ArrayRef<Instruction *>
587 Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
588  // These are in BB order, unlike Chain, which is in address order.
589  SmallVector<Instruction *, 16> MemoryInstrs;
590  SmallVector<Instruction *, 16> ChainInstrs;
591 
592  bool IsLoadChain = isa<LoadInst>(Chain[0]);
593  LLVM_DEBUG({
594  for (Instruction *I : Chain) {
595  if (IsLoadChain)
596  assert(isa<LoadInst>(I) &&
597  "All elements of Chain must be loads, or all must be stores.");
598  else
599  assert(isa<StoreInst>(I) &&
600  "All elements of Chain must be loads, or all must be stores.");
601  }
602  });
603 
604  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
605  if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
606  if (!is_contained(Chain, &I))
607  MemoryInstrs.push_back(&I);
608  else
609  ChainInstrs.push_back(&I);
610  } else if (isa<IntrinsicInst>(&I) &&
611  cast<IntrinsicInst>(&I)->getIntrinsicID() ==
612                    Intrinsic::sideeffect) {
613       // Ignore llvm.sideeffect calls.
614  } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
615  LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
616  << '\n');
617  break;
618  } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
619  LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
620  << '\n');
621  break;
622  }
623  }
624 
625  OrderedBasicBlock OBB(Chain[0]->getParent());
626 
627  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
628  unsigned ChainInstrIdx = 0;
629  Instruction *BarrierMemoryInstr = nullptr;
630 
631  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
632  Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
633 
634  // If a barrier memory instruction was found, chain instructions that follow
635  // will not be added to the valid prefix.
636  if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
637  break;
638 
639  // Check (in BB order) if any instruction prevents ChainInstr from being
640  // vectorized. Find and store the first such "conflicting" instruction.
641  for (Instruction *MemInstr : MemoryInstrs) {
642  // If a barrier memory instruction was found, do not check past it.
643  if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
644  break;
645 
646  auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
647  auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
648  if (MemLoad && ChainLoad)
649  continue;
650 
651  // We can ignore the alias if the we have a load store pair and the load
652  // is known to be invariant. The load cannot be clobbered by the store.
653  auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
654         return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
655       };
656 
657  // We can ignore the alias as long as the load comes before the store,
658  // because that means we won't be moving the load past the store to
659  // vectorize it (the vectorized load is inserted at the location of the
660  // first load in the chain).
661  if (isa<StoreInst>(MemInstr) && ChainLoad &&
662  (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
663  continue;
664 
665  // Same case, but in reverse.
666  if (MemLoad && isa<StoreInst>(ChainInstr) &&
667  (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
668  continue;
669 
670  if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
671  MemoryLocation::get(ChainInstr))) {
672  LLVM_DEBUG({
673  dbgs() << "LSV: Found alias:\n"
674  " Aliasing instruction and pointer:\n"
675  << " " << *MemInstr << '\n'
676  << " " << *getLoadStorePointerOperand(MemInstr) << '\n'
677  << " Aliased instruction and pointer:\n"
678  << " " << *ChainInstr << '\n'
679  << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
680  });
681  // Save this aliasing memory instruction as a barrier, but allow other
682  // instructions that precede the barrier to be vectorized with this one.
683  BarrierMemoryInstr = MemInstr;
684  break;
685  }
686  }
687  // Continue the search only for store chains, since vectorizing stores that
688  // precede an aliasing load is valid. Conversely, vectorizing loads is valid
689  // up to an aliasing store, but should not pull loads from further down in
690  // the basic block.
691  if (IsLoadChain && BarrierMemoryInstr) {
692  // The BarrierMemoryInstr is a store that precedes ChainInstr.
693  assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
694  break;
695  }
696  }
697 
698  // Find the largest prefix of Chain whose elements are all in
699  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
700  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
701  // order.)
702  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
703  ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
704  unsigned ChainIdx = 0;
705  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
706  if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
707  break;
708  }
709  return Chain.slice(0, ChainIdx);
710 }
711 
712 static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
713  const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
714  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
715     // The selects themselves are distinct instructions even if they share the
716     // same condition and evaluate to consecutive pointers for true and false
717     // values of the condition. Therefore using the selects themselves for
718     // grouping instructions would put consecutive accesses into different lists
719     // and they won't even be checked for being consecutive, and won't be
720     // vectorized.
721  return Sel->getCondition();
722  }
723  return ObjPtr;
724 }
725 
726 std::pair<InstrListMap, InstrListMap>
727 Vectorizer::collectInstructions(BasicBlock *BB) {
728  InstrListMap LoadRefs;
729  InstrListMap StoreRefs;
730 
731  for (Instruction &I : *BB) {
732  if (!I.mayReadOrWriteMemory())
733  continue;
734 
735  if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
736  if (!LI->isSimple())
737  continue;
738 
739  // Skip if it's not legal.
740  if (!TTI.isLegalToVectorizeLoad(LI))
741  continue;
742 
743  Type *Ty = LI->getType();
744       if (!VectorType::isValidElementType(Ty->getScalarType()))
745         continue;
746 
747  // Skip weird non-byte sizes. They probably aren't worth the effort of
748  // handling correctly.
749  unsigned TySize = DL.getTypeSizeInBits(Ty);
750  if ((TySize % 8) != 0)
751  continue;
752 
753  // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
754  // functions are currently using an integer type for the vectorized
755  // load/store, and does not support casting between the integer type and a
756  // vector of pointers (e.g. i64 to <2 x i16*>)
757  if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
758  continue;
759 
760  Value *Ptr = LI->getPointerOperand();
761  unsigned AS = Ptr->getType()->getPointerAddressSpace();
762  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
763 
764  unsigned VF = VecRegSize / TySize;
765  VectorType *VecTy = dyn_cast<VectorType>(Ty);
766 
767  // No point in looking at these if they're too big to vectorize.
768  if (TySize > VecRegSize / 2 ||
769  (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
770  continue;
771 
772  // Make sure all the users of a vector are constant-index extracts.
773  if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
775  return EEI && isa<ConstantInt>(EEI->getOperand(1));
776  }))
777  continue;
778 
779  // Save the load locations.
780  const ChainID ID = getChainID(Ptr, DL);
781  LoadRefs[ID].push_back(LI);
782  } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
783  if (!SI->isSimple())
784  continue;
785 
786  // Skip if it's not legal.
787  if (!TTI.isLegalToVectorizeStore(SI))
788  continue;
789 
790  Type *Ty = SI->getValueOperand()->getType();
791       if (!VectorType::isValidElementType(Ty->getScalarType()))
792         continue;
793 
794  // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
795  // functions are currently using an integer type for the vectorized
796  // load/store, and does not support casting between the integer type and a
797  // vector of pointers (e.g. i64 to <2 x i16*>)
798  if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
799  continue;
800 
801  // Skip weird non-byte sizes. They probably aren't worth the effort of
802  // handling correctly.
803  unsigned TySize = DL.getTypeSizeInBits(Ty);
804  if ((TySize % 8) != 0)
805  continue;
806 
807  Value *Ptr = SI->getPointerOperand();
808  unsigned AS = Ptr->getType()->getPointerAddressSpace();
809  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
810 
811  unsigned VF = VecRegSize / TySize;
812  VectorType *VecTy = dyn_cast<VectorType>(Ty);
813 
814  // No point in looking at these if they're too big to vectorize.
815  if (TySize > VecRegSize / 2 ||
816  (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
817  continue;
818 
819  if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
820             const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
821             return EEI && isa<ConstantInt>(EEI->getOperand(1));
822  }))
823  continue;
824 
825  // Save store location.
826  const ChainID ID = getChainID(Ptr, DL);
827  StoreRefs[ID].push_back(SI);
828  }
829  }
830 
831  return {LoadRefs, StoreRefs};
832 }
833 
834 bool Vectorizer::vectorizeChains(InstrListMap &Map) {
835  bool Changed = false;
836 
837  for (const std::pair<ChainID, InstrList> &Chain : Map) {
838  unsigned Size = Chain.second.size();
839  if (Size < 2)
840  continue;
841 
842  LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
843 
844  // Process the stores in chunks of 64.
845  for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
846  unsigned Len = std::min<unsigned>(CE - CI, 64);
847  ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
848  Changed |= vectorizeInstructions(Chunk);
849  }
850  }
851 
852  return Changed;
853 }
854 
855 bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
856  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
857  << " instructions.\n");
858  SmallVector<int, 16> Heads, Tails;
859  int ConsecutiveChain[64];
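  // Note: vectorizeChains() passes at most 64 instructions at a time (see the
  // chunking loop above), so this fixed-size array is large enough.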
860 
861  // Do a quadratic search on all of the given loads/stores and find all of the
862  // pairs of loads/stores that follow each other.
863  for (int i = 0, e = Instrs.size(); i < e; ++i) {
864  ConsecutiveChain[i] = -1;
865  for (int j = e - 1; j >= 0; --j) {
866  if (i == j)
867  continue;
868 
869  if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
870  if (ConsecutiveChain[i] != -1) {
871  int CurDistance = std::abs(ConsecutiveChain[i] - i);
872  int NewDistance = std::abs(ConsecutiveChain[i] - j);
873  if (j < i || NewDistance > CurDistance)
874  continue; // Should not insert.
875  }
876 
877  Tails.push_back(j);
878  Heads.push_back(i);
879  ConsecutiveChain[i] = j;
880  }
881  }
882  }
883 
884  bool Changed = false;
885  SmallPtrSet<Instruction *, 16> InstructionsProcessed;
886 
887  for (int Head : Heads) {
888  if (InstructionsProcessed.count(Instrs[Head]))
889  continue;
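    // Skip this head if it is also the tail of another chain whose own head is
    // still unprocessed; starting from that head yields a longer chain that
    // will include this instruction.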
890  bool LongerChainExists = false;
891  for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
892  if (Head == Tails[TIt] &&
893  !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
894  LongerChainExists = true;
895  break;
896  }
897  if (LongerChainExists)
898  continue;
899 
900  // We found an instr that starts a chain. Now follow the chain and try to
901  // vectorize it.
903  int I = Head;
904  while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
905  if (InstructionsProcessed.count(Instrs[I]))
906  break;
907 
908  Operands.push_back(Instrs[I]);
909  I = ConsecutiveChain[I];
910  }
911 
912  bool Vectorized = false;
913  if (isa<LoadInst>(*Operands.begin()))
914  Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
915  else
916  Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
917 
918  Changed |= Vectorized;
919  }
920 
921  return Changed;
922 }
923 
924 bool Vectorizer::vectorizeStoreChain(
925     ArrayRef<Instruction *> Chain,
926     SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
927  StoreInst *S0 = cast<StoreInst>(Chain[0]);
928 
929  // If the vector has an int element, default to int for the whole store.
930  Type *StoreTy;
931  for (Instruction *I : Chain) {
932  StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
933  if (StoreTy->isIntOrIntVectorTy())
934  break;
935 
936  if (StoreTy->isPtrOrPtrVectorTy()) {
937  StoreTy = Type::getIntNTy(F.getParent()->getContext(),
938  DL.getTypeSizeInBits(StoreTy));
939  break;
940  }
941  }
942 
943  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
944  unsigned AS = S0->getPointerAddressSpace();
945  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
946  unsigned VF = VecRegSize / Sz;
947  unsigned ChainSize = Chain.size();
948  unsigned Alignment = getAlignment(S0);
949 
950  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
951  InstructionsProcessed->insert(Chain.begin(), Chain.end());
952  return false;
953  }
954 
955  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
956  if (NewChain.empty()) {
957  // No vectorization possible.
958  InstructionsProcessed->insert(Chain.begin(), Chain.end());
959  return false;
960  }
961  if (NewChain.size() == 1) {
962  // Failed after the first instruction. Discard it and try the smaller chain.
963  InstructionsProcessed->insert(NewChain.front());
964  return false;
965  }
966 
967  // Update Chain to the valid vectorizable subchain.
968  Chain = NewChain;
969  ChainSize = Chain.size();
970 
971  // Check if it's legal to vectorize this chain. If not, split the chain and
972  // try again.
973  unsigned EltSzInBytes = Sz / 8;
974  unsigned SzInBytes = EltSzInBytes * ChainSize;
975 
976  VectorType *VecTy;
977  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
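  // If the stored values are themselves vectors, the combined vector gets
  // Chain.size() * NumElements elements; otherwise one element per store.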
978  if (VecStoreTy)
979  VecTy = VectorType::get(StoreTy->getScalarType(),
980  Chain.size() * VecStoreTy->getNumElements());
981  else
982  VecTy = VectorType::get(StoreTy, Chain.size());
983 
984  // If it's more than the max vector size or the target has a better
985  // vector factor, break it into two pieces.
986  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
987  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
988  LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
989  " Creating two separate arrays.\n");
990  return vectorizeStoreChain(Chain.slice(0, TargetVF),
991  InstructionsProcessed) |
992  vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
993  }
994 
995  LLVM_DEBUG({
996  dbgs() << "LSV: Stores to vectorize:\n";
997  for (Instruction *I : Chain)
998  dbgs() << " " << *I << "\n";
999  });
1000 
1001  // We won't try again to vectorize the elements of the chain, regardless of
1002  // whether we succeed below.
1003  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1004 
1005  // If the store is going to be misaligned, don't vectorize it.
1006  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1007  if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1008  auto Chains = splitOddVectorElts(Chain, Sz);
1009  return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
1010  vectorizeStoreChain(Chains.second, InstructionsProcessed);
1011  }
1012 
1013  unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
1014                                                  StackAdjustedAlignment,
1015                                                  DL, S0, nullptr, &DT);
1016  if (NewAlign != 0)
1017  Alignment = NewAlign;
1018  }
1019 
1020  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
1021  auto Chains = splitOddVectorElts(Chain, Sz);
1022  return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
1023  vectorizeStoreChain(Chains.second, InstructionsProcessed);
1024  }
1025 
1026  BasicBlock::iterator First, Last;
1027  std::tie(First, Last) = getBoundaryInstrs(Chain);
1028  Builder.SetInsertPoint(&*Last);
1029 
1030  Value *Vec = UndefValue::get(VecTy);
1031 
1032  if (VecStoreTy) {
1033  unsigned VecWidth = VecStoreTy->getNumElements();
1034  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1035  StoreInst *Store = cast<StoreInst>(Chain[I]);
1036  for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
1037  unsigned NewIdx = J + I * VecWidth;
1038  Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
1039  Builder.getInt32(J));
1040  if (Extract->getType() != StoreTy->getScalarType())
1041  Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
1042 
1043  Value *Insert =
1044  Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
1045  Vec = Insert;
1046  }
1047  }
1048  } else {
1049  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1050  StoreInst *Store = cast<StoreInst>(Chain[I]);
1051  Value *Extract = Store->getValueOperand();
1052  if (Extract->getType() != StoreTy->getScalarType())
1053  Extract =
1054  Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
1055 
1056  Value *Insert =
1057  Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
1058  Vec = Insert;
1059  }
1060  }
1061 
1062  StoreInst *SI = Builder.CreateAlignedStore(
1063  Vec,
1064  Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
1065  Alignment);
1066  propagateMetadata(SI, Chain);
1067 
1068  eraseInstructions(Chain);
1069  ++NumVectorInstructions;
1070  NumScalarsVectorized += Chain.size();
1071  return true;
1072 }
1073 
1074 bool Vectorizer::vectorizeLoadChain(
1075     ArrayRef<Instruction *> Chain,
1076     SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
1077  LoadInst *L0 = cast<LoadInst>(Chain[0]);
1078 
1079  // If the vector has an int element, default to int for the whole load.
1080  Type *LoadTy;
1081  for (const auto &V : Chain) {
1082  LoadTy = cast<LoadInst>(V)->getType();
1083  if (LoadTy->isIntOrIntVectorTy())
1084  break;
1085 
1086  if (LoadTy->isPtrOrPtrVectorTy()) {
1087  LoadTy = Type::getIntNTy(F.getParent()->getContext(),
1088  DL.getTypeSizeInBits(LoadTy));
1089  break;
1090  }
1091  }
1092 
1093  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
1094  unsigned AS = L0->getPointerAddressSpace();
1095  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1096  unsigned VF = VecRegSize / Sz;
1097  unsigned ChainSize = Chain.size();
1098  unsigned Alignment = getAlignment(L0);
1099 
1100  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
1101  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1102  return false;
1103  }
1104 
1105  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
1106  if (NewChain.empty()) {
1107  // No vectorization possible.
1108  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1109  return false;
1110  }
1111  if (NewChain.size() == 1) {
1112  // Failed after the first instruction. Discard it and try the smaller chain.
1113  InstructionsProcessed->insert(NewChain.front());
1114  return false;
1115  }
1116 
1117  // Update Chain to the valid vectorizable subchain.
1118  Chain = NewChain;
1119  ChainSize = Chain.size();
1120 
1121  // Check if it's legal to vectorize this chain. If not, split the chain and
1122  // try again.
1123  unsigned EltSzInBytes = Sz / 8;
1124  unsigned SzInBytes = EltSzInBytes * ChainSize;
1125  VectorType *VecTy;
1126  VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
1127  if (VecLoadTy)
1128  VecTy = VectorType::get(LoadTy->getScalarType(),
1129  Chain.size() * VecLoadTy->getNumElements());
1130  else
1131  VecTy = VectorType::get(LoadTy, Chain.size());
1132 
1133  // If it's more than the max vector size or the target has a better
1134  // vector factor, break it into two pieces.
1135  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
1136  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1137  LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1138  " Creating two separate arrays.\n");
1139  return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
1140  vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
1141  }
1142 
1143  // We won't try again to vectorize the elements of the chain, regardless of
1144  // whether we succeed below.
1145  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1146 
1147  // If the load is going to be misaligned, don't vectorize it.
1148  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1149  if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1150  auto Chains = splitOddVectorElts(Chain, Sz);
1151  return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1152  vectorizeLoadChain(Chains.second, InstructionsProcessed);
1153  }
1154 
1155  unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
1156                                                  StackAdjustedAlignment,
1157                                                  DL, L0, nullptr, &DT);
1158  if (NewAlign != 0)
1159  Alignment = NewAlign;
1160 
1161  Alignment = NewAlign;
1162  }
1163 
1164  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
1165  auto Chains = splitOddVectorElts(Chain, Sz);
1166  return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1167  vectorizeLoadChain(Chains.second, InstructionsProcessed);
1168  }
1169 
1170  LLVM_DEBUG({
1171  dbgs() << "LSV: Loads to vectorize:\n";
1172  for (Instruction *I : Chain)
1173  I->dump();
1174  });
1175 
1176  // getVectorizablePrefix already computed getBoundaryInstrs. The value of
1177  // Last may have changed since then, but the value of First won't have. If it
1178  // matters, we could compute getBoundaryInstrs only once and reuse it here.
1179  BasicBlock::iterator First, Last;
1180  std::tie(First, Last) = getBoundaryInstrs(Chain);
1181  Builder.SetInsertPoint(&*First);
1182 
1183  Value *Bitcast =
1184  Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
1185  LoadInst *LI = Builder.CreateAlignedLoad(Bitcast, Alignment);
1186  propagateMetadata(LI, Chain);
1187 
1188  if (VecLoadTy) {
1189  SmallVector<Instruction *, 16> InstrsToErase;
1190 
1191  unsigned VecWidth = VecLoadTy->getNumElements();
1192  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1193  for (auto Use : Chain[I]->users()) {
1194  // All users of vector loads are ExtractElement instructions with
1195  // constant indices, otherwise we would have bailed before now.
1196  Instruction *UI = cast<Instruction>(Use);
1197  unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
1198  unsigned NewIdx = Idx + I * VecWidth;
1199  Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
1200  UI->getName());
1201  if (V->getType() != UI->getType())
1202  V = Builder.CreateBitCast(V, UI->getType());
1203 
1204  // Replace the old instruction.
1205  UI->replaceAllUsesWith(V);
1206  InstrsToErase.push_back(UI);
1207  }
1208  }
1209 
1210  // Bitcast might not be an Instruction, if the value being loaded is a
1211  // constant. In that case, no need to reorder anything.
1212  if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1213  reorder(BitcastInst);
1214 
1215  for (auto I : InstrsToErase)
1216  I->eraseFromParent();
1217  } else {
1218  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1219  Value *CV = Chain[I];
1220  Value *V =
1221  Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
1222  if (V->getType() != CV->getType()) {
1223  V = Builder.CreateBitOrPointerCast(V, CV->getType());
1224  }
1225 
1226  // Replace the old instruction.
1227  CV->replaceAllUsesWith(V);
1228  }
1229 
1230  if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1231  reorder(BitcastInst);
1232  }
1233 
1234  eraseInstructions(Chain);
1235 
1236  ++NumVectorInstructions;
1237  NumScalarsVectorized += Chain.size();
1238  return true;
1239 }
1240 
1241 bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
1242  unsigned Alignment) {
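  // An access whose alignment is a multiple of its size is never misaligned;
  // otherwise ask the target whether (and how cheaply) it tolerates the
  // misalignment.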
1243  if (Alignment % SzInBytes == 0)
1244  return false;
1245 
1246  bool Fast = false;
1247  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
1248  SzInBytes * 8, AddressSpace,
1249  Alignment, &Fast);
1250  LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
1251  << " and fast? " << Fast << "\n";);
1252  return !Allows || !Fast;
1253 }